def show_image_with_boxes(img, out_boxes, show=False):
    if show:
        im = np.array(img)
        for box in out_boxes:
            pts = box[0:8]
            pts = pts.reshape(4, -1)
            draw_box_points(im, pts, color=(0, 255, 0), thickness=1)
        cv2.imshow('img', im)
        cv2.waitKey(1)
def draw_detections(img, boxes, color=(255, 0, 0)):
    draw2 = np.copy(img)
    if len(boxes) == 0:
        return draw2
    for i in range(0, boxes.shape[0]):
        pts = boxes[i]
        pts = pts[0:8]
        pts = pts.reshape(4, -1)
        pts = np.asarray(pts, dtype=np.int32)  # np.int was removed in recent NumPy
        draw_box_points(draw2, pts, color=color, thickness=2)
        # cv2.imshow('nms', draw2)
    return draw2
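# Hedged usage sketch for the two helpers above; illustrative only, not part of
# the original pipeline. Assumes each row of `boxes` stores the four corner
# points (x1, y1, ..., x4, y4) followed by a confidence score.
def example_draw_detections():
    img = np.zeros((100, 160, 3), dtype=np.uint8)
    boxes = np.array([[10.0, 10.0, 150.0, 12.0, 148.0, 60.0, 8.0, 58.0, 0.97]])
    vis = draw_detections(img, boxes, color=(255, 0, 0))
    cv2.imshow('detections', vis)
    cv2.waitKey(0)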
def process_boxes(images, im_data, iou_pred, roi_pred, angle_pred, score_maps,
                  gt_idxs, gtso, lbso, features, net, ctc_loss, opts,
                  debug=False):
    '''Accumulate the CTC recognition loss over text crops sampled both from
    high-confidence predicted locations and directly from ground-truth boxes.'''

    ctc_loss_count = 0
    loss = torch.from_numpy(np.asarray([0])).type(torch.FloatTensor).cuda()

    for bid in range(iou_pred.size(0)):

        gts = gtso[bid]
        lbs = lbso[bid]

        gt_proc = 0
        gt_good = 0

        gts_count = {}

        iou_pred_np = iou_pred[bid].data.cpu().numpy()
        iou_map = score_maps[bid]
        to_walk = iou_pred_np.squeeze(0) * iou_map * (iou_pred_np.squeeze(0) > 0.5)

        roi_p_bid = roi_pred[bid].data.cpu().numpy()
        gt_idx = gt_idxs[bid]

        if debug:
            img = images[bid]
            img += 1
            img *= 128
            img = np.asarray(img, dtype=np.uint8)

        # Sample at most 100 positive locations from the score map
        xy_text = np.argwhere(to_walk > 0)
        random.shuffle(xy_text)
        xy_text = xy_text[0:min(xy_text.shape[0], 100)]

        for i in range(0, xy_text.shape[0]):
            if opts.geo_type == 1:
                break
            pos = xy_text[i, :]

            gt_id = gt_idx[pos[0], pos[1]]

            if not gt_id in gts_count:
                gts_count[gt_id] = 0

            if gts_count[gt_id] > 2:
                continue

            gt = gts[gt_id]
            gt_txt = lbs[gt_id]
            if gt_txt.startswith('##'):
                continue

            angle_sin = angle_pred[bid, 0, pos[0], pos[1]]
            angle_cos = angle_pred[bid, 1, pos[0], pos[1]]

            angle = math.atan2(angle_sin, angle_cos)
            angle_gt = (math.atan2((gt[2][1] - gt[1][1]), gt[2][0] - gt[1][0]) +
                        math.atan2((gt[3][1] - gt[0][1]), gt[3][0] - gt[0][0])) / 2

            if math.fabs(angle_gt - angle) > math.pi / 16:
                continue

            # Decode the predicted geometry (4 offsets) into corner points; the
            # factor 4 converts feature-map coordinates to image coordinates
            offset = roi_p_bid[:, pos[0], pos[1]]
            posp = pos + 0.25
            pos_g = np.array([(posp[1] - offset[0] * math.sin(angle)) * 4,
                              (posp[0] - offset[0] * math.cos(angle)) * 4])
            pos_g2 = np.array([(posp[1] + offset[1] * math.sin(angle)) * 4,
                               (posp[0] + offset[1] * math.cos(angle)) * 4])
            pos_r = np.array([(posp[1] - offset[2] * math.cos(angle)) * 4,
                              (posp[0] - offset[2] * math.sin(angle)) * 4])
            pos_r2 = np.array([(posp[1] + offset[3] * math.cos(angle)) * 4,
                               (posp[0] + offset[3] * math.sin(angle)) * 4])

            center = (pos_g + pos_g2 + pos_r + pos_r2) / 2 - [4 * pos[1], 4 * pos[0]]
            #center = (pos_g + pos_g2 + pos_r + pos_r2) / 4

            dw = pos_r - pos_r2
            dh = pos_g - pos_g2

            w = math.sqrt(dw[0] * dw[0] + dw[1] * dw[1])
            h = math.sqrt(dh[0] * dh[0] + dh[1] * dh[1])

            dhgt = gt[1] - gt[0]
            h_gt = math.sqrt(dhgt[0] * dhgt[0] + dhgt[1] * dhgt[1])
            if h_gt < 10:
                continue

            rect = ((center[0], center[1]), (w, h), angle * 180 / math.pi)
            pts = cv2.boxPoints(rect)

            pred_bbox = cv2.boundingRect(pts)
            pred_bbox = [pred_bbox[0], pred_bbox[1], pred_bbox[2], pred_bbox[3]]
            pred_bbox[2] += pred_bbox[0]
            pred_bbox[3] += pred_bbox[1]

            # Skip boxes outside the image (width is size(3), height is size(2))
            if gt[:, 0].max() > im_data.size(3) or gt[:, 1].max() > im_data.size(2):
                continue

            gt_bbox = [gt[:, 0].min(), gt[:, 1].min(), gt[:, 0].max(), gt[:, 1].max()]
            inter = intersect(pred_bbox, gt_bbox)
            uni = union(pred_bbox, gt_bbox)
            ratio = area(inter) / float(area(uni))

            if ratio < 0.90:
                continue

            hratio = min(h, h_gt) / max(h, h_gt)
            if hratio < 0.5:
                continue

            input_W = im_data.size(3)
            input_H = im_data.size(2)
            target_h = norm_height

            scale = target_h / h
            target_gw = int(w * scale) + target_h
            target_gw = max(8, int(round(target_gw / 4)) * 4)

            # Build a 2x3 affine matrix that maps the rotated text region onto a
            # normalized horizontal strip of size target_h x target_gw
            scalex = (w + h) / input_W
            scaley = h / input_H

            th11 = scalex * math.cos(angle)
            th12 = -math.sin(angle) * scaley
            th13 = (2 * center[0] - input_W - 1) / (input_W - 1)

            th21 = math.sin(angle) * scalex
            th22 = scaley * math.cos(angle)
            th23 = (2 * center[1] - input_H - 1) / (input_H - 1)

            t = np.asarray([th11, th12, th13, th21, th22, th23], dtype=np.float32)
            t = torch.from_numpy(t).type(torch.FloatTensor).cuda()
            #t = torch.stack((th11, th12, th13, th21, th22, th23), dim=1)

            theta = t.view(-1, 2, 3)
            grid = F.affine_grid(theta, torch.Size((1, 3, int(target_h), int(target_gw))))
            x = F.grid_sample(im_data[bid].unsqueeze(0), grid)

            if debug:
                x_c = x.data.cpu().numpy()[0]
                x_data_draw = x_c.swapaxes(0, 2)
                x_data_draw = x_data_draw.swapaxes(0, 1)

                x_data_draw += 1
                x_data_draw *= 128
                x_data_draw = np.asarray(x_data_draw, dtype=np.uint8)
                x_data_draw = x_data_draw[:, :, ::-1]

                cv2.circle(img, (int(center[0]), int(center[1])), 5, (0, 255, 0))
                cv2.imshow('im_data', x_data_draw)

                draw_box_points(img, pts)
                draw_box_points(img, gt, color=(0, 0, 255))

                cv2.imshow('img', img)
                cv2.waitKey(100)

            # Encode the GT transcription; unknown characters map to label 3
            gt_labels = []
            gt_labels.append(codec_rev[' '])
            for k in range(len(gt_txt)):
                if gt_txt[k] in codec_rev:
                    gt_labels.append(codec_rev[gt_txt[k]])
                else:
                    print('Unknown char: {0}'.format(gt_txt[k]))
                    gt_labels.append(3)

            if 'ARABIC' in ud.name(gt_txt[0]):
                gt_labels = gt_labels[::-1]
            gt_labels.append(codec_rev[' '])

            features = net.forward_features(x)
            labels_pred = net.forward_ocr(features)

            label_length = []
            label_length.append(len(gt_labels))
            probs_sizes = autograd.Variable(
                torch.IntTensor([(labels_pred.permute(2, 0, 1).size()[0])] *
                                (labels_pred.permute(2, 0, 1).size()[1])))
            label_sizes = autograd.Variable(
                torch.IntTensor(torch.from_numpy(np.array(label_length)).int()))
            labels = autograd.Variable(
                torch.IntTensor(torch.from_numpy(np.array(gt_labels)).int()))

            loss = loss + ctc_loss(labels_pred.permute(2, 0, 1), labels,
                                   probs_sizes, label_sizes).cuda()
            ctc_loss_count += 1

            if debug:
                ctc_f = labels_pred.data.cpu().numpy()
                ctc_f = ctc_f.swapaxes(1, 2)
                labels = ctc_f.argmax(2)
                det_text, conf, dec_s, splits = print_seq_ext(labels[0, :], codec)
                print('{0} \t {1}'.format(det_text, gt_txt))

            gts_count[gt_id] += 1

            if ctc_loss_count > 64 or debug:
                break

        # Second pass: sample crops directly from the GT boxes (with jitter)
        for gt_id in range(0, len(gts)):

            gt = gts[gt_id]
            gt_txt = lbs[gt_id]
            gt_txt_low = gt_txt.lower()
            if gt_txt.startswith('##'):
                continue

            # Skip boxes outside the image (width is size(3), height is size(2))
            if gt[:, 0].max() > im_data.size(3) or gt[:, 1].max() > im_data.size(2):
                continue
            if gt.min() < 0:
                continue

            center = (gt[0, :] + gt[1, :] + gt[2, :] + gt[3, :]) / 4
            dw = gt[2, :] - gt[1, :]
            dh = gt[1, :] - gt[0, :]

            w = math.sqrt(dw[0] * dw[0] + dw[1] * dw[1])
            h = math.sqrt(dh[0] * dh[0] + dh[1] * dh[1]) + random.randint(-2, 2)

            if h < 8:
                #print('too small h!')
                continue

            angle_gt = (math.atan2((gt[2][1] - gt[1][1]), gt[2][0] - gt[1][0]) +
                        math.atan2((gt[3][1] - gt[0][1]), gt[3][0] - gt[0][0])) / 2

            input_W = im_data.size(3)
            input_H = im_data.size(2)
            target_h = norm_height

            scale = target_h / h
            target_gw = int(w * scale) + random.randint(0, int(target_h))
            target_gw = max(8, int(round(target_gw / 4)) * 4)

            xc = center[0]
            yc = center[1]
            w2 = w
            h2 = h

            # Build the affine matrix as above, with random width jitter
            scalex = (w2 + random.randint(0, int(h2))) / input_W
            scaley = h2 / input_H

            th11 = scalex * math.cos(angle_gt)
            th12 = -math.sin(angle_gt) * scaley
            th13 = (2 * xc - input_W - 1) / (input_W - 1)

            th21 = math.sin(angle_gt) * scalex
            th22 = scaley * math.cos(angle_gt)
            th23 = (2 * yc - input_H - 1) / (input_H - 1)

            t = np.asarray([th11, th12, th13, th21, th22, th23], dtype=np.float32)
            t = torch.from_numpy(t).type(torch.FloatTensor)
            t = t.cuda()

            theta = t.view(-1, 2, 3)
            grid = F.affine_grid(theta, torch.Size((1, 3, int(target_h), int(target_gw))))
            x = F.grid_sample(im_data[bid].unsqueeze(0), grid)
            #score_sampled = F.grid_sample(iou_pred[bid].unsqueeze(0), grid)

            gt_labels = []
            gt_labels.append(codec_rev[' '])
            for k in range(len(gt_txt)):
                if gt_txt[k] in codec_rev:
                    gt_labels.append(codec_rev[gt_txt[k]])
                else:
                    print('Unknown char: {0}'.format(gt_txt[k]))
                    gt_labels.append(3)
            gt_labels.append(codec_rev[' '])

            if 'ARABIC' in ud.name(gt_txt[0]):
                gt_labels = gt_labels[::-1]

            features = net.forward_features(x)
            labels_pred = net.forward_ocr(features)

            label_length = []
            label_length.append(len(gt_labels))
            probs_sizes = torch.IntTensor(
                [(labels_pred.permute(2, 0, 1).size()[0])] *
                (labels_pred.permute(2, 0, 1).size()[1]))
            label_sizes = torch.IntTensor(
                torch.from_numpy(np.array(label_length)).int())
            labels = torch.IntTensor(torch.from_numpy(np.array(gt_labels)).int())

            loss = loss + ctc_loss(labels_pred.permute(2, 0, 1), labels,
                                   probs_sizes, label_sizes).cuda()
            ctc_loss_count += 1

            if debug:
                x_d = x.data.cpu().numpy()[0]
                x_data_draw = x_d.swapaxes(0, 2)
                x_data_draw = x_data_draw.swapaxes(0, 1)

                x_data_draw += 1
                x_data_draw *= 128
                x_data_draw = np.asarray(x_data_draw, dtype=np.uint8)
                x_data_draw = x_data_draw[:, :, ::-1]
                cv2.imshow('im_data_gt', x_data_draw)
                cv2.waitKey(100)

            gt_proc += 1
            if True:
                # Track how many GT crops are already recognized correctly
                ctc_f = labels_pred.data.cpu().numpy()
                ctc_f = ctc_f.swapaxes(1, 2)
                labels = ctc_f.argmax(2)
                det_text, conf, dec_s, splits = print_seq_ext(labels[0, :], codec)
                if debug:
                    print('{0} \t {1}'.format(det_text, gt_txt))
                if det_text.lower() == gt_txt.lower():
                    gt_good += 1

            if ctc_loss_count > 128 or debug:
                break

    if ctc_loss_count > 0:
        loss /= ctc_loss_count

    return loss, gt_good, gt_proc
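# A minimal, self-contained sketch of the rotated-crop technique used above: a
# 2x3 affine matrix in normalized [-1, 1] coordinates maps a rotated box onto a
# fixed-height horizontal strip via F.affine_grid / F.grid_sample. This helper is
# illustrative only (not part of the original pipeline); its name and arguments
# are hypothetical.
def example_rotated_crop(image, cx, cy, w, h, angle, out_h=40):
    # image: FloatTensor (1, 3, H, W); cx, cy, w, h in pixels; angle in radians
    H, W = image.size(2), image.size(3)
    out_w = max(8, int(round(w * (out_h / h) / 4)) * 4)
    scalex, scaley = w / W, h / H
    theta = torch.tensor([[
        [scalex * math.cos(angle), -scaley * math.sin(angle), (2 * cx - W - 1) / (W - 1)],
        [scalex * math.sin(angle),  scaley * math.cos(angle), (2 * cy - H - 1) / (H - 1)],
    ]], dtype=torch.float32)
    grid = F.affine_grid(theta, torch.Size((1, 3, out_h, out_w)))
    return F.grid_sample(image, grid)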
for box in boxes:
    pts = box[0:8]
    pts = pts.reshape(4, -1)

    # det_text, conf, dec_s = ocr_image(net, codec, im_data, box)
    det_text, conf, dec_s = align_ocr(net, converter, im_data, box, features, debug=0)

    if len(det_text) == 0:
        continue

    # note: ImageDraw.textsize was removed in Pillow 10; use textbbox/textlength there
    width, height = draw.textsize(det_text, font=font2)
    center = [box[0], box[1]]
    draw.text((center[0], center[1]), det_text, fill=(0, 255, 0), font=font2)
    out_boxes.append(box)
    print(det_text)

im = np.array(img)
for box in out_boxes:
    pts = box[0:8]
    pts = pts.reshape(4, -1)
    draw_box_points(im, pts, color=(0, 255, 0), thickness=1)

cv2.imshow('img', im)
basename = os.path.basename(path)
cv2.imwrite(os.path.join(args.output, basename), im)
cv2.waitKey(1000)
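# The PIL objects used above (`img`, `draw`, `font2`) are created earlier in the
# demo script. A minimal sketch of the presumed setup; the font file name is an
# assumption, not taken from the repo:
def example_text_overlay(im_bgr):
    img = Image.fromarray(im_bgr[:, :, ::-1])  # OpenCV BGR -> PIL RGB
    draw = ImageDraw.Draw(img)
    font2 = ImageFont.truetype('Arial-Unicode-Regular.ttf', 18)  # hypothetical path
    draw.text((10, 10), 'text', fill=(0, 255, 0), font=font2)
    return np.array(img)[:, :, ::-1]  # back to BGR for OpenCV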
def process_splits(trans, word_splits, conf, splits, start, ctc_f, rot_mat,
                   angle, box_points, w, h, draw, is_dict, debug=False):
    '''
    Summary : Split the transcription and the corresponding bounding-box based on
    the spaces predicted by the recognizer FCN.

    Parameters
    ----------
    trans : string
        Predicted transcription for the corresponding predicted bounding-box.
    word_splits : list
        Words obtained by splitting the transcription on predicted spaces.
    conf : list
        Sum of confidences of all characters from the recognizer FCN, together
        with the start and end positions of the transcription in the bounding-box.
    splits : list
        Indices of the spaces predicted by the recognizer FCN.
    start : matrix
        Start position of the transcription within the normalized strip.
    ctc_f : matrix
        Output of the recognizer FCN for the given input bounding-box.
    rot_mat : matrix
        Rotation matrix returned by the get_normalized_image function.
    angle : float
        Rotation angle of the bounding-box, in radians.
    box_points : matrix
        Corner points of the bounding-box predicted by the localization FCN in
        the original image.
    w, h : float
        Width and height of the normalized text strip.
    draw : matrix
        Input image.
    is_dict : bool
        Whether the transcription is a dictionary word.
    debug : boolean
        Debug mode; if True, visualization boxes are generated.

    Returns
    -------
    boxes_out : list of tuples
        Predicted bounding-box parameters together with the predicted
        transcription and the mean confidence score from the recognizer.
    '''
    spl = word_splits
    boxout = np.copy(box_points)
    #draw_box_points(draw, boxout, color=(0, 255, 0), thickness=2)
    start_f = start[0, 0]
    mean_conf = conf[0, 0] / max(1, len(trans))  # Overall confidence of recognizer FCN
    boxes_out = []
    y = 0
    for s in range(len(spl)):
        text = spl[s]
        end_f = splits[0, s]
        if s < len(spl) - 1:
            try:
                if splits[0, s] > start_f:
                    end_f = splits[0, s]  # New end position of the word in the strip
            except IndexError:
                pass

        scalex = w / float(ctc_f.shape[1])
        poss = start_f * scalex
        pose = (end_f + 2) * scalex
        rect = [[poss, h], [poss, y], [pose, y], [pose, h]]
        rect = np.array(rect)

        # Map the word rectangle from the normalized strip back to the image
        int_t = rot_mat
        dst_rect = np.copy(rect)
        dst_rect[:, 0] = int_t[0, 0] * rect[:, 0] + int_t[0, 1] * rect[:, 1] + int_t[0, 2]
        dst_rect[:, 1] = int_t[1, 0] * rect[:, 0] + int_t[1, 1] * rect[:, 1] + int_t[1, 2]
        dst_rect[:, 0] += boxout[1, 0]
        dst_rect[:, 1] += boxout[1, 1]

        if debug:
            draw_box_points(draw, dst_rect, color=(0, 255, 0))
            cv2.imshow('draw', draw)
            cv2.waitKey(0)

        boxes_out.append((dst_rect, [text, mean_conf, is_dict]))
        start_f = end_f + 1

    return boxes_out
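# The per-coordinate mapping above is a 2D affine transform applied point-wise.
# An equivalent, more compact back-projection using cv2.transform; a sketch only
# (not called by the pipeline), assuming `rect` is an (N, 2) float array and
# `rot_mat` the 2x3 matrix from cv2.getRotationMatrix2D:
def example_project_rect(rect, rot_mat, origin):
    dst = cv2.transform(rect.reshape(-1, 1, 2).astype(np.float32), rot_mat)
    return dst.reshape(-1, 2) + origin  # translate back to the box origin in the image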
def evaluate_image(img, detections, gt_rect, gt_txts, iou_th=0.5,
                   iou_th_vis=0.5, iou_th_eval=0.5, eval_text_length=1):
    '''
    Summary : Returns end-to-end true-positives, detection true-positives, and
    the number of GT entries considered for evaluation (len >= eval_text_length).

    Description : Each predicted bounding-box is compared with each GT entry, and
    the numbers of end-to-end true positives, detection true positives, and GT
    entries considered for evaluation are computed.

    Parameters
    ----------
    img : matrix
        Input image; matched and unmatched boxes are drawn into it.
    detections : tuple of tuples
        Predicted bounding boxes along with transcriptions and text/no-text scores.
    gt_rect : numpy array
        Ground-truth bounding boxes, 8 coordinates per row.
    gt_txts : list
        Ground-truth transcriptions.
    iou_th : float
        Threshold on the intersection-over-union between a GT and a prediction.
    iou_th_vis : float
        IoU threshold used for visualization when the transcription is correct
        but the IoU is lower.
    iou_th_eval : float
        IoU threshold used for evaluation of predicted bounding-boxes.
    eval_text_length : int
        Minimum transcription length for a GT entry to be evaluated.

    Returns
    -------
    tp : int
        Number of predicted bounding-boxes whose IoU with a GT exceeds iou_th_eval.
    tp_e2e : int
        Number of predicted bounding-boxes whose transcription matches the GT.
    gt_e2e : int
        Number of GT entries with transcription length >= eval_text_length.
    tp_e2e_ed1 : int
        Like tp_e2e, but allowing an edit distance of at most 1.
    detection_to_gt : dict
        Maps each matched detection index to [gt_no, ratio, edit_dist].
    '''

    gt_to_detection = {}
    detection_to_gt = {}
    tp = 0
    tp_e2e = 0
    tp_e2e_ed1 = 0
    gt_e2e = 0

    gt_matches = np.zeros(gt_rect.shape[0])
    gt_matches_ed1 = np.zeros(gt_rect.shape[0])

    for i in range(0, len(detections)):
        det = detections[i]
        box = det[0]  # Predicted bounding-box parameters
        box = np.array(box, dtype="int")  # Convert predicted bounding-box to numpy array
        box = box[0:8].reshape(4, 2)

        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate

        det_text = det[1]  # Predicted transcription for bounding-box

        for gt_no in range(len(gt_rect)):
            gtbox = gt_rect[gt_no]
            txt = gt_txts[gt_no]  # GT transcription for given GT bounding-box
            gtbox = np.array(gtbox, dtype="int")
            gtbox = gtbox[0:8].reshape(4, 2)
            rect_gt = cv2.boundingRect(gtbox)

            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]  # Convert GT width to right-coordinate
            rect_gt[3] += rect_gt[1]  # Convert GT height to bottom-coordinate

            inter = intersect(bbox, rect_gt)  # Intersection of predicted and GT bounding-boxes
            uni = union(bbox, rect_gt)  # Union of predicted and GT bounding-boxes
            ratio = area(inter) / float(area(uni))  # IoU between predicted and GT bounding-boxes

            # 1) Visualize the predicted bounding-box if its IoU with the GT exceeds iou_th.
            # 2) Visualize the predicted bounding-box if the transcription matches the GT and 1) holds.
            # 3) Visualize the predicted bounding-box if the transcription matches and the IoU is
            #    below iou_th_vis while 1) and 2) hold.
            if ratio > iou_th:
                if not gt_no in gt_to_detection:
                    gt_to_detection[gt_no] = [0, 0]

                edit_dist = editdistance.eval(det_text.lower(), txt.lower())
                if edit_dist <= 1:
                    gt_matches_ed1[gt_no] = 1
                    draw_box_points(img, box, color=(0, 128, 0), thickness=2)

                if edit_dist == 0:  # det_text.lower().find(txt.lower()) != -1:
                    draw_box_points(img, box, color=(0, 255, 0), thickness=2)
                    gt_matches[gt_no] = 1  # Set to 1 when the predicted transcription is correct

                    if ratio < iou_th_vis:
                        #draw_box_points(draw, box, color=(255, 255, 255), thickness=2)
                        #cv2.imshow('draw', draw)
                        #cv2.waitKey(0)
                        pass

                tupl = gt_to_detection[gt_no]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = i
                    detection_to_gt[i] = [gt_no, ratio, edit_dist]

    # Count the number of end-to-end and detection true-positives
    for gt_no in range(gt_matches.shape[0]):
        gt = gt_matches[gt_no]
        gt_ed1 = gt_matches_ed1[gt_no]
        txt = gt_txts[gt_no]
        gtbox = gt_rect[gt_no]
        gtbox = np.array(gtbox, dtype="int")
        gtbox = gtbox[0:8].reshape(4, 2)

        if len(txt) >= eval_text_length and not txt.startswith('##'):
            gt_e2e += 1
            if gt == 1:
                tp_e2e += 1
            if gt_ed1 == 1:
                tp_e2e_ed1 += 1

        if gt_no in gt_to_detection:
            tupl = gt_to_detection[gt_no]
            if tupl[0] > iou_th_eval:  # Increment detection true-positive if IoU exceeds iou_th_eval
                if len(txt) >= eval_text_length and not txt.startswith('##'):
                    tp += 1
        #else:
        #    draw_box_points(img, gtbox, color=(255, 255, 255), thickness=2)

    for i in range(0, len(detections)):
        det = detections[i]
        box = det[0]  # Predicted bounding-box parameters
        box = np.array(box, dtype="int")  # Convert predicted bounding-box to numpy array
        box = box[0:8].reshape(4, 2)

        if not i in detection_to_gt:
            draw_box_points(img, box, color=(0, 0, 255), thickness=2)
        else:
            [gt_no, ratio, edit_dist] = detection_to_gt[i]
            if edit_dist > 0:
                draw_box_points(img, box, color=(255, 0, 0), thickness=2)

    #cv2.imshow('draw', draw)
    return tp, tp_e2e, gt_e2e, tp_e2e_ed1, detection_to_gt
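# evaluate_image (and process_boxes) rely on intersect/union/area helpers defined
# elsewhere in the repo. A hedged sketch of the IoU they are used to compute, for
# rectangles given as [left, top, right, bottom]; `iou_ltrb` is a hypothetical
# stand-in, not the repo's actual helper:
def iou_ltrb(a, b):
    iw = min(a[2], b[2]) - max(a[0], b[0])
    ih = min(a[3], b[3]) - max(a[1], b[1])
    if iw <= 0 or ih <= 0:
        return 0.0  # no overlap
    inter = iw * ih
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / float(area_a + area_b - inter)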
rot_mat = cv2.getRotationMatrix2D((0, 0), -angle * 180 / math.pi, 1)

# Process the splits and improve the localization
splits_raw = process_splits(det_text, word_splits, conf_raw, dec_s, conf2,
                            ctc_f, rot_mat, angle, boxr, w, h, im_resized, 0)

for spl in splits_raw:
    spl[1][0] = spl[1][0].strip()
    if len(spl[1][0]) >= eval_text_length:
        has_long = True
    boxw = spl[0]
    boxw[:, 0] /= im_scalex
    boxw[:, 1] /= im_scaley
    draw_box_points(img, boxw, color=(0, 255, 0))
    #cv2.imshow('img', img)
    #cv2.waitKey()
    #if conf_factor < 0.01:
    #    print('Skipping {0} - {1}'.format(spl[1][0], conf_factor))
    #    continue
    print('{0} - {1}'.format(spl[1][0], conf_factor))
    boxw = boxw.reshape(8)
    detections_out.append([boxw, spl[1][0]])

pix = img

if args.evaluate == 1:
    # detections_out = np.expand_dims(gt_rect, axis=1)  # this is only for spoofing the bbox w/ gt
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

import glob