import math
import os
import random
import time

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Project-local modules: visualization helpers, sequence decoding and the
# trie-based dictionary decoder. Module-level globals used below (rec_t,
# ext_factor, ext_factorx, image_size, buckets, codec_rev, debug, it,
# mean_loss, mean_rec, to_cls_x, to_cls_y, out_raw) and the helpers
# get_obox, print_seq_ext, print_seq2, intersect, union, area are defined
# elsewhere in the project.
import cmp_trie
import utils
import vis


def get_normalized_image(img, rr, debug=False):
    """Crop the rotated rectangle `rr` from `img` and warp it upright."""
    box = cv2.boxPoints(rr)
    extbox = cv2.boundingRect(box)
    if extbox[2] * extbox[3] > img.shape[0] * img.shape[1]:
        print("Too big proposal: {0}x{1}".format(extbox[2], extbox[3]))
        return None, None
    # Convert (x, y, w, h) to (x1, y1, x2, y2) and clamp to the image bounds.
    extbox = [extbox[0], extbox[1], extbox[2], extbox[3]]
    extbox[2] += extbox[0]
    extbox[3] += extbox[1]
    extbox = np.array(extbox, int)
    extbox[0] = max(0, extbox[0])
    extbox[1] = max(0, extbox[1])
    extbox[2] = min(img.shape[1], extbox[2])
    extbox[3] = min(img.shape[0], extbox[3])
    tmp = img[extbox[1]:extbox[3], extbox[0]:extbox[2]]
    center = (tmp.shape[1] / 2, tmp.shape[0] / 2)
    rot_mat = cv2.getRotationMatrix2D(center, rr[2], 1)
    if tmp.shape[0] == 0 or tmp.shape[1] == 0:
        return None, rot_mat
    if debug:
        vis.draw_box_points(img, np.array(extbox, dtype="int"),
                            color=(0, 255, 0))
        cv2.imshow('scaled', img)
    # Translate so the rotated crop is centered in the output canvas.
    rot_mat[0, 2] += rr[1][0] / 2.0 - center[0]
    rot_mat[1, 2] += rr[1][1] / 2.0 - center[1]
    try:
        norm_line = cv2.warpAffine(tmp, rot_mat,
                                   (int(rr[1][0]), int(rr[1][1])),
                                   borderMode=cv2.BORDER_REPLICATE)
    except cv2.error:
        return None, rot_mat
    return norm_line, rot_mat
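# A minimal usage sketch for get_normalized_image. The image path and corner
# points are hypothetical; in the pipeline the rotated rectangle comes from
# the localization net, here it is built with cv2.minAreaRect instead.
def _example_normalize(image_path='sample.jpg'):
    img = cv2.imread(image_path)
    pts = np.array([[120, 80], [300, 90], [295, 140], [115, 130]], np.float32)
    rr = cv2.minAreaRect(pts)  # ((cx, cy), (w, h), angle), like the net output
    patch, rot_mat = get_normalized_image(img, rr)
    if patch is not None:
        cv2.imwrite('patch.png', patch)  # upright crop of the rotated region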
def test_video(nets):
    global rec_t, image_size
    cap = cv2.VideoCapture(
        '/mnt/textspotter/evaluation-sets/icdar2013-video-Test/Video_35_2_3.mp4')
    cap = cv2.VideoCapture(-1)  # overrides the file above with the default camera
    font = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/UbuntuMono-R.ttf", 16)
    font2 = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/Ubuntu-B.ttf", 18)
    ret, im = cap.read()
    fourcc = cv2.VideoWriter_fourcc(*'X264')
    out = cv2.VideoWriter('/tmp/output.avi', fourcc, 20.0,
                          (im.shape[1], im.shape[0]))
    frame_no = 0
    while ret:
        # Round the working resolution down to a multiple of 64.
        image_size = [640 // 64 * 64, 480 // 64 * 64]
        ret, im = cap.read()
        if ret:
            scaled = cv2.resize(im, (image_size[0], image_size[1]))
            if nets[0].blobs['data'].data[...].shape[1] == 1:
                scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))
            detections_out, fps = froward_image(nets, scaled, im)
            img = Image.fromarray(im)
            draw = ImageDraw.Draw(img)
            for detection in detections_out:
                text = detection[1][0]
                print(text)
                width, height = draw.textsize(text, font=font)
                sx = int(detection[0][0][0] - width / 2)
                ex = int(detection[0][0][0] + width / 2)
                sy = int(detection[0][0][1] - 10)
                ey = int(detection[0][0][1] + 10)
                # Darken the strip behind the text so the overlay stays legible.
                im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2
                boxr = ((detection[0][0][0], detection[0][0][1]),
                        (detection[0][1][0], detection[0][1][1]),
                        detection[0][2])
                box = cv2.boxPoints(boxr)
                vis.draw_box_points(im, box, (0, 255, 0), thickness=1)
            img = Image.fromarray(im)
            draw = ImageDraw.Draw(img)
            draw.text((10, 10), 'FPS: {0:.2f}'.format(fps), (0, 255, 0),
                      font=font2)
            frame_no += 1
            if frame_no < 30:
                draw.text((image_size[1] / 2 - 150, image_size[0] / 2 - 100),
                          'Raw Detections with Dictionary', (0, 0, 255),
                          font=font2)
            for detection in detections_out:
                text = detection[1][0]
                width, height = draw.textsize(text, font=font)
                center = [detection[0][0][0] - width / 2,
                          detection[0][0][1] - 10]
                draw.text((center[0], center[1]), text, fill=(0, 255, 0),
                          font=font)
            pix = np.array(img)
            cv2.imshow('draw', scaled)
            # if pix.shape[0] > 1024:
            pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))
            cv2.imshow('pix', pix)
            # Note: the writer was opened at full frame size, so the half-size
            # frame written here may be dropped by some backends.
            out.write(pix)
            cv2.waitKey(10)
    out.release()
def test_image(nets):
    global rec_t, image_size
    img_dir = "src/images"
    #img_dir = "test"
    imgs = [os.path.join(img_dir, f) for f in os.listdir(img_dir)]
    font = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/UbuntuMono-R.ttf", 16)
    font2 = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/Ubuntu-B.ttf", 18)
    for img in imgs:
        im = cv2.imread(img)
        image_size = [640 // 64 * 64, 480 // 64 * 64]
        scaled = cv2.resize(im, (image_size[0], image_size[1]))
        if nets[0].blobs['data'].data[...].shape[1] == 1:
            scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
            scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))
        detections_out, fps = froward_image(nets, scaled, im)
        img = Image.fromarray(im)
        draw = ImageDraw.Draw(img)
        text_list = []
        for detection in detections_out:
            text = detection[1][0]
            text = text.encode('ascii', 'ignore')
            text_list.append(text)
            width, height = draw.textsize(text, font=font)
            sx = int(detection[0][0][0] - width / 2)
            ex = int(detection[0][0][0] + width / 2)
            sy = int(detection[0][0][1] - 10)
            ey = int(detection[0][0][1] + 10)
            # Darken the strip behind the text so the overlay stays legible.
            im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2
            boxr = ((detection[0][0][0], detection[0][0][1]),
                    (detection[0][1][0], detection[0][1][1]),
                    detection[0][2])
            box = cv2.boxPoints(boxr)
            vis.draw_box_points(im, box, (0, 255, 0), thickness=1)
        img = Image.fromarray(im)
        draw = ImageDraw.Draw(img)
        draw.text((10, 10), 'FPS: {0:.2f}'.format(fps), (0, 255, 0), font=font2)
        draw.text((image_size[1] / 2 - 150, image_size[0] / 2 - 100),
                  'Raw Detections with Dictionary', (0, 0, 255), font=font2)
        print('detected text:', text_list)
        center_list = []
        for detection in detections_out:
            text = detection[1][0]
            width, height = draw.textsize(text, font=font)
            center = [detection[0][0][0] - width / 2,
                      detection[0][0][1] - 10]
            center_list.append(center)
            draw.text((center[0], center[1]), text, fill=(0, 255, 0), font=font)
        # Sort detection centers left-to-right, then top-to-bottom.
        sorted_center = sorted(center_list, key=lambda x: (x[0], x[1]))
        print('sorted centers:', sorted_center)
        pix = np.array(img)
        cv2.imshow('draw', scaled)
        # if pix.shape[0] > 1024:
        pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))
        cv2.imshow('pix', pix)
        cv2.waitKey(0)
def process_batch(nets, optim, optim2, image_size, args):
    global it, mean_loss, mean_rec
    net, net_ctc = nets
    net = net.net
    net_ctc = net_ctc.net

    # Reshape the localization net for one batch of input images.
    net.blobs['data'].reshape(args.batch_size, 1, image_size[1], image_size[0])
    net.reshape()
    it += 1
    optim2.step(1)

    im = net.blobs['data'].data[...]  # shape: (batch_size, 1, H, W)
    draw = np.swapaxes(im, 2, 3)
    draw = np.swapaxes(draw, 1, 3)
    im_ctc = np.copy(draw)
    draw += 1
    draw *= 128
    draw = np.array(draw, dtype="uint8").copy()

    if args.debug:
        grid_step = 16
        line = 0
        while line < image_size[0]:
            cv2.line(draw[0], (0, line), (image_size[1], line), (128, 128, 128))
            line += grid_step

    boxes = net.blobs['boxes'].data[...]          # e.g. (batch, 1, 500, 15)
    word_gtob = net.blobs['gt_boxes'].data[...]   # e.g. (batch, 6, 1, 6)
    word_txt = net.blobs['gt_labels'].data[...]   # e.g. (batch, 6, 1, 14)
    lines_gtob = net.blobs['line_boxes'].data[...]
    lines_txt = net.blobs['line_labels'].data[...]

    #nms = boxeso[:, 0, 0, 8] == 0
    #boxes = boxes[:, :, nms, :]
    # De-normalize the predicted box coordinates to pixels.
    boxes[:, 0, :, 0] *= image_size[0]
    boxes[:, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[:, 0, :, 2] *= normFactor
    boxes[:, 0, :, 3] *= normFactor

    sum_cost = 0
    count = 0
    labels_gt = []
    labels_det = []
    gt_to_detection = {}
    net_ctc.clear_param_diffs()
    batch_buckets = []
    dummy = {}
    matched_detections = 0
    gt_count = 0  # accumulated over the whole batch; used for loc_recall below

    for bid in range(im.shape[0]):  # each sample in the batch
        o_image = net.layers[0].get_image_file_name(bid)
        o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
        cx = net.layers[0].get_crop(bid, 0)
        cy = net.layers[0].get_crop(bid, 1)
        cmx = net.layers[0].get_crop(bid, 2)
        cmy = net.layers[0].get_crop(bid, 3)
        o_image = o_image[cy:cmy, cx:cmx]

        boxes_count = 0
        for i in range(0, boxes.shape[2]):
            det_word = boxes[bid, 0, i]
            if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
                break
            boxes_count += 1

        x = list(range(boxes_count))
        #random.shuffle(x)

        bucket_images = {}
        batch_buckets.append(bucket_images)

        word_gto = word_gtob[bid]
        word_gto_txt = word_txt[bid]
        for gt_no in range(word_gto.shape[0]):
            gt = word_gto[gt_no, :]
            gt = gt.reshape(6)
            gtnum = 1000 * bid + gt_no
            if gt[5] == -1:
                #print("ignore gt!")
                continue
            gt_count += 1
            txt = word_gto_txt[gt_no, :]
            gtbox = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                     (gt[2] * normFactor, gt[3] * normFactor),
                     gt[4] * 180 / 3.14)
            gtbox = cv2.boxPoints(gtbox)
            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)

            # Skip GT boxes touching the image border.
            if rect_gt[0] == 0 or rect_gt[1] == 0 \
                    or rect_gt[0] + rect_gt[2] >= image_size[0] \
                    or rect_gt[1] + rect_gt[3] >= image_size[1]:
                continue
            if gt[3] * normFactor < 3:
                if args.debug:
                    #print('too small gt!')
                    vis.draw_box_points(draw[bid], gtbox, color=(255, 255, 0))
                    cv2.imshow('draw', draw[bid])
                continue

            if args.debug:
                vis.draw_box_points(draw[bid], gtbox, color=(0, 0, 0), thickness=2)
                #vis.draw_box_points(draw[bid], gtbox, color=(255, 255, 255))
                #cv2.imshow('draw', draw[bid])

            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]
            rect_gt[3] += rect_gt[1]

            for i in range(0, min(100, boxes_count)):
                det_word = boxes[bid, 0, x[i], :]
                if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
                    break
                # Skip detections whose angle differs too much from the GT
                # (fetched after det_word is read for this iteration).
                if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
                    continue

                box = ((det_word[0], det_word[1]),
                       (det_word[2], det_word[3]),
                       det_word[4] * 180 / 3.14)
                box = cv2.boxPoints(box)
                if args.debug:
                    boxp = np.array(box, dtype="int")
                    vis.draw_box_points(draw[bid], boxp, color=(0, 255, 0))
                box = np.array(box, dtype="int")
                bbox = cv2.boundingRect(box)
                bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
                bbox[2] += bbox[0]
                bbox[3] += bbox[1]

                # rectangle intersection ...
                inter = intersect(bbox, rect_gt)
                uni = union(bbox, rect_gt)
                ratio = area(inter) / float(area(uni))
                ratio_gt = area(inter) / float(area(rect_gt))
                if ratio_gt < 0.95:
                    continue
                if ratio < 0.5:
                    continue

                if gtnum not in gt_to_detection:
                    gt_to_detection[gtnum] = [0, 0, 0]
                tupl = gt_to_detection[gtnum]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = x[i]
                    tupl[2] = ratio_gt

                det_word = boxes[bid, 0, x[i], :]
                box = ([det_word[0], det_word[1]],
                       [det_word[2], det_word[3]],
                       det_word[4] * 180 / 3.14)
                boxO = get_obox(im_ctc[bid], o_image, box)
                boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]),
                        boxO[2])
                norm2, rot_mat = get_normalized_image(o_image, boxO)
                #norm3, rot_mat = get_normalized_image(im_ctc[bid], ([det_word[0], det_word[1]], [det_word[2] * 1.2, det_word[3] * 1.1], det_word[4] * 180 / 3.14))
                if norm2 is None:
                    continue

                # Map the crop to the closest width bucket (height fixed at 32).
                width_scale = 32.0 / norm2.shape[0]
                width = norm2.shape[1] * width_scale
                best_diff = width
                bestb = 0
                for b in range(0, len(buckets)):
                    if best_diff > abs(width * 1.3 - buckets[b]):
                        best_diff = abs(width * 1.3 - buckets[b])
                        bestb = b

                scaled = cv2.resize(norm2, (buckets[bestb], 32))
                scaled = np.asarray(scaled, dtype=float)
                delta = scaled.max() - scaled.min()
                scaled = scaled / (delta / 2)
                scaled -= scaled.mean()

                if bestb not in bucket_images:
                    bucket_images[bestb] = {}
                    bucket_images[bestb]['img'] = []
                    bucket_images[bestb]['sizes'] = []
                    bucket_images[bestb]['txt'] = []
                    bucket_images[bestb]['gt_enc'] = []
                    dummy[bestb] = 1
                else:
                    # Cap the number of crops collected per bucket.
                    if args.debug and len(bucket_images[bestb]['img']) > 4:
                        continue
                    elif len(bucket_images[bestb]['img']) > 32:
                        continue

                gt_labels = []
                txt_enc = ''
                for k in range(txt.shape[1]):
                    if txt[0, k] > 0:
                        if txt[0, k] in codec_rev:
                            gt_labels.append(codec_rev[txt[0, k]])
                        else:
                            gt_labels.append(3)  # label 3 = unknown character
                        txt_enc += unichr(txt[0, k])
                    else:
                        gt_labels.append(0)  # label 0 = blank

                if scaled.ndim == 3:
                    scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                if args.debug:
                    cv2.imshow('scaled', scaled)
                bucket_images[bestb]['sizes'].append(len(gt_labels))
                bucket_images[bestb]['gt_enc'].append(gt_labels)
                bucket_images[bestb]['txt'].append(txt_enc)
                bucket_images[bestb]['img'].append(scaled)
                matched_detections += 1

        # ... and learn OCR on the crops gathered for this sample.
        for bucket in bucket_images.keys():
            imtf = np.asarray(bucket_images[bucket]['img'], dtype=float)
            imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))
            #imtf = imtf.reshape((imtf.shape[0], imtf.shape[1], imtf.shape[2], 1))
            #imtf = np.swapaxes(imtf, 1, 3)
            net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                          imtf.shape[2], imtf.shape[3])
            net_ctc.blobs['data'].data[...] = imtf

            labels = bucket_images[bucket]['gt_enc']
            txt = bucket_images[bucket]['txt']
            max_len = 0
            for l in range(0, len(labels)):
                max_len = max(max_len, len(labels[l]))
            for l in range(0, len(labels)):
                while len(labels[l]) < max_len:
                    labels[l].append(0)  # pad with blanks
            labels = np.asarray(labels, float)
            net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])
            net_ctc.blobs['label'].data[...] = labels

            if args.debug:
                vis.vis_square(imtf[0])
                cv2.imshow('draw', draw[0])
                cv2.waitKey(5)

            optim.step(1)
            sum_cost += net_ctc.blobs['loss'].data[...]
            if net_ctc.blobs['loss'].data[...] > 10:
                vis.vis_square(imtf[0])
                cv2.imshow('draw', draw[0])
                sf = net_ctc.blobs['transpose'].data[...]
                labels2 = sf.argmax(3)
                out = utils.print_seq(labels2[:, 0, :])
                print(u'{0} - {1}'.format(out, txt[0]))
                cv2.waitKey(5)
            count += imtf.shape[0]

    # labels_gt / labels_det are not filled in this version, so the word
    # recognition rate below stays at its running average.
    correct_cout = 0
    for i in range(len(labels_gt)):
        det_text = labels_det[i]
        gt_text = labels_gt[i]
        if it % 100 == 0:
            print(u"{0} - {1}".format(det_text, gt_text).encode('utf8'))
        if det_text == gt_text:
            correct_cout += 1

    count = max(count, 1)
    mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
    mean_rec = mean_rec * 0.99 + \
        0.01 * correct_cout / float(max(1, len(labels_gt)))

    # count detection ratio
    tp = 0
    for bid in range(im.shape[0]):
        word_gto = word_gtob[bid]
        for gt_no in range(len(word_gto)):
            gt = word_gto[gt_no]
            gtnum = 1000 * bid + gt_no
            if gtnum in gt_to_detection:
                tupl = gt_to_detection[gtnum]
                if tupl[0] > 0.5:
                    tp += 1
    loc_recall = tp / float(max(1, gt_count))

    if args.debug:
        cv2.imshow('draw', draw[0])
        if im.shape[0] > 1:
            cv2.imshow('draw2', draw[1])
        cv2.waitKey(10)

    if it % 10 == 0:
        print('{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'.format(
            it, 0.0001, sum_cost / count, mean_loss,
            correct_cout / float(max(1, len(labels_gt))),
            mean_rec, loc_recall, matched_detections))
    if it % 1000 == 0:
        optim.snapshot()
        optim2.snapshot()
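# `codec_rev` above maps a character code point to its CTC label index, with
# the lowest indices reserved for blank / special labels (see the `labels > 2`
# and `labels > 3` masks used at decode time). It is built elsewhere in the
# project; a hypothetical construction consistent with this usage:
def _build_codec_rev(alphabet=u'0123456789abcdefghijklmnopqrstuvwxyz'):
    # `alphabet` is a placeholder; the project's real codec covers more symbols.
    return dict((ord(ch), i + 4) for i, ch in enumerate(alphabet))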
def evaluate_image(batch, detections, word_gto, iou_th=0.3, iou_th_vis=0.5,
                   iou_th_eval=0.4):
    '''
    Summary : Returns end-to-end true-positives, detection true-positives,
              and the number of GT entries to be considered for eval (len > 2).
    Description : Each predicted bounding-box is compared with each GT entry.
              The numbers of end-to-end true positives, detection true
              positives, and GT entries considered for evaluation are computed.

    Parameters
    ----------
    iou_th_eval : float
        Intersection-over-union threshold used for evaluation of predicted
        bounding-boxes.
    iou_th_vis : float
        Intersection-over-union threshold used for visualization when the
        transcription is correct but the IoU is lower.
    iou_th : float
        Intersection-over-union threshold between GT and prediction.
    word_gto : list of lists
        Ground-truth bounding boxes along with transcriptions.
    batch : list of lists
        Data for the batch (input image, image file name, ground truth).
    detections : tuple of tuples
        Predicted bounding boxes along with transcriptions and text/no-text
        scores.

    Returns
    -------
    tp : int
        Number of predicted bounding-boxes with IoU greater than iou_th_eval.
    tp_e2e : int
        Number of predicted bounding-boxes with the same transcription as the
        GT and len > 2.
    gt_e2e : int
        Number of GT entries with transcription len > 2.
    '''
    gt_to_detection = {}
    tp = 0
    tp_e2e = 0
    gt_e2e = 0
    draw = batch[4][0]
    normFactor = math.sqrt(draw.shape[1] * draw.shape[1] +
                           draw.shape[0] * draw.shape[0])  # Normalization factor

    for i in range(0, len(detections)):
        det = detections[i]
        boxr = det[0]
        box = cv2.boxPoints(boxr)  # Predicted bounding-box corner points
        box = np.array(box, dtype="int")
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate
        vis.draw_box_points(draw, box, color=(255, 0, 0))
        det_text = det[1][0]  # Predicted transcription for bounding-box
        #print(det_text)

        for gt_no in range(len(word_gto)):
            gt = word_gto[gt_no]
            txt = gt[5]  # GT transcription for given GT bounding-box
            gtbox = ((gt[0] * draw.shape[1], gt[1] * draw.shape[0]),
                     (gt[2] * normFactor, gt[3] * normFactor),
                     gt[4] * 180 / 3.14)  # Re-scale GT values
            gtbox = cv2.boxPoints(gtbox)
            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)
            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]  # Convert GT width to right-coordinate
            rect_gt[3] += rect_gt[1]  # Convert GT height to bottom-coordinate

            inter = intersect(bbox, rect_gt)  # Intersection of predicted and GT boxes
            uni = union(bbox, rect_gt)        # Union of predicted and GT boxes
            ratio = area(inter) / float(area(uni))  # IoU between predicted and GT boxes

            # 1) Visualize the predicted bounding-box if its IoU with the GT
            #    exceeds iou_th (always required).
            # 2) Visualize it in green if the transcription also matches the GT.
            # 3) Visualize it in white if the transcription matches but the IoU
            #    is below iou_th_vis.
            if ratio > iou_th:
                vis.draw_box_points(draw, box, color=(0, 128, 0))
                if gt_no not in gt_to_detection:
                    gt_to_detection[gt_no] = [0, 0]
                if txt.lower() == det_text.lower():
                    to_cls_x.append([len(det_text), det[1][1], det[1][2], det[1][3]])
                    to_cls_y.append(1)
                    vis.draw_box_points(draw, box, color=(0, 255, 0), thickness=2)
                    gt[7] = 1  # Set to 1 when the predicted transcription is correct.
                    if ratio < iou_th_vis:
                        vis.draw_box_points(draw, box, color=(255, 255, 255), thickness=2)
                        cv2.imshow('draw', draw)
                        #cv2.waitKey(0)
                else:
                    to_cls_x.append([len(det_text), det[1][1], det[1][2], det[1][3]])
                    to_cls_y.append(0)
                tupl = gt_to_detection[gt_no]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = i

    # Count the number of end-to-end and detection true-positives.
    for gt_no in range(len(word_gto)):
        gt = word_gto[gt_no]
        txt = gt[5]
        if len(txt) > 2:
            gt_e2e += 1
            if gt[7] == 1:
                tp_e2e += 1
        if gt_no in gt_to_detection:
            tupl = gt_to_detection[gt_no]
            if tupl[0] > iou_th_eval:
                # Detection true-positive if the IoU exceeds iou_th_eval.
                tp += 1

    cv2.imshow('draw', draw)
    return tp, tp_e2e, gt_e2e
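# `intersect`, `union`, and `area` above come from the project's utility
# module. A minimal sketch consistent with how they are called here
# (rectangles as [x1, y1, x2, y2]); note that with a bounding-hull union the
# ratio is intersection-over-hull, which only approximates strict IoU, and
# the project's own helpers may differ.
def _area(r):
    return max(0, r[2] - r[0]) * max(0, r[3] - r[1])

def _intersect(a, b):
    return [max(a[0], b[0]), max(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])]

def _union(a, b):
    return [min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])]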
def froward_image(nets, scaled, original):
    '''
    :param nets: (localization net, CTC recognizer net)
    :param scaled: resized (and possibly grayscale) input image
    :param original: original image
    :return: detections_out: list of (box, (text, confidence, is_dict, iou))
        tuples, where box = (center (x, y), (width, height), rotation angle),
        e.g. (((1181.95, 174.54), (116.46, 19.8), -2.39),
              (u'FORQUEuEING', 0.885, True, 0))
        fps: frames per second of the localization forward pass
    '''
    global rec_t, ext_factor, ext_factorx
    net, net_ctc = nets
    img = [scaled]
    #draw = img[0]
    #imgo = original
    im = np.asarray(img, dtype=float)
    im = im / 128.0
    im = im - 1.0
    #im = im.reshape((3, im.shape[0], im.shape[1]))
    im = np.swapaxes(im, 1, 3)
    im = np.swapaxes(im, 2, 3)

    net.blobs['data'].reshape(im.shape[0], im.shape[1], im.shape[2], im.shape[3])
    net.blobs['data'].data[...] = im
    net.reshape()

    start = time.time()
    out = net.forward(start="conv1")
    seconds = time.time() - start
    fps = 1 / seconds

    boxes = out['boxes']  # (1, 1, 500, 15): 500 anchors
    # De-normalize the box coordinates to pixels.
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms, :]

    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.1:
            break
        boxes_count += 1

    detections_out = []
    # Run recognition on every box that survives NMS with score >= 0.1.
    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)  # predicted x, y, w, h, angle (deg)
        box = cv2.boxPoints(boxr)  # four corner points
        box = np.array(box, dtype="int")
        #vis.draw_box_points(draw, box, (255, 0, 0))
        bbox = cv2.boundingRect(box)  # axis-aligned rect: x, y, w, h
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]  # note: bbox is not used further down

        # Extend the box slightly; this helps the recognizer.
        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(img[0], original, boxro)
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        norm2, rot_mat = get_normalized_image(original, boxt)
        if norm2 is None:
            continue
        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)

        # Map the crop to the closest width bucket (height fixed at 32).
        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
            if best_diff > abs(width - buckets[b]):
                best_diff = abs(width - buckets[b])
                bestb = b

        scaled = cv2.resize(norm, (buckets[bestb], 32))
        imtf = np.asarray([scaled], dtype=float)
        delta = imtf.max() - imtf.min()
        imtf /= (delta / 2)
        imtf -= imtf.mean()
        imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf
        outctc = net_ctc.forward()  # ['loss', 'softmax']

        ctc_f = outctc['softmax']  # e.g. shape (48, 1, 1, 141)
        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(2)  # (48, 1)
        mask = labels > 2
        masked = ctc_f.max(2)[mask]
        mean_conf = np.sum(masked) / masked.shape[0]

        if mean_conf < 0.2:
            vis.draw_box_points(scaled, box, color=(0, 0, 0))
            continue
        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))
        # det_text is the raw recognized string.
        if len(det_text) == 0:
            continue
        if len(det_text) < 3 and mean_conf < 0.8:
            continue

        splits_raw = process_splits(det_text, conf, dec_s, norm2, ctc_f,
                                    rot_mat, boxt, original, 0, mean_conf,
                                    alow_non_dict=True)
        detections_out.extend(splits_raw)

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])  # dictionary-decoded string (may be empty)
        if len(best_dict) == 0:
            continue
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2,
                                    ctc_f, rot_mat, boxt, original, 1,
                                    mean_conf)
        detections_out.extend(splits_out)

    return detections_out, fps
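# Illustrative sketch of wiring the two Caffe nets into froward_image. The
# prototxt / caffemodel paths and the image name are placeholders, not the
# project's actual files.
def _example_end_to_end(image_path='sample.jpg'):
    import caffe
    global image_size
    image_size = [640 // 64 * 64, 480 // 64 * 64]  # multiples of 64
    net = caffe.Net('tiny.prototxt', 'model.caffemodel', caffe.TEST)  # localization FCN
    net_ctc = caffe.Net('ctc.prototxt', 'model_ctc.caffemodel', caffe.TEST)  # recognizer FCN
    im = cv2.imread(image_path)
    scaled = cv2.resize(im, (image_size[0], image_size[1]))
    detections, fps = froward_image((net, net_ctc), scaled, im)
    for boxt, (text, conf, is_dict, _) in detections:
        print(u'{0} ({1:.2f})'.format(text, conf))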
def test_pic(nets):
    global rec_t, image_size
    font = ImageFont.truetype(
        "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf", 16)
    font2 = ImageFont.truetype(
        "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf", 18)
    impath = "images/demo.jpg"
    im = cv2.imread(impath)
    image_size = [640 // 64 * 64, 480 // 64 * 64]
    scaled = cv2.resize(im, (image_size[0], image_size[1]))
    # Convert to grayscale if the net expects a single channel.
    if nets[0].blobs['data'].data[...].shape[1] == 1:
        scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))
    # Detection & recognition.
    detections_out, fps = froward_image(nets, scaled, im)
    img = Image.fromarray(im)
    draw = ImageDraw.Draw(img)
    for detection in detections_out:
        text = detection[1][0]
        print(text)
        # textsize returns (width, height) of the string in pixels.
        width, height = draw.textsize(text, font=font)
        sx = int(detection[0][0][0] - width / 2)
        ex = int(detection[0][0][0] + width / 2)
        sy = int(detection[0][0][1] - 10)
        ey = int(detection[0][0][1] + 10)
        im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2
        boxr = ((detection[0][0][0], detection[0][0][1]),
                (detection[0][1][0], detection[0][1][1]),
                detection[0][2])
        # boxPoints returns a numpy array with the 4 corner points
        # [[x, y], ...] of the rect (center (x, y), (width, height), angle).
        box = cv2.boxPoints(boxr)
        vis.draw_box_points(im, box, (0, 255, 0), thickness=1)
    img = Image.fromarray(im)
    draw = ImageDraw.Draw(img)
    draw.text((10, 10), 'FPS: {0:.2f}'.format(fps), (0, 255, 0), font=font2)
    for detection in detections_out:
        text = detection[1][0]
        width, height = draw.textsize(text, font=font)
        center = [detection[0][0][0] - width / 2, detection[0][0][1] - 10]
        draw.text((center[0], center[1]), text, fill=(0, 255, 0), font=font)
    pix = np.array(img)
    if pix.shape[0] > 1024:
        pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))
    cv2.imwrite(impath + "_result_pix.jpg", pix)  # boxes with recognized text inside
def ocr_detections(net_ctc, img, scaled_img, boxes, image_size, r_p_th,
                   out_raw, baseName, debug, split_words, alow_non_dict=False):
    global rec_t, ext_factor, use_per_image
    draw = np.copy(scaled_img)

    # The region layer returns normalized coordinates; convert the generated
    # boxes to the image coordinate system.
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms_mask = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms_mask, :]

    # The region layer returns boxes sorted by r_p; stop at the first box
    # whose r_p falls below the threshold.
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < r_p_th:
            break
        boxes_count += 1

    detections_out = []
    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)  # rotation converted to degrees
        box = cv2.boxPoints(boxr)  # 4 corner points of the bounding-box
        box = np.array(box, dtype="int")
        if det_word[3] < 5:
            continue
        if debug:
            try:
                vis.draw_box_points(draw, box, (255, 0, 0))  # visualize predicted boxes
            except Exception:
                pass

        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate

        # Extend height and width of the box; this helps the recognizer.
        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(scaled_img, img, boxro)  # rescale to original image size
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        # norm2 holds the normalized crop of the original image under the
        # predicted bounding box.
        norm2, rot_mat = get_normalized_image(img, boxt)
        if norm2 is None:
            continue
        #boxt[2] = boxt[2] * 180 / 3.14
        #cv2.imshow('norm2', norm2)
        #cv2.imshow('draw', draw)
        if norm2.ndim > 2:
            norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)  # recognizer expects grayscale
        else:
            norm = norm2  # already grayscale

        # Fix the height at 32 and map the resulting width to the closest
        # bucket; buckets narrower than the text are penalized.
        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for idx, val in enumerate(buckets):
            if (val - width) < 0:
                bestb = idx
                best_diff = abs(val - width) * 3
                continue
            if best_diff > (val - width):
                bestb = idx
                best_diff = (val - width)

        scaled = cv2.resize(norm, (buckets[bestb], 32))  # input for the recognizer FCN
        if scaled.ndim == 3:
            scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        imtf = np.asarray([scaled], dtype=float)
        imtf /= 128.0
        imtf -= 1
        imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        # Reshape the recognizer FCN to the (varying) crop size and run it.
        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf
        net_ctc.forward()
        ctc_f = net_ctc.blobs['softmax'].data[...]
        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])

        # The last dimension holds the softmax distribution over characters
        # for each position; labels keeps the index of the most probable one.
        labels = ctc_f.argmax(2)
        mask = labels > 3
        masked = ctc_f.max(2)[mask]  # score of each predicted character
        mean_conf = np.sum(masked) / masked.shape[0]  # mean score over predicted characters

        # Skip boxes whose mean character score is below 0.3.
        if mean_conf < 0.3:
            continue
        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))

        if not split_words:
            detections_out.extend([(boxt, (det_text, mean_conf, 1, mean_conf))])
            continue
        #print(det_text)
        #str_lm, pr = cmp_trie.decode_sofmax_lm(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        #if det_text != str_lm:
        #    print(' Decoding diff: {0} - {1}'.format(det_text, str_lm))
        #    det_text = str_lm.strip()
        if len(det_text.strip()) == 0:
            continue
        if len(det_text.strip()) <= 3:
            if mean_conf < 0.6 or det_word[5] < 0.4:
                continue

        pr = 1
        for k in range(masked.shape[0]):
            pr = pr * masked[k]
        pr = math.exp(pr)
        #pr = math.pow(pr, 1.0 / len(det_text))
        #tex_conf = mean_conf / ctc_f.shape[0]
        #if tex_conf < 0.1:
        #    continue

        # Split the transcription on the spaces predicted by the recognizer
        # to improve the localization results.
        splits_raw = process_splits(det_text, conf, dec_s, norm2, ctc_f,
                                    rot_mat, boxt, img, det_word[5],
                                    mean_conf, alow_non_dict=alow_non_dict)
        detections_out.extend(splits_raw)

        spl = det_text.split(" ")
        if len(spl) == 1 and cmp_trie.is_dict(spl[0].lower().encode('utf-8')) == 1:
            continue

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])
        if out_raw is not None and len(det_text) > 2:
            boxout = cv2.boxPoints(boxt)
            out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format(
                baseName[:-4], boxout[0, 0], boxout[0, 1], boxout[1, 0], boxout[1, 1],
                boxout[2, 0], boxout[2, 1], boxout[3, 0], boxout[3, 1],
                det_text, best_dict, mean_conf).encode('utf8'))
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2,
                                    ctc_f, rot_mat, boxt, img, det_word[5],
                                    pr, alow_non_dict=False)
        detections_out.extend(splits_out)

    #detections_out = nms(detections_out)
    if out_raw is not None:
        out_raw.flush()
    cv2.imshow('draw', draw)
    cv2.waitKey(10)
    return detections_out
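# The bucket search above prefers the nearest bucket at least as wide as the
# rescaled crop and penalizes narrower buckets (difference * 3), so text is
# only squeezed when no wider bucket exists. An equivalent standalone sketch,
# assuming `buckets` is an ascending list of widths:
def _pick_bucket(width, buckets):
    best_diff, bestb = float('inf'), 0
    for idx, b in enumerate(buckets):
        diff = (b - width) if b >= width else abs(b - width) * 3
        if diff < best_diff:
            best_diff, bestb = diff, idx
    return bestb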
def random_crop(self, img, word_gto):
    # Sample random crop margins on each side of the image.
    xs = int(random.uniform(0, self.crop_ratio) * img.shape[1])
    xe = int(random.uniform(0, self.crop_ratio) * img.shape[1])
    maxx = img.shape[1] - xe
    ys = int(random.uniform(0, self.crop_ratio) * img.shape[0])
    ye = int(random.uniform(0, self.crop_ratio) * img.shape[0])
    maxy = img.shape[0] - ye

    crop_img = img[ys:maxy, xs:maxx]
    normo = math.sqrt(img.shape[0] * img.shape[0] +
                      img.shape[1] * img.shape[1])
    image_size = (crop_img.shape[1], crop_img.shape[0])
    normo2 = math.sqrt(image_size[1] * image_size[1] +
                       image_size[0] * image_size[0])
    o_size = (img.shape[1], img.shape[0])

    gt_out = []
    for gt_no in range(len(word_gto)):  # TODO - remove loop ... use numpy
        gt = word_gto[gt_no]
        gtbox = ((gt[0] * o_size[0], gt[1] * o_size[1]),
                 (gt[2] * normo, gt[3] * normo), gt[4] * 180 / 3.14)
        gtbox = cv2.boxPoints(gtbox)
        gtbox = np.array(gtbox, dtype="float")
        # Shift into crop coordinates and clip to the crop.
        gtbox[:, 0] -= xs
        gtbox[:, 1] -= ys
        gtbox[gtbox[:, 0] < 0, 0] = 0
        gtbox[gtbox[:, 1] < 0, 1] = 0
        gtbox[gtbox[:, 0] > maxx, 0] = maxx
        gtbox[gtbox[:, 1] > maxy, 1] = maxy

        dh = gtbox[0, :] - gtbox[1, :]
        dw = gtbox[1, :] - gtbox[2, :]
        centerx = np.sum(gtbox[:, 0]) / float(gtbox.shape[0])
        centery = np.sum(gtbox[:, 1]) / float(gtbox.shape[0])
        h = math.sqrt(dh[0] * dh[0] + dh[1] * dh[1]) / normo2
        w = math.sqrt(dw[0] * dw[0] + dw[1] * dw[1]) / normo2
        if w * normo2 < 2 or h * normo2 < 2 or np.isinf(w) or np.isinf(h):
            #print("warn: removing too small gt {0}".format(gt))
            continue

        # Re-normalize the (possibly clipped) box for the cropped image.
        gt[0] = centerx / image_size[0]
        gt[1] = centery / image_size[1]
        gt[2] = w
        gt[3] = h
        gt[4] = math.atan2((gtbox[2, 1] - gtbox[1, 1]),
                           (gtbox[2, 0] - gtbox[1, 0]))

        if False:  # debug visualization
            draw_box_points(crop_img, np.array(gtbox, dtype="int"),
                            color=(0, 255, 0))
            gtbox2 = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                      (gt[2] * normo2, gt[3] * normo2), gt[4] * 180 / 3.14)
            gtbox2 = cv2.boxPoints(gtbox2)
            gtbox2 = np.array(gtbox2, dtype="float")
            draw_box_points(crop_img, np.array(gtbox2, dtype="int"),
                            color=(0, 255, 0))
            cv2.imshow('c2', crop_img)

        gt_out.append(gt)

    #cv2.imshow('crop_img', crop_img)
    #cv2.waitKey(0)
    return (crop_img, gt_out)
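# Minimal sketch exercising random_crop on synthetic data. The layer instance
# is mocked with a bare class (crop_ratio is the only attribute the method
# uses); the image and the single normalized ground-truth box are made up.
def _example_random_crop():
    class _FakeLayer(object):
        crop_ratio = 0.1

    img = np.zeros((480, 640), np.uint8)
    diag = math.sqrt(480 ** 2 + 640 ** 2)
    # [cx, cy, w, h, angle, valid] with cx/cy relative to the image and
    # w/h relative to its diagonal, matching the layout used above.
    gt = [[0.5, 0.5, 100.0 / diag, 30.0 / diag, 0.0, 1]]
    crop, gt_out = random_crop(_FakeLayer(), img, gt)
    print(crop.shape, len(gt_out))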
def process_splits(trans, conf, splits, norm2, ctc_f, rot_mat, boxt, draw,
                   iou, debug=False, alow_non_dict=False):
    '''
    Summary : Split the transcription and its bounding-box on the spaces
              predicted by the recognizer FCN.

    Parameters
    ----------
    trans : string
        Predicted transcription for the corresponding predicted bounding-box.
    conf : list
        Sum of confidences for all characters from the recognizer FCN, plus
        start and end positions of the transcription inside the bounding-box.
    splits : list
        Indices of the spaces predicted by the recognizer FCN.
    norm2 : matrix
        Cropped region of the original image under the bounding-box predicted
        by the localization FCN.
    ctc_f : matrix
        Output of the recognizer FCN for the given input bounding-box.
    rot_mat : matrix
        Rotation matrix returned by get_normalized_image.
    boxt : tuple of tuples
        Parameters of the bounding-box predicted by the localization FCN.
    draw : matrix
        Input image.
    debug : boolean
        If True, visualization boxes are generated.

    Returns
    -------
    boxes_out : list of tuples
        Predicted bounding-box parameters together with the predicted
        transcription and the recognizer's mean confidence score.
    '''
    spl = trans.split(" ")
    boxout = cv2.boxPoints(boxt)
    start_f = 0
    mean_conf = conf[0, 0] / len(trans)  # Overall confidence of recognizer FCN
    boxes_out = []
    for s in range(len(spl)):
        text = spl[s]
        end_f = conf[0, 2]
        if s < len(spl) - 1:
            try:
                if splits[0, s] > start_f:
                    end_f = splits[0, s]  # new end of this word's span
            except IndexError:
                pass

        # Map the [start_f, end_f] character span back to pixels in norm2.
        scalex = norm2.shape[1] / float(ctc_f.shape[0])
        poss = start_f * scalex
        pose = (end_f + 2) * scalex
        rect = [[poss, 0], [pose, 0],
                [pose, norm2.shape[0] - 1], [poss, norm2.shape[0] - 1]]
        rect = np.array(rect)
        #rect[:, 0] += boxt[0][0]
        #rect[:, 1] += boxt[0][1]

        # Undo the normalization rotation to get image coordinates.
        int_t = cv2.invertAffineTransform(rot_mat)
        dst_rect = np.copy(rect)
        dst_rect[:, 0] = int_t[0, 0] * rect[:, 0] + int_t[0, 1] * rect[:, 1] + int_t[0, 2]
        dst_rect[:, 1] = int_t[1, 0] * rect[:, 0] + int_t[1, 1] * rect[:, 1] + int_t[1, 2]

        tx = np.sum(dst_rect[:, 0]) / 4.0
        ty = np.sum(dst_rect[:, 1]) / 4.0
        br = cv2.boundingRect(boxout)
        tx += br[0]
        ty += br[1]
        twidth = (pose - poss)
        #twidth = (pose - poss) / ext_factor
        theight = norm2.shape[0]
        box_back = ((tx, ty), (twidth, theight * 0.9), boxt[2])

        if debug:
            boxout_u = cv2.boxPoints(box_back)
            vis.draw_box_points(draw, boxout_u, color=(0, 255, 0))
            cv2.imshow('draw', draw)

        if len(text.strip()) == 0:
            print("zero length text!")
            continue

        # Strip punctuation before the dictionary lookup.
        textc = text
        for ch in ".:!?,/-$'()+":
            textc = textc.replace(ch, "")
        if textc.endswith("'s"):
            textc = textc[:-2]
        is_dict = cmp_trie.is_dict(textc.encode('utf-8')) \
            or textc.isdigit() or alow_non_dict
        if len(text) > 2 and (text.isdigit() or is_dict):
            boxes_out.append((box_back, (text, mean_conf, is_dict, iou)))
        start_f = end_f + 1
    return boxes_out