def evaluate():
    for image_path in image_paths:
        # Load the image and convert OpenCV's BGR channel order to RGB
        image = cv2.imread(image_path)[:, :, ::-1]
        image_fname = osp.split(image_path)[-1]
        image_fname_noext = osp.splitext(image_fname)[0]
        result_label_fname = 'res_' + image_fname_noext + '.txt'
        result_label_path = osp.join(result_label_dir, result_label_fname)
        h, w = image.shape[:2]
        src_image = image.copy()
        image, scale, pad, window = utils.resize_and_pad_image(image, 512)
        image = utils.mold_image(image)
        image = np.expand_dims(image, axis=0)

        # Run object detection
        batch_rpn_proposals, batch_rpn_probs = model.keras_model.predict(
            [image, anchors], verbose=0)
        rpn_proposals = batch_rpn_proposals[0]
        rpn_probs = batch_rpn_probs[0]
        boxes = utils.denorm_boxes(rpn_proposals, (512, 512))
        scores = rpn_probs[..., np.newaxis]
        # keep_ix = np.where(rpn_probs > 0.7)[0]
        # boxes = boxes[keep_ix]
        # scores = rpn_probs[keep_ix]
        # for box in boxes:
        #     cv2.rectangle(src_image, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 1)
        # show_image(src_image, 'image')
        # cv2.waitKey(0)

        # Proposals come back as (y1, x1, y2, x2); reorder to (x1, y1, x2, y2)
        # for the text detector.
        detector = TextDetector()
        if inference_mode == 'rpn':
            text_boxes = detector.detect(
                boxes[:, [1, 0, 3, 2]].astype(np.float32), scores, (512, 512))
        # inference_mode == 'text'
        else:
            text_boxes = detector.detect2(
                boxes[:, [1, 0, 3, 2]].astype(np.float32), scores, (512, 512))

        # Collapse each detected quadrilateral to an axis-aligned box, then map
        # from the padded 512x512 space back to the original image.
        text_boxes = text_boxes[:, [0, 1, 4, 5]]
        # -left_pad
        text_boxes[:, [0, 2]] -= pad[1][0]
        # -top_pad
        text_boxes[:, [1, 3]] -= pad[0][0]
        text_boxes = np.round(text_boxes / scale).astype(np.int32)
        text_boxes[:, [0, 2]] = np.clip(text_boxes[:, [0, 2]], 0, w - 1)
        text_boxes[:, [1, 3]] = np.clip(text_boxes[:, [1, 3]], 0, h - 1)

        # Write one 'x1,y1,x2,y2' line per detected text box
        with open(result_label_path, 'w') as f:
            for text_box in text_boxes:
                f.write(','.join(map(str, text_box.tolist())) + '\n')
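
# A minimal, self-contained sketch (not part of the original pipeline) of the
# coordinate unmapping done at the end of evaluate(): boxes predicted in the
# padded/resized 512x512 space are shifted by the left/top padding, divided by
# the resize scale, and clipped to the original image bounds. `unmap_boxes` is
# a hypothetical helper named here for illustration only; it assumes the
# module-level numpy import and the pad layout ((top, bottom), (left, right), ...)
# returned by utils.resize_and_pad_image.
def unmap_boxes(boxes_512, scale, pad, orig_h, orig_w):
    """boxes_512: [N, (x1, y1, x2, y2)] in the padded 512x512 image.
    Returns integer boxes in original-image pixel coordinates."""
    boxes = boxes_512.astype(np.float32).copy()
    boxes[:, [0, 2]] -= pad[1][0]   # remove left padding
    boxes[:, [1, 3]] -= pad[0][0]   # remove top padding
    boxes = np.round(boxes / scale).astype(np.int32)
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, orig_w - 1)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, orig_h - 1)
    return boxes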
# image_paths = ['/home/adam/Public/test.jpg']
# image_paths = glob.glob('datasets/art/train_images/*.jpg')
# image_paths = glob.glob('/home/adam/.keras/datasets/text/ctpn/VOCdevkit/VOC2007/JPEGImages/*.jpg')
image_paths = glob.glob('/home/adam/.keras/datasets/icdar2013/focused_scene_text/task12_images/*.jpg')
for image_path in image_paths:
    image = cv2.imread(image_path)[:, :, ::-1]
    image, scale, pad, window = utils.resize_and_pad_image(image, 512)
    src_image = image.copy()
    image = utils.mold_image(image)
    image = np.expand_dims(image, axis=0)

    # Run object detection
    start = time.time()
    batch_rpn_proposals, batch_rpn_probs = model.keras_model.predict(
        [image, anchors], verbose=0)
    rpn_proposals = batch_rpn_proposals[0]
    rpn_probs = batch_rpn_probs[0]
    boxes = utils.denorm_boxes(rpn_proposals, (512, 512))
    scores = rpn_probs[..., np.newaxis]
    # keep_ix = np.where(rpn_probs > 0.7)[0]
    # boxes = boxes[keep_ix]
    # scores = rpn_probs[keep_ix]
    # for box in boxes:
    #     cv2.rectangle(src_image, (box[1], box[0]), (box[3], box[2]), (0, 255, 0), 1)
    # show_image(src_image, 'image')
    # cv2.waitKey(0)
    detector = TextDetector()
    if inference_mode == 'rpn':
        text_boxes = detector.detect(
            boxes[:, [1, 0, 3, 2]].astype(np.float32), scores, (512, 512))
    # inference_mode == 'text'
    else:
        text_boxes = detector.detect2(
            boxes[:, [1, 0, 3, 2]].astype(np.float32), scores, (512, 512))
    end = time.time()
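    # A hedged usage sketch continuing the loop above: report the latency
    # captured by the start/end timestamps and draw the detections for a quick
    # visual check. The (x1, y1, x2, y2) column selection [0, 1, 4, 5] mirrors
    # evaluate() above and is an assumption here; src_image lives in the same
    # padded 512x512 space as text_boxes, so no unmapping is needed.
    print('%s took %.3f s' % (image_path, end - start))
    for box in text_boxes[:, [0, 1, 4, 5]].astype(np.int32):
        cv2.rectangle(src_image, (int(box[0]), int(box[1])),
                      (int(box[2]), int(box[3])), (0, 255, 0), 1)
    # show_image(src_image, 'detections')
    # cv2.waitKey(0)
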
def unmold_detections(detections, mrcnn_mask, original_image_shape,
                      image_shape, window):
    """Reformats the detections of one image from the format of the neural
    network output to a format suitable for use in the rest of the
    application.

    detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
    mrcnn_mask: [N, height, width, num_classes]
    original_image_shape: [H, W, C] Original image shape before resizing
    image_shape: [H, W, C] Shape of the image after resizing and padding
    window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the
            real image is excluding the padding.

    Returns:
    boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
    class_ids: [N] Integer class IDs for each bounding box
    scores: [N] Float probability scores of the class_id
    masks: [height, width, num_instances] Instance masks
    """
    # How many detections do we have?
    # Detections array is padded with zeros. Find the first class_id == 0.
    zero_ix = np.where(detections[:, 4] == 0)[0]
    N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

    # Extract boxes, class_ids, scores, and class-specific masks
    boxes = detections[:N, :4]
    class_ids = detections[:N, 4].astype(np.int32)
    scores = detections[:N, 5]
    masks = mrcnn_mask[np.arange(N), :, :, class_ids]

    # Translate normalized coordinates in the resized image to pixel
    # coordinates in the original image before resizing
    window = utils.norm_boxes(window, image_shape[:2])
    wy1, wx1, wy2, wx2 = window
    shift = np.array([wy1, wx1, wy1, wx1])
    wh = wy2 - wy1  # window height
    ww = wx2 - wx1  # window width
    scale = np.array([wh, ww, wh, ww])
    # Convert boxes to normalized coordinates on the window
    boxes = np.divide(boxes - shift, scale)
    # Convert boxes to pixel coordinates on the original image
    boxes = utils.denorm_boxes(boxes, original_image_shape[:2])

    # Filter out detections with zero area. Happens in early training when
    # network weights are still random
    exclude_ix = np.where(
        (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
    if exclude_ix.shape[0] > 0:
        boxes = np.delete(boxes, exclude_ix, axis=0)
        class_ids = np.delete(class_ids, exclude_ix, axis=0)
        scores = np.delete(scores, exclude_ix, axis=0)
        masks = np.delete(masks, exclude_ix, axis=0)
        N = class_ids.shape[0]

    # Resize masks to original image size and set boundary threshold.
    full_masks = []
    for i in range(N):
        # Convert neural network mask to full size mask
        full_mask = utils.unmold_mask(masks[i], boxes[i], original_image_shape)
        full_masks.append(full_mask)
    full_masks = np.stack(full_masks, axis=-1)\
        if full_masks else np.empty(masks.shape[1:3] + (0,))

    return boxes, class_ids, scores, full_masks
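
# A minimal worked sketch (illustrative values only, hypothetical helper not
# used elsewhere) of the window shift/scale arithmetic in unmold_detections:
# a detection that covers exactly the letterbox window should map to
# (0, 0, 1, 1) in window coordinates, i.e. the whole original image after
# denorm_boxes.
def _demo_window_unmolding():
    window = np.array([0.25, 0.0, 0.75, 1.0])  # (wy1, wx1, wy2, wx2), normalized
    wy1, wx1, wy2, wx2 = window
    shift = np.array([wy1, wx1, wy1, wx1])
    scale = np.array([wy2 - wy1, wx2 - wx1, wy2 - wy1, wx2 - wx1])
    box = window.copy()                        # a detection spanning the window
    print((box - shift) / scale)               # -> [0. 0. 1. 1.]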