Example no. 1
def get_normalized_image(img, rr, debug=False):

    box = cv2.boxPoints(rr)
    extbox = cv2.boundingRect(box)

    if extbox[2] * extbox[3] > img.shape[0] * img.shape[1]:
        print("Too big proposal: {0}x{1}".format(extbox[2], extbox[3]))
        return None, None
    extbox = [extbox[0], extbox[1], extbox[2], extbox[3]]
    extbox[2] += extbox[0]
    extbox[3] += extbox[1]
    extbox = np.array(extbox, dtype=int)

    extbox[0] = max(0, extbox[0])
    extbox[1] = max(0, extbox[1])
    extbox[2] = min(img.shape[1], extbox[2])
    extbox[3] = min(img.shape[0], extbox[3])

    tmp = img[extbox[1]:extbox[3], extbox[0]:extbox[2]]
    center = (tmp.shape[1] / 2, tmp.shape[0] / 2)
    rot_mat = cv2.getRotationMatrix2D(center, rr[2], 1)

    if tmp.shape[0] == 0 or tmp.shape[1] == 0:
        return None, rot_mat

    if debug:
        vis.draw_box_points(img,
                            np.array(extbox, dtype="int"),
                            color=(0, 255, 0))
        cv2.imshow('scaled', img)

    rot_mat[0, 2] += rr[1][0] / 2.0 - center[0]
    rot_mat[1, 2] += rr[1][1] / 2.0 - center[1]
    try:
        norm_line = cv2.warpAffine(tmp,
                                   rot_mat, (int(rr[1][0]), int(rr[1][1])),
                                   borderMode=cv2.BORDER_REPLICATE)
    except cv2.error:
        return None, rot_mat
    return norm_line, rot_mat
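
A minimal usage sketch, not from the original repository: rr follows OpenCV's rotated-rect convention ((cx, cy), (w, h), angle), e.g. as returned by cv2.minAreaRect; the image path and points below are hypothetical.

# Hedged usage sketch for get_normalized_image (assumes cv2 and numpy are available).
import cv2
import numpy as np

img = cv2.imread('page.jpg')  # hypothetical input image
pts = np.array([[120, 40], [220, 60], [210, 110], [110, 90]], dtype=np.float32)
rr = cv2.minAreaRect(pts)     # ((cx, cy), (w, h), angle)
norm_line, rot_mat = get_normalized_image(img, rr)
if norm_line is not None:
    cv2.imshow('normalized line', norm_line)
    cv2.waitKey(0)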
Example no. 2
def get_normalized_image(img, rr, debug = False):

  box = cv2.boxPoints(rr)
  extbox = cv2.boundingRect(box)

  if extbox[2] *  extbox[3] > img.shape[0] * img.shape[1]:
    print("Too big proposal: {0}x{1}".format(extbox[2], extbox[3]))
    return None, None
  extbox = [extbox[0], extbox[1], extbox[2], extbox[3]]
  extbox[2] += extbox[0]
  extbox[3] += extbox[1]
  extbox = np.array(extbox, dtype=int)
  
  extbox[0] = max(0, extbox[0])
  extbox[1] = max(0, extbox[1])
  extbox[2] = min(img.shape[1], extbox[2])
  extbox[3] = min(img.shape[0], extbox[3])
  
  tmp = img[extbox[1]:extbox[3], extbox[0]:extbox[2]]
  center = (tmp.shape[1] / 2,  tmp.shape[0] / 2)
  rot_mat = cv2.getRotationMatrix2D( center, rr[2], 1 )
  
  if tmp.shape[0] == 0 or tmp.shape[1] == 0:
    return None, rot_mat
  
  if debug:
    vis.draw_box_points(img,  np.array(extbox, dtype="int"), color = (0, 255, 0))
    cv2.imshow('scaled', img)

  rot_mat[0,2] += rr[1][0] /2.0 - center[0]
  rot_mat[1,2] += rr[1][1] /2.0 - center[1]
  try:
    norm_line = cv2.warpAffine( tmp, rot_mat, (int(rr[1][0]), int(rr[1][1])), borderMode=cv2.BORDER_REPLICATE )
  except cv2.error:
    return None, rot_mat
  return norm_line, rot_mat
Example no. 3
def froward_image(nets, scaled, original):
    global rec_t, ext_factor, ext_factorx

    net, net_ctc = nets
    print("nets:", nets)
    print("net: ", net)
    print("net_ctc :", net_ctc)

    img = [scaled]
    draw = img[0]

    imgo = original

    im = np.asarray(img, dtype=np.float)
    im = im / 128.0
    im = im - 1.0
    # im = im.reshape((3, im.shape[0], im.shape[1]))
    im = np.swapaxes(im, 1, 3)
    im = np.swapaxes(im, 2, 3)

    net.blobs['data'].reshape(im.shape[0], im.shape[1], im.shape[2],
                              im.shape[3])
    print("net.blobs['data'] :", net.blobs['data'])
    net.blobs['data'].data[...] = im
    print("im: ", im)
    net.reshape()
    print("net.reshape(): ", net.reshape())
    start = time.time()
    out = net.forward(start="conv1")
    end = time.time()
    seconds = end - start
    fps = 1 / seconds
    # print("loc fps:{0}".format(fps))

    boxes = out['boxes']
    print("boxes: ", boxes)
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms, :]
    print("boxes before boxes_count: ", boxes)
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.1:
            break
        boxes_count += 1

    detections_out = []

    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)
        print("boxr : this is r box,", boxr)
        box = cv2.boxPoints(boxr)
        print("box : sfter detection count,", box)

        box = np.array(box, dtype="int")
        vis.draw_box_points(draw, box, (255, 0, 0))
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]

        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(img[0], original, boxro)
        print("boxt 1 :", boxt)
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])
        print("boxt 2 :", boxt)

        norm2, rot_mat = get_normalized_image(original, boxt)
        if norm2 is None:
            continue

        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)
        print("Given Norm :", norm)

        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
            if best_diff > abs(width - buckets[b]):
                best_diff = abs(width - buckets[b])
                bestb = b

        scaled = cv2.resize(norm, (buckets[bestb], 32))

        cv2.imshow('norm2', scaled)

        imtf = np.asarray([scaled], dtype=np.float)
        delta = imtf.max() - imtf.min()
        imtf /= (delta / 2)
        imtf -= imtf.mean()
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        outctc = net_ctc.forward()
        print("outctc : ", outctc)
        ctc_f = outctc['softmax']
        print("ctc_f : ", ctc_f)

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(2)
        mask = labels > 2
        masked = ctc_f.max(2)[mask]
        mean_conf = np.sum(masked) / masked.shape[0]
        print("mean_conf : ", mean_conf)

        if mean_conf < 0.2:
            vis.draw_box_points(scaled, box, color=(0, 0, 0))
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))

        if len(det_text) == 0:
            continue

        if len(det_text) < 3 and mean_conf < 0.8:
            continue
        print("detections_out: ", detections_out)
        detections_out.append((boxt, (det_text, mean_conf, int(det_word[6]))))
        continue  # NOTE: this early continue makes the split-processing and dictionary-decoding code below unreachable

        splits_raw = process_splits(det_text,
                                    conf,
                                    dec_s,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    original,
                                    0,
                                    mean_conf,
                                    alow_non_dict=True)
        detections_out.extend(splits_raw)
        continue

        if out_raw is not None:
            out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format( \
                'vid', box[0, 0], box[0, 1], box[1, 0], box[1, 1], \
                box[2, 0], box[2, 1], box[3, 0], box[3, 1], det_text, det_text, mean_conf).encode('utf8'))

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])

        if len(best_dict) == 0:
            continue
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f,
                                    rot_mat, boxt, original, 1, mean_conf)
        detections_out.extend(splits_out)

    return detections_out, fps
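
The recognizer input height is fixed at 32 px and the crop width is snapped to the closest entry of the global buckets list; the same selection step as a standalone sketch (buckets is assumed to be a list of candidate widths):

# Sketch of the width-bucket selection used above; buckets is assumed to be a
# list of candidate widths for the 32-px-high recognizer input.
def pick_bucket(norm, buckets):
    width = norm.shape[1] * (32.0 / norm.shape[0])
    best_diff, bestb = width, 0
    for b, bucket_w in enumerate(buckets):
        if abs(width - bucket_w) < best_diff:
            best_diff = abs(width - bucket_w)
            bestb = b
    return bestb

# usage: scaled = cv2.resize(norm, (buckets[pick_bucket(norm, buckets)], 32))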
Example no. 4
def test_video(nets):
    global rec_t, image_size
    cap = cv2.VideoCapture(
        '/mnt/textspotter/evaluation-sets/icdar2013-video-Test/Video_35_2_3.mp4'
    )
    # the file capture above is immediately replaced by a live-camera capture
    cap = cv2.VideoCapture(-1)
    font = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/UbuntuMono-R.ttf", 16)
    font2 = ImageFont.truetype(
        "/usr/share/fonts/truetype/ubuntu-font-family/Ubuntu-B.ttf", 18)

    ret, im = cap.read()
    fourcc = cv2.VideoWriter_fourcc(*'X264')
    out = cv2.VideoWriter('/tmp/output.avi', fourcc, 20.0,
                          (im.shape[1], im.shape[0]))

    frame_no = 0
    while ret:
        image_size = [640 // 64 * 64, 480 // 64 * 64]  # snap dimensions to a multiple of 64
        ret, im = cap.read()

        if ret:

            scaled = cv2.resize(im, (image_size[0], image_size[1]))
            if nets[0].blobs['data'].data[...].shape[1] == 1:
                scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))

            detections_out, fps = froward_image(nets, scaled, im)

            img = Image.fromarray(im)
            draw = ImageDraw.Draw(img)

            for detection in detections_out:
                text = detection[1][0]
                print(text)
                width, height = draw.textsize(text, font=font)
                center = [
                    detection[0][0][0] - width / 2, detection[0][0][1] - 10
                ]

                sx = int(detection[0][0][0] - width / 2)
                ex = int(detection[0][0][0] + width / 2)
                sy = int(detection[0][0][1] - 10)
                ey = int(detection[0][0][1] + 10)

                im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2

                boxr = ((detection[0][0][0], detection[0][0][1]),
                        (detection[0][1][0],
                         detection[0][1][1]), detection[0][2])
                box = cv2.boxPoints(boxr)
                color = (0, 255, 0)
                vis.draw_box_points(im, box, color, thickness=1)

            img = Image.fromarray(im)
            draw = ImageDraw.Draw(img)

            draw.text((10, 10),
                      'FPS: {0:.2f}'.format(fps), (0, 255, 0),
                      font=font2)
            frame_no += 1

            if frame_no < 30:
                draw.text((image_size[1] / 2 - 150, image_size[0] / 2 - 100),
                          'Raw Detections with Dictionary', (0, 0, 255),
                          font=font2)

            for detection in detections_out:
                text = detection[1][0]
                width, height = draw.textsize(text, font=font)
                center = [
                    detection[0][0][0] - width / 2, detection[0][0][1] - 10
                ]
                draw.text((center[0], center[1]),
                          text,
                          fill=(0, 255, 0),
                          font=font)

            pix = np.array(img)

            cv2.imshow('draw', scaled)
            #
            if pix.shape[0] > 1024:
                pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))
            cv2.imshow('pix', pix)

            out.write(pix)
            cv2.waitKey(10)

    out.release()
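
The overlay logic above converts between numpy arrays (for cv2) and PIL images (for TrueType text rendering); a minimal round-trip sketch, with the font path as an assumption:

# Sketch of the numpy <-> PIL round trip used for the text overlays above.
import numpy as np
from PIL import Image, ImageDraw, ImageFont

def overlay_text(frame, text, xy,
                 font_path="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"):  # assumed font path
    img = Image.fromarray(frame)          # numpy (BGR) array -> PIL image
    draw = ImageDraw.Draw(img)
    draw.text(xy, text, fill=(0, 255, 0), font=ImageFont.truetype(font_path, 16))
    return np.array(img)                  # back to a numpy array for cv2 / VideoWriter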
Example no. 5
def test_image(nets):
    img_dir = "src/images"
    #img_dir = "test"

    imgs = [os.path.join(img_dir, f) for f in os.listdir(img_dir)]
    for img in imgs:
        im = cv2.imread(img)
        global rec_t, image_size
        font = ImageFont.truetype(
            "/usr/share/fonts/truetype/ubuntu-font-family/UbuntuMono-R.ttf",
            16)
        font2 = ImageFont.truetype(
            "/usr/share/fonts/truetype/ubuntu-font-family/Ubuntu-B.ttf", 18)
        image_size = [640 // 64 * 64, 480 // 64 * 64]  # snap dimensions to a multiple of 64

        scaled = cv2.resize(im, (image_size[0], image_size[1]))
        if nets[0].blobs['data'].data[...].shape[1] == 1:
            scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
            scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))

        detections_out, fps = froward_image(nets, scaled, im)

        img = Image.fromarray(im)
        draw = ImageDraw.Draw(img)

        text_list = list()
        for detection in detections_out:
            print("Detection", detection)
            text = detection[1][0]
            text = text.encode('ascii', 'ignore')
            text_list.append(text)
            print(text_list)

            width, height = draw.textsize(text, font=font)
            center = [detection[0][0][0] - width / 2, detection[0][0][1] - 10]

            sx = int(detection[0][0][0] - width / 2)
            ex = int(detection[0][0][0] + width / 2)
            sy = int(detection[0][0][1] - 10)
            ey = int(detection[0][0][1] + 10)

            im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2

            boxr = ((detection[0][0][0], detection[0][0][1]),
                    (detection[0][1][0], detection[0][1][1]), detection[0][2])
            box = cv2.boxPoints(boxr)
            color = (0, 255, 0)
            vis.draw_box_points(im, box, color, thickness=1)

        img = Image.fromarray(im)
        draw = ImageDraw.Draw(img)

        draw.text((10, 10),
                  'FPS: {0:.2f}'.format(fps), (0, 255, 0),
                  font=font2)

        draw.text((image_size[1] / 2 - 150, image_size[0] / 2 - 100),
                  'Raw Detections with Dictionary', (0, 0, 255),
                  font=font2)
        print('text detections:', text_list)

        print(text_list[0])
        center_list = list()
        center_text = list()
        for detection in detections_out:
            text = detection[1][0]

            width, height = draw.textsize(text, font=font)
            center = [detection[0][0][0] - width / 2, detection[0][0][1] - 10]
            print("center", center)
            center_list.append(center)
            print("text: ", text)
            print("center list", center_list)

            draw.text((center[0], center[1]),
                      text,
                      fill=(0, 255, 0),
                      font=font)
        sorted_center = sorted(center_list, key=lambda x: (x[0], x[1]))
        print("sorted_center", sorted_center)
        pix = np.array(img)

        cv2.imshow('draw', scaled)
        #
        if pix.shape[0] > 1024:
            pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))
        cv2.imshow('pix', pix)
        cv2.waitKey(0)
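
The loop above computes sorted_center but never keeps the transcriptions attached to their centres; a sketch of one way to pair them before sorting (an assumption about the intended reading order):

# Sketch (assumed intent): sort transcriptions by their box centres,
# reusing the detection layout (boxt, (text, conf, ...)) produced above.
def reading_order(detections_out):
    items = [((d[0][0][0], d[0][0][1]), d[1][0]) for d in detections_out]
    items.sort(key=lambda it: (it[0][0], it[0][1]))  # same key as sorted_center above
    return [text for _center, text in items]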
Example no. 6
def process_batch(nets, optim, optim2, image_size, args):
    global it, mean_loss, mean_rec
    it += 1  # increment the iteration counter

    net, net_ctc = nets

    net = net.net
    net_ctc = net_ctc.net

    net.blobs['data'].reshape(args.batch_size, 1, image_size[1],
                              image_size[0])  # reshape the input blob for one batch of images
    net.reshape()

    optim2.step(1)

    im = net.blobs['data'].data[...]  # shape [batch_size,1,416,416]
    draw = np.swapaxes(im, 2, 3)
    draw = np.swapaxes(draw, 1, 3)
    im_ctc = np.copy(draw)
    draw += 1
    draw *= 128
    draw = np.array(draw, dtype="uint8").copy()

    if args.debug:
        grid_step = 16
        line = 0
        while line < image_size[0]:
            cv2.line(draw[0], (0, line), (image_size[1], line),
                     (128, 128, 128))
            line += grid_step

    boxes = net.blobs['boxes'].data[...]  # shape (4, 1, 500, 15)

    word_gtob = net.blobs['gt_boxes'].data[...]  # shape  (4, 6, 1, 6)
    word_txt = net.blobs['gt_labels'].data[...]  # shape (4, 6, 1, 14)

    lines_gtob = net.blobs['line_boxes'].data[...]  # shape (4, 1, 1, 5)
    lines_txt = net.blobs['line_labels'].data[...]  # shape (4, 1, 1, 7)

    #nms = boxeso[:, 0, 0, 8] == 0
    #boxes = boxes[:, :, nms, :]

    boxes[:, 0, :, 0] *= image_size[0]
    boxes[:, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[:, 0, :, 2] *= normFactor
    boxes[:, 0, :, 3] *= normFactor

    sum_cost = 0
    count = 0

    labels_gt = []
    labels_det = []

    gt_to_detection = {}
    net_ctc.clear_param_diffs()

    batch_buckets = []
    dummy = {}

    matched_detections = 0
    for bid in range(im.shape[0]):  # iterate over every sample in the batch

        o_image = net.layers[0].get_image_file_name(bid)
        o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
        cx = net.layers[0].get_crop(bid, 0)
        cy = net.layers[0].get_crop(bid, 1)
        cmx = net.layers[0].get_crop(bid, 2)
        cmy = net.layers[0].get_crop(bid, 3)
        o_image = o_image[cy:cmy, cx:cmx]

        boxes_count = 0
        for i in range(0, boxes.shape[2]):
            det_word = boxes[bid, 0, i]
            if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
                break
            boxes_count += 1

        x = [i for i in range(boxes_count)]
        #random.shuffle(x)

        bucket_images = {}
        batch_buckets.append(bucket_images)

        word_gto = word_gtob[bid]
        word_gto_txt = word_txt[bid]
        gt_count = 0
        for gt_no in range(word_gto.shape[0]):
            gt = word_gto[gt_no, :]
            gt = gt.reshape(6)
            gtnum = 1000 * bid + gt_no

            if gt[5] == -1:
                #print("ignore gt!")
                continue

            gt_count += 1

            txt = word_gto_txt[gt_no, :]
            gtbox = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                     (gt[2] * normFactor,
                      gt[3] * normFactor), gt[4] * 180 / 3.14)
            gtbox = cv2.boxPoints(gtbox)

            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)

            if rect_gt[0] == 0 or rect_gt[
                    1] == 0 or rect_gt[0] + rect_gt[2] >= image_size[
                        0] or rect_gt[1] + rect_gt[3] >= image_size[1]:
                continue

            if gt[3] * normFactor < 3:
                if args.debug:
                    print('too small gt!')
                continue

            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]
            rect_gt[3] += rect_gt[1]

            for i in range(0, min(100, boxes_count)):
                det_word = boxes[bid, 0, x[i], :]

                # compare angles only after fetching the current detection
                if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
                    continue

                if (det_word[0] == 0
                        and det_word[1] == 0) or det_word[5] < 0.01:
                    break

                box = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                       det_word[4] * 180 / 3.14)
                box = cv2.boxPoints(box)

                if args.debug:
                    boxp = np.array(box, dtype="int")
                    vis.draw_box_points(draw[bid], boxp, color=(0, 255, 0))

                box = np.array(box, dtype="int")
                bbox = cv2.boundingRect(box)
                bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
                bbox[2] += bbox[0]
                bbox[3] += bbox[1]

                #rectangle intersection ...
                inter = intersect(bbox, rect_gt)
                uni = union(bbox, rect_gt)
                ratio = area(inter) / float(area(uni))

                ratio_gt = area(inter) / float(area(rect_gt))
                if ratio_gt < 0.95:
                    continue

                if ratio < 0.5:
                    continue

                if gtnum not in gt_to_detection:
                    gt_to_detection[gtnum] = [0, 0, 0]
                tupl = gt_to_detection[gtnum]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = x[i]
                    tupl[2] = ratio_gt

                det_word = boxes[bid, 0, x[i], :]
                box = ([det_word[0],
                        det_word[1]], [det_word[2],
                                       det_word[3]], det_word[4] * 180 / 3.14)

                boxO = get_obox(im_ctc[bid], o_image, box)
                boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]),
                        boxO[2])
                norm2, rot_mat = get_normalized_image(o_image, boxO)
                #norm3, rot_mat = get_normalized_image(im_ctc[bid], ([det_word[0], det_word[1]], [det_word[2] * 1.2, det_word[3] * 1.1], det_word[4] * 180 / 3.14))
                if norm2 is None:
                    continue
                #if norm3 is None:
                #  continue
                #continue
                #cv2.imshow('ts', norm2)
                #cv2.imshow('ts3', norm3)
                #cv2.waitKey(1)
                width_scale = 32.0 / norm2.shape[0]
                width = norm2.shape[1] * width_scale
                best_diff = width
                bestb = 0
                for b in range(0, len(buckets)):
                    if best_diff > abs(width * 1.3 - buckets[b]):
                        best_diff = abs(width * 1.3 - buckets[b])
                        bestb = b

                scaled = cv2.resize(norm2, (buckets[bestb], 32))
                scaled = np.asarray(scaled, dtype=np.float)
                delta = scaled.max() - scaled.min()
                scaled = (scaled) / (delta / 2)
                scaled -= scaled.mean()

                if bestb not in bucket_images:
                    bucket_images[bestb] = {}
                    bucket_images[bestb]['img'] = []
                    bucket_images[bestb]['sizes'] = []
                    bucket_images[bestb]['txt'] = []
                    bucket_images[bestb]['gt_enc'] = []
                    dummy[bestb] = 1
                else:
                    if args.debug and len(bucket_images[bestb]) > 4:
                        continue
                    elif len(bucket_images[bestb]) > 32:
                        continue

                gt_labels = []
                txt_enc = ''
                for k in range(txt.shape[1]):
                    if txt[0, k] > 0:
                        if txt[0, k] in codec_rev:
                            gt_labels.append(codec_rev[txt[0, k]])
                        else:
                            gt_labels.append(3)

                        txt_enc += unichr(txt[0, k])
                    else:
                        gt_labels.append(0)

                if scaled.ndim == 3:
                    scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
                if args.debug:
                    cv2.imshow('scaled', scaled)
                bucket_images[bestb]['sizes'].append(len(gt_labels))
                bucket_images[bestb]['gt_enc'].append(gt_labels)
                bucket_images[bestb]['txt'].append(txt_enc)
                bucket_images[bestb]['img'].append(scaled)
                matched_detections += 1

    #and learn OCR
    for bucket in bucket_images.keys():

        imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float)
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))
        #imtf = imtf.reshape((imtf.shape[0], imtf.shape[1], imtf.shape[2], 1))
        #imtf = np.swapaxes(imtf,1,3)

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        labels = bucket_images[bucket]['gt_enc']
        txt = bucket_images[bucket]['txt']

        max_len = 0
        for l in range(0, len(labels)):
            max_len = max(max_len, len(labels[l]))
        for l in range(0, len(labels)):
            while len(labels[l]) < max_len:
                labels[l].append(0)

        labels = np.asarray(labels, np.float)

        net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])

        net_ctc.blobs['label'].data[...] = labels

        if args.debug:
            vis.vis_square(imtf[0])
            cv2.imshow('draw', draw[0])
            cv2.waitKey(5)

        #optim.step(1)
        sum_cost += net_ctc.blobs['loss'].data[...]
        if net_ctc.blobs['loss'].data[...] > 10:
            #vis.vis_square(imtf[0])
            #cv2.imshow('draw', draw[0])
            sf = net_ctc.blobs['transpose'].data[...]
            labels2 = sf.argmax(3)
            out = utils.print_seq(labels2[:, 0, :])
            print(u'{0} --- {1}'.format(out, txt[0]))
            #cv2.waitKey(5)

        count += imtf.shape[0]

    correct_cout = 0
    for i in range(len(labels_gt)):
        det_text = labels_det[i]
        gt_text = labels_gt[i]

        if it % 100 == 0:
            pass
            #print( u"{0} -- {1}".format(det_text, gt_text).encode('utf8') )
        if det_text == gt_text:
            correct_cout += 1

    count = max(count, 1)
    mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
    mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(
        max(1, len(labels_gt)))

    #count detection ratio

    tp = 0
    for bid in range(im.shape[0]):
        word_gto = word_gtob[bid]
        for gt_no in range(len(word_gto)):
            gt = word_gto[gt_no]
            gtnum = 1000 * bid + gt_no
            if gtnum in gt_to_detection:
                tupl = gt_to_detection[gtnum]
                if tupl[0] > 0.5:
                    tp += 1

    loc_recall = tp / float(max(1, gt_count))

    if it % 10 == 0:
        print(
            '{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'
            .format(it, 0.0001, sum_cost / count, mean_loss,
                    correct_cout / float(max(1, len(labels_gt))), mean_rec,
                    loc_recall, matched_detections))

    if it % snapshot_interval == 0:
        #optim.snapshot()
        optim2.snapshot()
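
The detection-to-GT matching above relies on intersect, union and area helpers that are not part of this excerpt; a compatible sketch for boxes given as [x1, y1, x2, y2] (an assumption based on how bbox and rect_gt are built above; the repository's actual union may instead compute area(a) + area(b) - area(inter)):

# Assumed helpers for the IoU test above; boxes are [x1, y1, x2, y2].
def intersect(a, b):
    return [max(a[0], b[0]), max(a[1], b[1]), min(a[2], b[2]), min(a[3], b[3])]

def area(box):
    w, h = box[2] - box[0], box[3] - box[1]
    return w * h if w > 0 and h > 0 else 0

def union(a, b):
    # enclosing rectangle; the original helper may compute the true set union instead
    return [min(a[0], b[0]), min(a[1], b[1]), max(a[2], b[2]), max(a[3], b[3])]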
Example no. 7
def process_batch(nets, optim, optim2, image_size, args):
  global it, mean_loss, mean_rec
  
  net, net_ctc = nets
  
  net = net.net
  net_ctc = net_ctc.net
  
  
  net.blobs['data'].reshape(args.batch_size,1,image_size[1],image_size[0])
  net.reshape()
      
  it += 1 
  
  optim2.step(1)
  
  im = net.blobs['data'].data[...]
  draw = np.swapaxes(im,2,3)
  draw = np.swapaxes(draw,1,3)
  im_ctc = np.copy(draw)
  draw += 1
  draw *= 128
  draw = np.array(draw, dtype="uint8").copy() 
  
  
  if args.debug:
    grid_step = 16
    line = 0
    while line < image_size[0]:
      cv2.line(draw[0], (0, line), (image_size[1], line), (128, 128, 128))
      line += grid_step
  
  boxes  =  net.blobs['boxes'].data[...]
                 
  word_gtob = net.blobs['gt_boxes'].data[...]
  word_txt = net.blobs['gt_labels'].data[...]
  
  lines_gtob = net.blobs['line_boxes'].data[...]
  lines_txt = net.blobs['line_labels'].data[...]
  
  #nms = boxeso[:, 0, 0, 8] == 0
  #boxes = boxes[:, :, nms, :]
  
  boxes[:, 0, :, 0] *= image_size[0]
  boxes[:, 0, :, 1] *= image_size[1]
  normFactor = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0])
  boxes[:, 0, :, 2] *= normFactor
  boxes[:, 0, :, 3] *= normFactor
  
  sum_cost = 0
  count = 0
  
  labels_gt = []
  labels_det = []
  
  gt_to_detection = {}
  net_ctc.clear_param_diffs()
  
  
  batch_buckets = []    
  dummy = {} 
  
  matched_detections = 0
  for bid in range(im.shape[0]):
    
    o_image = net.layers[0].get_image_file_name(bid)
    o_image = cv2.imread(o_image, cv2.IMREAD_GRAYSCALE)
    cx = net.layers[0].get_crop(bid, 0)
    cy = net.layers[0].get_crop(bid, 1)
    cmx = net.layers[0].get_crop(bid, 2)
    cmy = net.layers[0].get_crop(bid, 3)
    o_image = o_image[cy:cmy, cx:cmx]
    
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
      det_word = boxes[bid, 0, i]
      if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
          break
      boxes_count += 1
        
    x = [i for i in range(boxes_count)]
    #random.shuffle(x)
    
    bucket_images = {}
    batch_buckets.append(bucket_images)
    
    word_gto = word_gtob[bid]
    word_gto_txt = word_txt[bid]
    gt_count = 0 
    for gt_no in range(word_gto.shape[0]):
      gt = word_gto[gt_no, :]
      gt = gt.reshape(6)
      gtnum = 1000 * bid +  gt_no
      
      if gt[5] == -1:
        #print("ignore gt!")
        continue
      
      gt_count += 1
                  
      txt = word_gto_txt[gt_no, :]
      gtbox  = ((gt[0] * image_size[0], gt[1] * image_size[1]), (gt[2] * normFactor, gt[3] * normFactor), gt[4] * 180 / 3.14)
      gtbox = cv2.boxPoints(gtbox)
      
      gtbox = np.array(gtbox, dtype="int")
      rect_gt = cv2.boundingRect(gtbox)

      if rect_gt[0] == 0 or rect_gt[1] == 0 or  rect_gt[0] + rect_gt[2]  >= image_size[0] or rect_gt[1] + rect_gt[3]  >= image_size[1]:
        continue
      
      if gt[3] * normFactor <  3:
        if args.debug:
          #print('too small gt!')
          vis.draw_box_points(draw[bid], gtbox, color = (255, 255, 0))
          cv2.imshow('draw', draw[bid])
        continue
        
      if args.debug:
        vis.draw_box_points(draw[bid], gtbox, color = (0, 0, 0), thickness=2)
      
      #vis.draw_box_points(draw[bid], gtbox, color = (255, 255, 255))
      #cv2.imshow('draw', draw[bid])
      
      rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
      rect_gt[2] += rect_gt[0]
      rect_gt[3] += rect_gt[1]

      for i in range(0, min(100, boxes_count)):
        det_word = boxes[bid, 0, x[i], :]

        # compare angles only after fetching the current detection
        if math.fabs(gt[4] - det_word[4]) > math.pi / 16:
          continue
        
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.01:
          break
        
        box  = ((det_word[0], det_word[1]), (det_word[2], det_word[3]), det_word[4] * 180 / 3.14)
        box = cv2.boxPoints(box)
        
        if args.debug:
          boxp = np.array(box, dtype="int")
          vis.draw_box_points(draw[bid], boxp, color = (0, 255, 0))
        
        box = np.array(box, dtype="int")
        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]
   
        #rectangle intersection ... 
        inter = intersect(bbox, rect_gt)
        uni = union(bbox, rect_gt)
        ratio = area(inter) / float(area(uni))
        
        ratio_gt = area(inter) / float(area(rect_gt))
        if ratio_gt < 0.95:
          continue 
        
        if ratio < 0.5:
          continue
        
        if gtnum not in gt_to_detection:
            gt_to_detection[gtnum] = [0, 0, 0]
        tupl = gt_to_detection[gtnum] 
        if tupl[0] < ratio:
          tupl[0] = ratio 
          tupl[1] = x[i]  
          tupl[2] = ratio_gt       
        
        det_word = boxes[bid, 0, x[i], :]
        box  = ([det_word[0], det_word[1]], [det_word[2], det_word[3]], det_word[4] * 180 / 3.14)
        
        boxO = get_obox(im_ctc[bid], o_image, box)
        boxO = ((boxO[0][0], boxO[0][1]), (boxO[1][0], boxO[1][1]), boxO[2])
        norm2, rot_mat = get_normalized_image(o_image, boxO)
        #norm3, rot_mat = get_normalized_image(im_ctc[bid], ([det_word[0], det_word[1]], [det_word[2] * 1.2, det_word[3] * 1.1], det_word[4] * 180 / 3.14))
        if norm2 is None:
          continue
        #if norm3 is None:
        #  continue
        #continue
        #cv2.imshow('ts', norm2)
        #cv2.imshow('ts3', norm3)
        #cv2.waitKey(1)
        width_scale = 32.0 / norm2.shape[0]
        width = norm2.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
          if best_diff > abs(width * 1.3 - buckets[b]):
            best_diff = abs(width * 1.3 - buckets[b])
            bestb = b
        
        scaled = cv2.resize(norm2, (buckets[bestb], 32))  
        scaled = np.asarray(scaled, dtype=np.float)
        delta = scaled.max() - scaled.min()
        scaled = (scaled) / (delta / 2)
        scaled -= scaled.mean()
                
        if bestb not in bucket_images:
          bucket_images[bestb] = {}
          bucket_images[bestb]['img'] = []  
          bucket_images[bestb]['sizes'] = []    
          bucket_images[bestb]['txt'] = []
          bucket_images[bestb]['gt_enc'] = []
          dummy[bestb] = 1
        else:
          if args.debug and len(bucket_images[bestb]) > 4:
            continue    
          elif  len(bucket_images[bestb]) > 32:
            continue
        
        gt_labels = []
        txt_enc = ''
        for k in range(txt.shape[1]):
          if txt[0, k] > 0:
            if txt[0, k] in codec_rev:
              gt_labels.append( codec_rev[txt[0, k]] )
            else:
              gt_labels.append( 3 )
                              
            txt_enc += unichr(txt[0, k])
          else:
            gt_labels.append( 0 )
        
        if scaled.ndim == 3:
          scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        if args.debug:
          cv2.imshow('scaled', scaled)
        bucket_images[bestb]['sizes'].append(len(gt_labels))
        bucket_images[bestb]['gt_enc'].append(gt_labels)
        bucket_images[bestb]['txt'].append(txt_enc)
        bucket_images[bestb]['img'].append(scaled)
        matched_detections += 1   
      
  #and learn OCR
  for bucket in bucket_images.keys():
      
    imtf = np.asarray(bucket_images[bucket]['img'], dtype=np.float)
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))    
    #imtf = imtf.reshape((imtf.shape[0], imtf.shape[1], imtf.shape[2], 1))
    #imtf = np.swapaxes(imtf,1,3)
    
    
    net_ctc.blobs['data'].reshape(imtf.shape[0],imtf.shape[1],imtf.shape[2], imtf.shape[3]) 
    net_ctc.blobs['data'].data[...] = imtf
    
    labels = bucket_images[bucket]['gt_enc']
    txt = bucket_images[bucket]['txt']
    
    max_len = 0
    for l in range(0, len(labels)):
      max_len = max(max_len, len(labels[l]))
    for l in range(0, len(labels)):
      while len(labels[l]) <  max_len:
        labels[l].append(0)
      
    
    labels = np.asarray(labels, np.float)
    
    net_ctc.blobs['label'].reshape(labels.shape[0], labels.shape[1])
    
    net_ctc.blobs['label'].data[...] = labels
    
    if args.debug:
        vis.vis_square(imtf[0])
        cv2.imshow('draw', draw[0])
        cv2.waitKey(5)
         
     
    optim.step(1)  
    sum_cost += net_ctc.blobs['loss'].data[...]
    if net_ctc.blobs['loss'].data[...] > 10:
      vis.vis_square(imtf[0])
      cv2.imshow('draw', draw[0])
      sf = net_ctc.blobs['transpose'].data[...]
      labels2 = sf.argmax(3)
      out = utils.print_seq(labels2[:, 0, :])
      print(u'{0} - {1}'.format(out, txt[0])  )
      cv2.waitKey(5)
          
          
    count += imtf.shape[0]
              
  correct_cout = 0    
  for i in range(len(labels_gt)):
    det_text = labels_det[i]
    gt_text = labels_gt[i]
    
    if it % 100 == 0:
      print( u"{0} - {1}".format(det_text, gt_text).encode('utf8') )
    if det_text == gt_text:
      correct_cout += 1
      
  count = max(count, 1)    
  mean_loss = 0.99 * mean_loss + 0.01 * sum_cost / count
  mean_rec = mean_rec * 0.99 + 0.01 * correct_cout / float(max(1, len(labels_gt)))
  
  #count detection ratio

  tp = 0
  for bid in range(im.shape[0]):
    word_gto = word_gtob[bid]
    for gt_no in range(len(word_gto)):
      gt = word_gto[gt_no]
      gtnum = 1000 * bid +  gt_no
      if gtnum in gt_to_detection:
        tupl = gt_to_detection[gtnum] 
        if tupl[0] > 0.5:
          tp += 1
          
                      
  loc_recall = tp / float(max(1, gt_count))             
  if args.debug:
    cv2.imshow('draw', draw[0])
    if im.shape[0] > 1:
        cv2.imshow('draw2', draw[1])
        
    cv2.waitKey(10)
  
  if it % 10 == 0:
    print('{0} - lr:{1:.3e} ctc:{2:.4f}/{3:.4f} wr:{4:.2f}/{5:.2f}, loc:{6:.2f} {7}'.format(it, 0.0001, sum_cost / count, mean_loss, correct_cout / float(max(1, len(labels_gt))), mean_rec, loc_recall, matched_detections))
  
  if it % 1000 == 0:
    optim.snapshot()
    optim2.snapshot()
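
Before the 'label' blob is filled, every ground-truth sequence in a bucket is right-padded with zeros up to the bucket's longest sequence (the max_len loop above); the same step as a standalone sketch:

# Sketch of the zero-padding applied to the per-bucket label sequences above.
import numpy as np

def pad_labels(label_lists):
    max_len = max(len(l) for l in label_lists) if label_lists else 0
    padded = [l + [0] * (max_len - len(l)) for l in label_lists]
    return np.asarray(padded, dtype=np.float32)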
Example no. 8
def evaluate_image(batch,
                   detections,
                   word_gto,
                   iou_th=0.3,
                   iou_th_vis=0.5,
                   iou_th_eval=0.4):
    '''
  Summary : Returns end-to-end true-positives, detection true-positives, number of GT to be considered for eval (len > 2).
  Description : For each predicted bounding-box, a comparison is made with each GT entry. The numbers of end-to-end true
                positives, detection true positives, and GT entries to be considered for evaluation are computed.
  
  Parameters
  ----------
  iou_th_eval : float
      Threshold value of intersection-over-union used for evaluation of predicted bounding-boxes
  iou_th_vis : float
      Threshold value of intersection-over-union used for visualization when the transcription is correct but the IoU is lower.
  iou_th : float
      Threshold value of intersection-over-union between GT and prediction.
  word_gto : list of lists
      List of ground-truth bounding boxes along with transcription.
  batch : list of lists
      List containing data (input image, image file name, ground truth).
  detections : tuple of tuples
      Tuple of predicted bounding boxes along with transcriptions and text/no-text score.
  
  Returns
  -------
  tp : int
      Number of predicted bounding-boxes having IoU with GT greater than iou_th_eval.
  tp_e2e : int
      Number of predicted bounding-boxes having the same transcription as GT and len > 2.
  gt_e2e : int
      Number of GT entries for which transcription len > 2.
  '''

    gt_to_detection = {}
    tp = 0
    tp_e2e = 0
    gt_e2e = 0

    draw = batch[4][0]
    normFactor = math.sqrt(
        draw.shape[1] * draw.shape[1] +
        draw.shape[0] * draw.shape[0])  # Normalization factor
    for i in range(0, len(detections)):

        det = detections[i]
        boxr = det[0]
        box = cv2.boxPoints(boxr)  # Predicted bounding-box parameters
        box = np.array(
            box, dtype="int")  # Convert predicted bounding-box to numpy array
        bbox = cv2.boundingRect(box)

        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate

        vis.draw_box_points(draw, box, color=(255, 0, 0))

        det_text = det[1][0]  # Predicted transcription for bounding-box
        #print(det_text)

        for gt_no in range(len(word_gto)):

            gt = word_gto[gt_no]
            txt = gt[5]  # GT transcription for given GT bounding-box
            gtbox = ((gt[0] * draw.shape[1], gt[1] * draw.shape[0]),
                     (gt[2] * normFactor, gt[3] * normFactor),
                     gt[4] * 180 / 3.14)  # Re-scaling GT values
            gtbox = cv2.boxPoints(gtbox)
            gtbox = np.array(gtbox, dtype="int")
            rect_gt = cv2.boundingRect(gtbox)

            rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
            rect_gt[2] += rect_gt[0]  # Convert GT width to right-coordinate
            rect_gt[3] += rect_gt[1]  # Convert GT height to bottom-coordinate

            inter = intersect(
                bbox,
                rect_gt)  # Intersection of predicted and GT bounding-boxes
            uni = union(bbox,
                        rect_gt)  # Union of predicted and GT bounding-boxes
            ratio = area(inter) / float(area(
                uni))  # IoU measure between predicted and GT bounding-boxes

            # 1). Visualize the predicted-bounding box if IoU with GT is higher than IoU threshold (iou_th) (Always required)
            # 2). Visualize the predicted-bounding box if transcription matches the GT and condition 1. holds
            # 3). Visualize the predicted-bounding box if transcription matches and IoU with GT is less than iou_th_vis and 1. and 2. hold
            if ratio > iou_th:
                vis.draw_box_points(draw, box, color=(0, 128, 0))
                if gt_no not in gt_to_detection:
                    gt_to_detection[gt_no] = [0, 0]

                if txt.lower() == det_text.lower():
                    to_cls_x.append(
                        [len(det_text), det[1][1], det[1][2], det[1][3]])
                    to_cls_y.append(1)
                    vis.draw_box_points(draw,
                                        box,
                                        color=(0, 255, 0),
                                        thickness=2)
                    gt[7] = 1  # Change this parameter to 1 when predicted transcription is correct.

                    if ratio < iou_th_vis:
                        vis.draw_box_points(draw,
                                            box,
                                            color=(255, 255, 255),
                                            thickness=2)
                        cv2.imshow('draw', draw)
                        #cv2.waitKey(0)

                else:
                    to_cls_x.append(
                        [len(det_text), det[1][1], det[1][2], det[1][3]])
                    to_cls_y.append(0)

                tupl = gt_to_detection[gt_no]
                if tupl[0] < ratio:
                    tupl[0] = ratio
                    tupl[1] = i

    # Count the number of end-to-end and detection true-positives
    for gt_no in range(len(word_gto)):
        gt = word_gto[gt_no]
        txt = gt[5]
        if len(txt) > 2:
            gt_e2e += 1
            if gt[7] == 1:
                tp_e2e += 1

        if gt_no in gt_to_detection:
            tupl = gt_to_detection[gt_no]
            if tupl[0] > iou_th_eval:  # Increment detection true-positive, if IoU is greater than iou_th_eval
                tp += 1

    cv2.imshow('draw', draw)
    return tp, tp_e2e, gt_e2e
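
A hedged usage sketch, not from the original evaluation script: if the per-image counts returned above are accumulated over a whole test set (dataset below is a placeholder iterable), detection and end-to-end recall could be reported roughly as follows:

# Hypothetical aggregation of evaluate_image() outputs over a test set.
tp_total, tp_e2e_total, gt_e2e_total = 0, 0, 0
for batch, detections, word_gto in dataset:   # dataset is a placeholder
    tp, tp_e2e, gt_e2e = evaluate_image(batch, detections, word_gto)
    tp_total += tp
    tp_e2e_total += tp_e2e
    gt_e2e_total += gt_e2e

# note: tp is counted over all GTs, so gt_e2e_total is only an approximate denominator
recall_det = tp_total / float(max(1, gt_e2e_total))
recall_e2e = tp_e2e_total / float(max(1, gt_e2e_total))
print('detection recall: {0:.3f}, end-to-end recall: {1:.3f}'.format(recall_det, recall_e2e))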
Example no. 9
def froward_image(nets, scaled, original):
    '''
  :param nets: (YOLO-style detection net, CTC recognition net)
  :param scaled: grayscale, resized input image
  :param original: original image
  :return:
  detections_out: [(((1181.9506549451335, 174.54442087680732), (116.45833333333334, 19.8), -2.3903521532498173), (u'FORQUEuEING', 0.885055888782848, True, 0)), ...]
                  each box is (centre (x, y), (width, height), rotation angle)
  fps: frames processed per second
  '''
    global rec_t, ext_factor, ext_factorx

    net, net_ctc = nets

    img = [scaled]

    # draw = img[0]
    # imgo = original

    im = np.asarray(img, dtype=np.float)
    im = im / 128.0
    im = im - 1.0
    #im = im.reshape((3, im.shape[0], im.shape[1]))
    im = np.swapaxes(im, 1, 3)
    im = np.swapaxes(im, 2, 3)

    net.blobs['data'].reshape(im.shape[0], im.shape[1], im.shape[2],
                              im.shape[3])
    net.blobs['data'].data[...] = im
    net.reshape()
    start = time.time()
    out = net.forward(start="conv1")
    end = time.time()
    seconds = end - start
    fps = 1 / seconds

    boxes = out['boxes']  # shape (1, 1, 500, 15): 500 anchors

    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms, :]

    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < 0.1:
            break
        boxes_count += 1

    detections_out = []
    # run recognition once on every detected box (kept after NMS with score > 0.1)
    for i in range(0, boxes_count):
        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14)  # predicted x, y, w, h and angle (degrees)
        box = cv2.boxPoints(boxr)  # four corner points of the rotated box

        box = np.array(box, dtype="int")
        #vis.draw_box_points(draw, box, (255, 0, 0))
        bbox = cv2.boundingRect(box)  # axis-aligned bounding rect: x, y, w, h
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]  # bbox is not actually used after this point

        boxro = [[det_word[0], det_word[1]],
                 [det_word[2] * ext_factorx, det_word[3] * ext_factor],
                 det_word[4] * 180 / 3.14]
        boxt = get_obox(img[0], original, boxro)
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        norm2, rot_mat = get_normalized_image(original, boxt)
        if norm2 is None:
            continue

        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY)

        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for b in range(0, len(buckets)):
            if best_diff > abs(width - buckets[b]):
                best_diff = abs(width - buckets[b])
                bestb = b

        scaled = cv2.resize(norm, (buckets[bestb], 32))

        imtf = np.asarray([scaled], dtype=np.float)
        delta = imtf.max() - imtf.min()
        imtf /= (delta / 2)
        imtf -= imtf.mean()
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1],
                                      imtf.shape[2], imtf.shape[3])
        net_ctc.blobs['data'].data[...] = imtf

        outctc = net_ctc.forward()  # ['loss', 'softmax']
        ctc_f = outctc['softmax']  # shape (48, 1, 1, 141)

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(2)  #(48, 1)

        mask = labels > 2
        masked = ctc_f.max(2)[mask]
        mean_conf = np.sum(masked) / masked.shape[0]

        if mean_conf < 0.2:
            vis.draw_box_points(scaled, box, color=(0, 0, 0))
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(
            labels[:, 0], np.sum(masked))  # det_text is the recognized string

        if len(det_text) == 0:
            continue
        if len(det_text) < 3 and mean_conf < 0.8:
            continue

        splits_raw = process_splits(det_text,
                                    conf,
                                    dec_s,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    original,
                                    0,
                                    mean_conf,
                                    alow_non_dict=True)
        detections_out.extend(splits_raw)

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])  # decoded via the trie/dictionary; may be empty, in which case the loop continues below

        if len(best_dict) == 0:
            continue
        splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f,
                                    rot_mat, boxt, original, 1, mean_conf)
        detections_out.extend(splits_out)

    return detections_out, fps
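
print_seq_ext itself is not part of this excerpt; judging by the argmax/mask logic above, it performs a greedy CTC-style decode (collapse repeated labels, drop blank/control indices below 3). A minimal sketch under that assumption, with codec as a hypothetical index-to-character table:

# Hedged sketch of a greedy CTC decode consistent with the argmax/mask logic above.
# codec (index -> character) is a hypothetical stand-in for the repository's codec.
def greedy_ctc_decode(labels, codec, blank_threshold=3):
    out, prev = [], -1
    for idx in labels:                     # labels: per-timestep argmax indices
        if idx != prev and idx >= blank_threshold:
            out.append(codec[idx])
        prev = idx
    return ''.join(out)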
Example no. 10
def test_pic(nets):

    global rec_t, image_size

    font = ImageFont.truetype(
        "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf", 16)
    font2 = ImageFont.truetype(
        "/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf", 18)

    impath = "images/demo.jpg"
    im = cv2.imread(impath)

    image_size = [640 // 64 * 64, 480 // 64 * 64]  # snap dimensions to a multiple of 64

    scaled = cv2.resize(im, (image_size[0], image_size[1]))  # resize to the network input size
    if nets[0].blobs['data'].data[...].shape[1] == 1:
        scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
        scaled = scaled.reshape((scaled.shape[0], scaled.shape[1], 1))

    # detection & recognition
    detections_out, fps = froward_image(nets, scaled, im)

    img = Image.fromarray(im)
    draw = ImageDraw.Draw(img)

    for detection in detections_out:
        text = detection[1][0]
        print(text)
        width, height = draw.textsize(text,
                                      font=font)  # returns (width, height): the pixel size of the rendered string
        center = [detection[0][0][0] - width / 2, detection[0][0][1] - 10]

        sx = int(detection[0][0][0] - width / 2)
        ex = int(detection[0][0][0] + width / 2)
        sy = int(detection[0][0][1] - 10)
        ey = int(detection[0][0][1] + 10)

        im[sy:ey, sx:ex] = im[sy:ey, sx:ex] / 2

        boxr = ((detection[0][0][0], detection[0][0][1]),
                (detection[0][1][0], detection[0][1][1]), detection[0][2])
        box = cv2.boxPoints(
            boxr
        )  # numpy array of four corner points [[x, y], ...] computed from (centre (x, y), (width, height), rotation angle)
        color = (0, 255, 0)
        vis.draw_box_points(im, box, color, thickness=1)

    img = Image.fromarray(im)
    draw = ImageDraw.Draw(img)

    draw.text((10, 10), 'FPS: {0:.2f}'.format(fps), (0, 255, 0), font=font2)

    for detection in detections_out:
        text = detection[1][0]
        width, height = draw.textsize(text, font=font)
        center = [detection[0][0][0] - width / 2, detection[0][0][1] - 10]
        draw.text((center[0], center[1]), text, fill=(0, 255, 0), font=font)

    pix = np.array(img)
    if pix.shape[0] > 1024:
        pix = cv2.resize(pix, (pix.shape[1] // 2, pix.shape[0] // 2))

    cv2.imwrite(impath + "_result_pix.jpg", pix)  # image with boxes and the recognized text drawn inside
Example no. 11
def evaluate_image(batch, detections, word_gto, iou_th=0.3, iou_th_vis=0.5, iou_th_eval=0.4):
    
  '''
  Summary : Returns end-to-end true-positives, detection true-positives, number of GT to be considered for eval (len > 2).
  Description : For each predicted bounding-box, a comparison is made with each GT entry. The numbers of end-to-end true
                positives, detection true positives, and GT entries to be considered for evaluation are computed.
  
  Parameters
  ----------
  iou_th_eval : float
      Threshold value of intersection-over-union used for evaluation of predicted bounding-boxes
  iou_th_vis : float
      Threshold value of intersection-over-union used for visualization when the transcription is correct but the IoU is lower.
  iou_th : float
      Threshold value of intersection-over-union between GT and prediction.
  word_gto : list of lists
      List of ground-truth bounding boxes along with transcription.
  batch : list of lists
      List containing data (input image, image file name, ground truth).
  detections : tuple of tuples
      Tuple of predicted bounding boxes along with transcriptions and text/no-text score.
  
  Returns
  -------
  tp : int
      Number of predicted bounding-boxes having IoU with GT greater than iou_th_eval.
  tp_e2e : int
      Number of predicted bounding-boxes having the same transcription as GT and len > 2.
  gt_e2e : int
      Number of GT entries for which transcription len > 2.
  '''
  
  gt_to_detection = {}
  tp = 0
  tp_e2e = 0
  gt_e2e = 0
  
  draw = batch[4][0]    
  normFactor = math.sqrt(draw.shape[1] * draw.shape[1] + draw.shape[0] * draw.shape[0]) # Normalization factor
  for i in range(0, len(detections)):
      
    det = detections[i]
    boxr = det[0]
    box = cv2.boxPoints(boxr) # Predicted bounding-box parameters
    box = np.array(box, dtype="int") # Convert predicted bounding-box to numpy array
    bbox = cv2.boundingRect(box)
    
    bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
    bbox[2] += bbox[0] # Convert width to right-coordinate
    bbox[3] += bbox[1] # Convert height to bottom-coordinate
    
    vis.draw_box_points(draw, box, color = (255, 0, 0))
    
    det_text = det[1][0] # Predicted transcription for bounding-box
    #print(det_text)
    
    for gt_no in range(len(word_gto)):
        
      gt = word_gto[gt_no]
      txt = gt[5] # GT transcription for given GT bounding-box
      gtbox  = ((gt[0] * draw.shape[1], gt[1] * draw.shape[0]), (gt[2] * normFactor, gt[3] * normFactor), gt[4] * 180 / 3.14) # Re-scaling GT values
      gtbox = cv2.boxPoints(gtbox)
      gtbox = np.array(gtbox, dtype="int")
      rect_gt = cv2.boundingRect(gtbox)
      
      
      rect_gt = [rect_gt[0], rect_gt[1], rect_gt[2], rect_gt[3]]
      rect_gt[2] += rect_gt[0] # Convert GT width to right-coordinate
      rect_gt[3] += rect_gt[1] # Convert GT height to bottom-coordinate 

      inter = intersect(bbox, rect_gt) # Intersection of predicted and GT bounding-boxes
      uni = union(bbox, rect_gt) # Union of predicted and GT bounding-boxes
      ratio = area(inter) / float(area(uni)) # IoU measure between predicted and GT bounding-boxes
      
      # 1). Visualize the predicted-bounding box if IoU with GT is higher than IoU threshold (iou_th) (Always required)
      # 2). Visualize the predicted-bounding box if transcription matches the GT and condition 1. holds
      # 3). Visualize the predicted-bounding box if transcription matches and IoU with GT is less than iou_th_vis and 1. and 2. hold
      if ratio > iou_th:
        vis.draw_box_points(draw, box, color = (0, 128, 0))
        if gt_no not in gt_to_detection:
          gt_to_detection[gt_no] = [0, 0]
            
        if txt.lower() == det_text.lower():
          to_cls_x.append([len(det_text), det[1][1], det[1][2], det[1][3]])
          to_cls_y.append(1)
          vis.draw_box_points(draw, box, color = (0, 255, 0), thickness=2)
          gt[7] = 1 # Change this parameter to 1 when predicted transcription is correct.
          
          if ratio < iou_th_vis:
              vis.draw_box_points(draw, box, color = (255, 255, 255), thickness=2)
              cv2.imshow('draw', draw) 
              #cv2.waitKey(0)
                
        else:
          to_cls_x.append([len(det_text), det[1][1], det[1][2], det[1][3]])
          to_cls_y.append(0)
          
        tupl = gt_to_detection[gt_no] 
        if tupl[0] < ratio:
          tupl[0] = ratio 
          tupl[1] = i   
                  
  # Count the number of end-to-end and detection true-positives
  for gt_no in range(len(word_gto)):
    gt = word_gto[gt_no]
    txt = gt[5]
    if len(txt) > 2:
      gt_e2e += 1
      if gt[7] == 1:
        tp_e2e += 1
            
    if gt_no in gt_to_detection:
      tupl = gt_to_detection[gt_no] 
      if tupl[0] > iou_th_eval: # Increment detection true-positive, if IoU is greater than iou_th_eval
        tp += 1             
          
  cv2.imshow('draw', draw)             
  return tp, tp_e2e, gt_e2e 
Example no. 12
def ocr_detections(net_ctc, img, scaled_img, boxes, image_size, r_p_th, out_raw, baseName, debug, split_words, alow_non_dict=False):
    
  global rec_t, ext_factor, use_per_image
    
  draw = np.copy(scaled_img)
    
  # Region layer returns normalized coordinates; convert the generated boxes to the image coordinate system
  boxes[0, 0, :, 0] *= image_size[0]
  boxes[0, 0, :, 1] *= image_size[1]
  normFactor = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0])
  boxes[0, 0, :, 2] *= normFactor
  boxes[0, 0, :, 3] *= normFactor
  
  nms_mask = boxes[0, 0, :, 8] != 1
  boxes = boxes[:, :, nms_mask, :]
  
  # Region layer returns boxes sorted by r_{p}; filter out the boxes with r_{p} below the threshold value
  boxes_count = 0
  for i in range(0, boxes.shape[2]):
      det_word = boxes[0, 0, i]
      if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < r_p_th:
        break
      boxes_count += 1
  
  detections_out = []
  
  for i in range(0, boxes_count):
      
    det_word = boxes[0, 0, i]
    boxr  = ((det_word[0], det_word[1]), (det_word[2], det_word[3]), det_word[4] * 180 / 3.14) # Convert the rotation parameter to degrees
    box = cv2.boxPoints(boxr) # Gives the coordinates for 4 points of bounding-box
    box = np.array(box, dtype="int")
    
    if det_word[3] < 5:
      continue
    
    if debug:
      try:
        vis.draw_box_points(draw, box, (255, 0, 0)) # Visualize the predicted bounding-boxes
      except Exception:
        pass
    
    bbox = cv2.boundingRect(box)
    bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
    bbox[2] += bbox[0] # Convert width to right-coordinate
    bbox[3] += bbox[1] # Convert height to bottom-coordinate
    
    boxro  = [[det_word[0], det_word[1]], [det_word[2]  * ext_factorx, det_word[3] * ext_factor], det_word[4] * 180 / 3.14] # Re-scaling the bounding-box parameters to increase height and width, this helps recognizer
    boxt = get_obox(scaled_img, img, boxro) # Rescale the predicted bounding box to original image size
    boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])
    
    norm2, rot_mat = get_normalized_image(img, boxt) # norm2 stores normalized cropped region from original image determined by predicted bounding box
    if norm2 is None:
      continue
    #boxt[2] = boxt[2] * 180 / 3.14
    #cv2.imshow('norm2', norm2)
    #cv2.imshow('draw', draw)
    if norm2.ndim > 2:
        norm = cv2.cvtColor(norm2, cv2.COLOR_BGR2GRAY) # Convert the cropped region to grayscale for the recognizer
    else:
        norm = norm2 # Already grayscale
    
    # Rescale each cropped region to a fixed height of 32, then map the resulting width to a bucket (prefer the next larger bucket unless its overshoot exceeds 3x the gap to the largest smaller one)
    width_scale = 32.0 / norm2.shape[0]
    width = norm.shape[1] * width_scale
    best_diff = width
    bestb = 0
    for idx, val in enumerate(buckets):
      if (buckets[idx] - width) < 0  :
          bestb = idx
          best_diff = abs(buckets[idx] - width) * 3
          continue
      if best_diff > (buckets[idx] - width): 
          bestb = idx
          best_diff = (buckets[idx] - width)
    scaled = cv2.resize(norm, (buckets[bestb], 32)) # Resize cropped region for input for recognizer FCN
         
    if scaled.ndim == 3:
      scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY) 
    
    imtf = np.asarray([scaled], dtype=np.float64)
    imtf /= 128.0
    imtf -= 1
    imtf = np.reshape(imtf, (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2])) 
        
    net_ctc.blobs['data'].reshape(imtf.shape[0], imtf.shape[1], imtf.shape[2], imtf.shape[3]) # Reshape the recognizer FCN input to the varying cropped-region size
    net_ctc.blobs['data'].data[...] = imtf # Load the data onto recognizer FCN (cropped region data)
    net_ctc.forward() # Recognizer FCN feed-forward
    ctc_f = net_ctc.blobs['softmax'].data[...] 
    
    ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
    labels = ctc_f.argmax(2) # The last axis of ctc_f holds the softmax distribution over characters for each position; labels stores the index of the most probable character at each position
    mask = labels > 3
    masked = ctc_f.max(2)[mask] # For each predicted character, fetch the corresponding score
    mean_conf = np.sum(masked) / masked.shape[0] # Mean score for all the predicted characters
    
    # Skip the detection if the mean character score is below 0.3
    if mean_conf < 0.3:
      continue
    
    if debug:    
      vis.vis_square(imtf[0])
    
    det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked) ) 
    if not split_words:
      detections_out.extend( [(boxt, (det_text, mean_conf, 1, mean_conf) )] )
      continue
    
    #print(det_text)
    #str_lm, pr =  cmp_trie.decode_sofmax_lm(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
    #if det_text != str_lm:
    #  print('  Decoding diff: {0} - {1}'.format(det_text, str_lm))
    #  det_text = str_lm.strip()
    
    if len(det_text.strip()) == 0:
      continue
    
    if len(det_text.strip()) <= 3:
      if mean_conf < 0.6 or det_word[5] < 0.4:
        continue
    
    pr = 1
    for k in range(masked.shape[0]):
      pr = pr *  masked[k]
    pr = math.exp(pr)
    #pr = math.pow(pr, 1.0/ len(det_text) )
    
    #tex_conf =  mean_conf / ctc_f.shape[0]
    #if tex_conf < 0.1:
    #  continue
    
    #print(det_text)
    #cv2.imshow('norm2', norm2)
    splits_raw = process_splits(det_text, conf, dec_s, norm2, ctc_f, rot_mat, boxt, img, det_word[5], mean_conf, alow_non_dict = alow_non_dict) # Split the transcription on spaces predicted by the recognizer and refine the localization of each word
    detections_out.extend( splits_raw )
    spl = det_text.split(" ")
    
    if len(spl) == 1 and cmp_trie.is_dict(spl[0].lower().encode('utf-8')) == 1:
      continue
                  
    
    dec2, conf2, dec_splits = cmp_trie.decode_sofmax(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
    best_dict = print_seq2(dec2[0])
    
    if out_raw is not None and len(det_text) > 2:
      boxout = cv2.boxPoints(boxt)    
      out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format(\
              baseName[:-4],boxout[0, 0],boxout[0, 1], boxout[1, 0], boxout[1, 1], \
              boxout[2, 0], boxout[2, 1], boxout[3, 0], boxout[3, 1], det_text, best_dict, mean_conf).encode('utf8'))
  
    splits_out = process_splits(best_dict, conf2, dec_splits, norm2, ctc_f, rot_mat, boxt, img, det_word[5], pr, alow_non_dict=False)
    detections_out.extend( splits_out )
  
  #detections_out = nms(detections_out)
  if out_raw is not None:
    out_raw.flush()   
  
  cv2.imshow('draw', draw)
  cv2.waitKey(10)  
  return detections_out 
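
# The width-to-bucket mapping above is the least obvious step, so here is a
# standalone sketch of just that logic. The buckets list used in the comment is
# a made-up example; the real list is defined elsewhere in the module.
def pick_bucket(width, buckets):
    # Prefer the next larger bucket; fall back to the largest smaller bucket
    # only when the overshoot would exceed 3x the gap left below the width.
    best_diff = width
    bestb = 0
    for idx in range(len(buckets)):
        if buckets[idx] - width < 0:
            bestb = idx
            best_diff = abs(buckets[idx] - width) * 3
            continue
        if best_diff > buckets[idx] - width:
            bestb = idx
            best_diff = buckets[idx] - width
    return buckets[bestb]

# With buckets = [32, 64, 128, 256, 512]:
#   pick_bucket(100, buckets) -> 128 (small overshoot, take the larger bucket)
#   pick_bucket(130, buckets) -> 128 (overshoot to 256 exceeds 3x the 2px gap below)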
Exemplo n.º 13
0
 def random_crop(self, img, word_gto):
     
     xs =  int(random.uniform(0, self.crop_ratio) * img.shape[1])
     xe =  int(random.uniform(0, self.crop_ratio) * img.shape[1])
     maxx = img.shape[1] - xe
     
     ys =  int(random.uniform(0, self.crop_ratio) * img.shape[0])
     ye =  int(random.uniform(0, self.crop_ratio) * img.shape[0])
     maxy = img.shape[0] - ye
     
     crop_img = img[ys:maxy, xs:maxx]
     
     normo = math.sqrt(img.shape[0] * img.shape[0] + img.shape[1] * img.shape[1] )
     
     image_size = (crop_img.shape[1], crop_img.shape[0]) 
     normo2 = math.sqrt(image_size[1] * image_size[1] + image_size[0] * image_size[0] )
     
     o_size = (img.shape[1], img.shape[0])
     
     gt_out = []
     for gt_no in range(len(word_gto)): #TODO - remove loop ... use numpy
         
         gt = word_gto[gt_no]
         
         gtbox  = ((gt[0] * o_size[0], gt[1] * o_size[1]), (gt[2] * normo, gt[3] * normo), gt[4] * 180 / 3.14)
         gtbox = cv2.boxPoints(gtbox)
         gtbox = np.array(gtbox, dtype="float")
         
         gtbox[:, 0] -= xs
         gtbox[:, 1] -= ys
         
         gtbox[gtbox[:, 0] < 0, 0] = 0
         gtbox[gtbox[:, 1] < 0, 1] = 0
         
         gtbox[gtbox[:, 0] > maxx, 0] = maxx
         gtbox[gtbox[:, 1] > maxy, 1] = maxy
         
         dh = gtbox[0, :] - gtbox[1, :]
         dw = gtbox[1, :] - gtbox[2, :]
         
         centerx = np.sum( gtbox[:, 0] ) / float(gtbox.shape[0])
         centery = np.sum( gtbox[:, 1] ) / float(gtbox.shape[0])
         
         
                     
         h = math.sqrt(dh[0] * dh[0] + dh[1] * dh[1]) / normo2
         w = math.sqrt(dw[0] * dw[0] + dw[1] * dw[1]) / normo2
         
         if w * normo2 < 2 or h * normo2  < 2 or np.isinf(w) or np.isinf(h):
             #print("warn: removig too small gt {0}".format(gt))
             continue
         
         gt[0] = centerx / image_size[0]
         gt[1] = centery / image_size[1]
         gt[2] = w
         gt[3] = h
         
         gt[4] = math.atan2((gtbox[2, 1] - gtbox[1, 1]), (gtbox[2, 0] - gtbox[1, 0]))
         
         if False:
             draw_box_points(crop_img,  np.array(gtbox, dtype="int"), color = (0, 255, 0))
             
             gtbox2  = ((gt[0] * image_size[0], gt[1] * image_size[1]), (gt[2] * normo2, gt[3] * normo2), gt[4] * 180 / 3.14)
             gtbox2 = cv2.boxPoints(gtbox2)
             gtbox2 = np.array(gtbox2, dtype="float")
             
             draw_box_points(crop_img,  np.array(gtbox2, dtype="int"), color = (0, 255, 0))
             cv2.imshow('c2', crop_img)
             
         gt_out.append(gt)
         
     #cv2.imshow('crop_img', crop_img)
     #cv2.waitKey(0)
     return (crop_img, gt_out)
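
# For reference, a hedged sketch of the ground-truth encoding that random_crop
# maintains: the center is normalized by the image size and the width/height by
# the image diagonal, with the angle stored in radians. The helper below simply
# inverts that encoding back to a pixel-space rotated box, mirroring the gtbox
# construction above; the example values at the bottom are made up.
import math
import cv2

def gt_to_pixel_box(gt, image_size):
    # gt = [cx, cy, w, h, angle, ...], image_size = (width, height)
    norm = math.sqrt(image_size[0] ** 2 + image_size[1] ** 2)
    box = ((gt[0] * image_size[0], gt[1] * image_size[1]),
           (gt[2] * norm, gt[3] * norm),
           gt[4] * 180 / 3.14)  # same radians-to-degrees conversion as the snippet above
    return cv2.boxPoints(box)  # 4 corner points in pixel coordinates

# e.g. gt_to_pixel_box([0.5, 0.5, 0.2, 0.05, 0.0], (640, 480))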
Exemplo n.º 14
0
def process_splits(trans, conf, splits, norm2, ctc_f, rot_mat, boxt, draw, iou, debug = False, alow_non_dict = False):

  '''
  Summary : Split the transcription and its corresponding bounding-box based on spaces predicted by the recognizer FCN.
  Description : 

  Parameters
  ----------
  trans : string
      String containing the predicted transcription for the corresponding predicted bounding-box.
  conf : list
      List containing the sum of character confidences from the recognizer FCN, plus the start and end positions of the generated transcription within the bounding-box.
  splits :  list
      List containing index of position of predicted spaces by the recognizer FCN.
  norm2 : matrix
      Matrix containing the cropped region of the original image corresponding to the bounding-box predicted by the localization FCN.
  ctc_f : matrix
      Matrix containing output of recognizer FCN for the given input bounding-box.
  rot_mat : matrix
      Rotation matrix returned by get_normalized_image function.
  boxt : tuple of tuples
      Tuple of tuples containing the parameters of the bounding-box predicted by the localization FCN.
  draw : matrix
      Matrix containing input image. 
  debug : boolean
      Boolean flag for debug mode; if True, visualization boxes are drawn.
          
  Returns
  -------
  boxes_out : list of tuples
      List of tuples containing predicted bounding-box parameters, predicted transcription and mean confidence score from the recognizer.
  '''
  
  spl = trans.split(" ")
  boxout = cv2.boxPoints(boxt)
  start_f = 0
  mean_conf = conf[0, 0] / len(trans) # Average per-character confidence from the recognizer FCN
  boxes_out = []
  
  for s in range(len(spl)):
      
    text = spl[s]
    
    end_f = conf[0, 2]
    if s < len(spl) - 1:
      try:
        if splits[0, s] > start_f:
          end_f = splits[0, s] # New ending point of bounding-box transcription
      except IndexError:
        pass
    
    scalex = norm2.shape[1] / float(ctc_f.shape[0])
        
    poss = start_f * scalex
    pose = (end_f + 2) * scalex
    rect = [[poss, 0], [pose, 0], \
            [pose, norm2.shape[0] - 1], [poss, norm2.shape[0] - 1]]
    rect = np.array(rect)
    #rect[:, 0] +=  boxt[0][0]
    #rect[:, 1] += boxt[0][1]
    
    int_t = cv2.invertAffineTransform(rot_mat)
    
    dst_rect = np.copy(rect)
    dst_rect[:,0]  = int_t[0,0]*rect[:,0] + int_t[0,1]*rect[:, 1] + int_t[0,2]
    dst_rect[:,1]  = int_t[1,0]*rect[:,0] + int_t[1,1]*rect[:, 1] + int_t[1,2]
    
    
    tx = np.sum(dst_rect[:,0]) / 4.0
    ty = np.sum(dst_rect[:,1]) / 4.0
    br = cv2.boundingRect(boxout)
    tx += br[0]
    ty += br[1]
    twidth = (pose - poss) #twidth = (pose - poss) / ext_factor
    theight = norm2.shape[0]
    
    
    box_back = ( (tx, ty), (twidth, theight * 0.9), boxt[2] )
    
    if debug:
      boxout_u = cv2.boxPoints(box_back)
      vis.draw_box_points(draw, boxout_u, color = (0, 255, 0))
      cv2.imshow('draw', draw)
        
    if len(text.strip()) == 0:
      print("zero length text!")
      continue 
    
    textc = text.replace(".", "").replace(":", "").replace("!", "").replace("?", "").replace(",", "").replace("/", "").replace("-", "").replace("$", "").replace("'", "").replace("(", "").replace(")", "").replace("+", "")
    if textc.endswith("'s"):
      textc = textc[:-2]
    is_dict = cmp_trie.is_dict(textc.encode('utf-8')) or textc.isdigit() or alow_non_dict
    if len(text) > 2 and ( text.isdigit() or is_dict):
        boxes_out.append( (box_back, (text, mean_conf, is_dict, iou) ) )
    start_f = end_f + 1      
  return boxes_out    
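
# The geometric core of process_splits is mapping a horizontal strip of the
# normalized crop back through the inverse of the rotation matrix returned by
# get_normalized_image. A small self-contained sketch of just that step, using
# the same arithmetic as above; the helper name is chosen here for illustration.
import cv2
import numpy as np

def strip_to_crop_coords(poss, pose, crop_height, rot_mat):
    # Corners of the strip [poss, pose] x [0, crop_height - 1] in the normalized crop
    rect = np.array([[poss, 0], [pose, 0],
                     [pose, crop_height - 1], [poss, crop_height - 1]],
                    dtype=np.float64)
    int_t = cv2.invertAffineTransform(rot_mat)  # invert the 2x3 affine matrix
    dst = np.empty_like(rect)
    dst[:, 0] = int_t[0, 0] * rect[:, 0] + int_t[0, 1] * rect[:, 1] + int_t[0, 2]
    dst[:, 1] = int_t[1, 0] * rect[:, 0] + int_t[1, 1] * rect[:, 1] + int_t[1, 2]
    return dst  # 4 corner points, before the bounding-rect offset is added back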
Exemplo n.º 15
0
def ocr_detections(net_ctc,
                   img,
                   scaled_img,
                   boxes,
                   image_size,
                   r_p_th,
                   out_raw,
                   baseName,
                   debug,
                   split_words,
                   alow_non_dict=False):

    global rec_t, ext_factor, use_per_image

    draw = np.copy(scaled_img)

    # Region layer returns normalized coordinates, convert the generated boxes to the image coordinate system
    boxes[0, 0, :, 0] *= image_size[0]
    boxes[0, 0, :, 1] *= image_size[1]
    normFactor = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])
    boxes[0, 0, :, 2] *= normFactor
    boxes[0, 0, :, 3] *= normFactor

    nms_mask = boxes[0, 0, :, 8] != 1
    boxes = boxes[:, :, nms_mask, :]

    # Region layer returns boxes sorted by r_{p}; filter out boxes with r_{p} below the threshold
    boxes_count = 0
    for i in range(0, boxes.shape[2]):
        det_word = boxes[0, 0, i]
        if (det_word[0] == 0 and det_word[1] == 0) or det_word[5] < r_p_th:
            break
        boxes_count += 1

    detections_out = []

    for i in range(0, boxes_count):

        det_word = boxes[0, 0, i]
        boxr = ((det_word[0], det_word[1]), (det_word[2], det_word[3]),
                det_word[4] * 180 / 3.14
                )  # Convert the rotation parameter to degrees
        box = cv2.boxPoints(
            boxr)  # Gives the coordinates for 4 points of bounding-box
        box = np.array(box, dtype="int")

        if det_word[3] < 5:
            continue

        if debug:
            try:
                vis.draw_box_points(
                    draw, box,
                    (255, 0, 0))  # Visualize the predicted bounding-boxes
            except:
                pass

        bbox = cv2.boundingRect(box)
        bbox = [bbox[0], bbox[1], bbox[2], bbox[3]]
        bbox[2] += bbox[0]  # Convert width to right-coordinate
        bbox[3] += bbox[1]  # Convert height to bottom-coordinate

        boxro = [
            [det_word[0], det_word[1]],
            [det_word[2] * ext_factorx,
             det_word[3] * ext_factor], det_word[4] * 180 / 3.14
        ]  # Enlarge the bounding-box width and height; the extra context helps the recognizer
        boxt = get_obox(
            scaled_img, img,
            boxro)  # Rescale the predicted bounding box to original image size
        boxt = ((boxt[0][0], boxt[0][1]), (boxt[1][0], boxt[1][1]), boxt[2])

        norm2, rot_mat = get_normalized_image(
            img, boxt
        )  # norm2 stores normalized cropped region from original image determined by predicted bounding box
        if norm2 is None:
            continue
        #boxt[2] = boxt[2] * 180 / 3.14
        #cv2.imshow('norm2', norm2)
        #cv2.imshow('draw', draw)
        if norm2.ndim > 2:
            norm = cv2.cvtColor(
                norm2, cv2.COLOR_BGR2GRAY
            )  # Convert the cropped region to grayscale for the recognizer
        else:
            norm = norm2  # Already grayscale

        # Rescale each cropped region to a fixed height of 32, then map the resulting width to a bucket (prefer the next larger bucket unless its overshoot exceeds 3x the gap to the largest smaller one)
        width_scale = 32.0 / norm2.shape[0]
        width = norm.shape[1] * width_scale
        best_diff = width
        bestb = 0
        for idx, val in enumerate(buckets):
            if (buckets[idx] - width) < 0:
                bestb = idx
                best_diff = abs(buckets[idx] - width) * 3
                continue
            if best_diff > (buckets[idx] - width):
                bestb = idx
                best_diff = (buckets[idx] - width)
        scaled = cv2.resize(
            norm, (buckets[bestb],
                   32))  # Resize cropped region for input for recognizer FCN

        if scaled.ndim == 3:
            scaled = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)

        imtf = np.asarray([scaled], dtype=np.float64)
        imtf /= 128.0
        imtf -= 1
        imtf = np.reshape(imtf,
                          (imtf.shape[0], -1, imtf.shape[1], imtf.shape[2]))

        net_ctc.blobs['data'].reshape(
            imtf.shape[0], imtf.shape[1], imtf.shape[2], imtf.shape[3]
        )  # Reshape the recognizer FCN input to the varying cropped-region size
        net_ctc.blobs['data'].data[
            ...] = imtf  # Load the data onto recognizer FCN (cropped region data)
        net_ctc.forward()  # Recognizer FCN feed-forward
        ctc_f = net_ctc.blobs['softmax'].data[...]

        ctc_f = ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[1], ctc_f.shape[3])
        labels = ctc_f.argmax(
            2
        )  # The last axis of ctc_f holds the softmax distribution over characters for each position; labels stores the index of the most probable character at each position
        mask = labels > 3
        masked = ctc_f.max(
            2
        )[mask]  # For each predicted character, fetch the corresponding score
        mean_conf = np.sum(masked) / masked.shape[
            0]  # Mean score for all the predicted characters

        # Skip the detection if the mean character score is below 0.3
        if mean_conf < 0.3:
            continue

        if debug:
            vis.vis_square(imtf[0])

        det_text, conf, dec_s = print_seq_ext(labels[:, 0], np.sum(masked))
        if not split_words:
            detections_out.extend([(boxt, (det_text, mean_conf, 1, mean_conf))
                                   ])
            continue

        #print(det_text)
        #str_lm, pr =  cmp_trie.decode_sofmax_lm(ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        #if det_text != str_lm:
        #  print('  Decoding diff: {0} - {1}'.format(det_text, str_lm))
        #  det_text = str_lm.strip()

        if len(det_text.strip()) == 0:
            continue

        if len(det_text.strip()) <= 3:
            if mean_conf < 0.6 or det_word[5] < 0.4:
                continue

        pr = 1
        for k in range(masked.shape[0]):
            pr = pr * masked[k]
        pr = math.exp(pr)
        #pr = math.pow(pr, 1.0/ len(det_text) )

        #tex_conf =  mean_conf / ctc_f.shape[0]
        #if tex_conf < 0.1:
        #  continue

        #print(det_text)
        #cv2.imshow('norm2', norm2)
        splits_raw = process_splits(
            det_text,
            conf,
            dec_s,
            norm2,
            ctc_f,
            rot_mat,
            boxt,
            img,
            det_word[5],
            mean_conf,
            alow_non_dict=alow_non_dict
        )  # Split the transcription on spaces predicted by the recognizer and refine the localization of each word
        detections_out.extend(splits_raw)
        spl = det_text.split(" ")

        if len(spl) == 1 and cmp_trie.is_dict(
                spl[0].lower().encode('utf-8')) == 1:
            continue

        dec2, conf2, dec_splits = cmp_trie.decode_sofmax(
            ctc_f.reshape(ctc_f.shape[0], ctc_f.shape[2]))
        best_dict = print_seq2(dec2[0])

        if out_raw is not None and len(det_text) > 2:
            boxout = cv2.boxPoints(boxt)
            out_raw.write(u"{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}|{10}|{11}\n".format(\
                    baseName[:-4],boxout[0, 0],boxout[0, 1], boxout[1, 0], boxout[1, 1], \
                    boxout[2, 0], boxout[2, 1], boxout[3, 0], boxout[3, 1], det_text, best_dict, mean_conf).encode('utf8'))

        splits_out = process_splits(best_dict,
                                    conf2,
                                    dec_splits,
                                    norm2,
                                    ctc_f,
                                    rot_mat,
                                    boxt,
                                    img,
                                    det_word[5],
                                    pr,
                                    alow_non_dict=False)
        detections_out.extend(splits_out)

    #detections_out = nms(detections_out)
    if out_raw is not None:
        out_raw.flush()

    cv2.imshow('draw', draw)
    cv2.waitKey(10)
    return detections_out
Exemplo n.º 16
0
def process_splits(trans,
                   conf,
                   splits,
                   norm2,
                   ctc_f,
                   rot_mat,
                   boxt,
                   draw,
                   iou,
                   debug=False,
                   alow_non_dict=False):
    '''
  Summary : Split the transcription and its corresponding bounding-box based on spaces predicted by the recognizer FCN.
  Description : 

  Parameters
  ----------
  trans : string
      String containing the predicted transcription for the corresponding predicted bounding-box.
  conf : list
      List containing the sum of character confidences from the recognizer FCN, plus the start and end positions of the generated transcription within the bounding-box.
  splits :  list
      List containing index of position of predicted spaces by the recognizer FCN.
  norm2 : matrix
      Matrix containing the cropped region of the original image corresponding to the bounding-box predicted by the localization FCN.
  ctc_f : matrix
      Matrix containing output of recognizer FCN for the given input bounding-box.
  rot_mat : matrix
      Rotation matrix returned by get_normalized_image function.
  boxt : tuple of tuples
      Tuple of tuples containing the parameters of the bounding-box predicted by the localization FCN.
  draw : matrix
      Matrix containing input image. 
  debug : boolean
      Boolean flag for debug mode; if True, visualization boxes are drawn.
          
  Returns
  -------
  boxes_out : list of tuples
      List of tuples containing predicted bounding-box parameters, predicted transcription and mean confidence score from the recognizer.
  '''

    spl = trans.split(" ")
    boxout = cv2.boxPoints(boxt)
    start_f = 0
    mean_conf = conf[0, 0] / len(trans)  # Average per-character confidence from the recognizer FCN
    boxes_out = []

    for s in range(len(spl)):

        text = spl[s]

        end_f = conf[0, 2]
        if s < len(spl) - 1:
            try:
                if splits[0, s] > start_f:
                    end_f = splits[
                        0, s]  # New ending point of bounding-box transcription
            except IndexError:
                pass

        scalex = norm2.shape[1] / float(ctc_f.shape[0])

        poss = start_f * scalex
        pose = (end_f + 2) * scalex
        rect = [[poss, 0], [pose, 0], \
                [pose, norm2.shape[0] - 1], [poss, norm2.shape[0] - 1]]
        rect = np.array(rect)
        #rect[:, 0] +=  boxt[0][0]
        #rect[:, 1] += boxt[0][1]

        int_t = cv2.invertAffineTransform(rot_mat)

        dst_rect = np.copy(rect)
        dst_rect[:, 0] = int_t[0, 0] * rect[:, 0] + int_t[
            0, 1] * rect[:, 1] + int_t[0, 2]
        dst_rect[:, 1] = int_t[1, 0] * rect[:, 0] + int_t[
            1, 1] * rect[:, 1] + int_t[1, 2]

        tx = np.sum(dst_rect[:, 0]) / 4.0
        ty = np.sum(dst_rect[:, 1]) / 4.0
        br = cv2.boundingRect(boxout)
        tx += br[0]
        ty += br[1]
        twidth = (pose - poss)  #twidth = (pose - poss) / ext_factor
        theight = norm2.shape[0]

        box_back = ((tx, ty), (twidth, theight * 0.9), boxt[2])

        if debug:
            boxout_u = cv2.boxPoints(box_back)
            vis.draw_box_points(draw, boxout_u, color=(0, 255, 0))
            cv2.imshow('draw', draw)

        if len(text.strip()) == 0:
            print("zero length text!")
            continue

        textc = text.replace(".", "").replace(":", "").replace(
            "!",
            "").replace("?", "").replace(",", "").replace("/", "").replace(
                "-", "").replace("$",
                                 "").replace("'", "").replace("(", "").replace(
                                     ")", "").replace("+", "")
        if textc.endswith("'s"):
            textc = textc[:-2]
        is_dict = cmp_trie.is_dict(
            textc.encode('utf-8')) or textc.isdigit() or alow_non_dict
        if len(text) > 2 and (text.isdigit() or is_dict):
            boxes_out.append((box_back, (text, mean_conf, is_dict, iou)))
        start_f = end_f + 1
    return boxes_out
Exemplo n.º 17
0
    def random_crop(self, img, word_gto):

        xs = int(random.uniform(0, self.crop_ratio) * img.shape[1])
        xe = int(random.uniform(0, self.crop_ratio) * img.shape[1])
        maxx = img.shape[1] - xe

        ys = int(random.uniform(0, self.crop_ratio) * img.shape[0])
        ye = int(random.uniform(0, self.crop_ratio) * img.shape[0])
        maxy = img.shape[0] - ye

        crop_img = img[ys:maxy, xs:maxx]

        normo = math.sqrt(img.shape[0] * img.shape[0] +
                          img.shape[1] * img.shape[1])

        image_size = (crop_img.shape[1], crop_img.shape[0])
        normo2 = math.sqrt(image_size[1] * image_size[1] +
                           image_size[0] * image_size[0])

        o_size = (img.shape[1], img.shape[0])

        gt_out = []
        for gt_no in range(len(word_gto)):  #TODO - remove loop ... use numpy

            gt = word_gto[gt_no]

            gtbox = ((gt[0] * o_size[0], gt[1] * o_size[1]),
                     (gt[2] * normo, gt[3] * normo), gt[4] * 180 / 3.14)
            gtbox = cv2.boxPoints(gtbox)
            gtbox = np.array(gtbox, dtype="float")

            gtbox[:, 0] -= xs
            gtbox[:, 1] -= ys

            gtbox[gtbox[:, 0] < 0, 0] = 0
            gtbox[gtbox[:, 1] < 0, 1] = 0

            gtbox[gtbox[:, 0] > maxx, 0] = maxx
            gtbox[gtbox[:, 1] > maxy, 1] = maxy

            dh = gtbox[0, :] - gtbox[1, :]
            dw = gtbox[1, :] - gtbox[2, :]

            centerx = np.sum(gtbox[:, 0]) / float(gtbox.shape[0])
            centery = np.sum(gtbox[:, 1]) / float(gtbox.shape[0])

            h = math.sqrt(dh[0] * dh[0] + dh[1] * dh[1]) / normo2
            w = math.sqrt(dw[0] * dw[0] + dw[1] * dw[1]) / normo2

            if w * normo2 < 2 or h * normo2 < 2 or np.isinf(w) or np.isinf(h):
                #print("warn: removig too small gt {0}".format(gt))
                continue

            gt[0] = centerx / image_size[0]
            gt[1] = centery / image_size[1]
            gt[2] = w
            gt[3] = h

            gt[4] = math.atan2((gtbox[2, 1] - gtbox[1, 1]),
                               (gtbox[2, 0] - gtbox[1, 0]))

            if False:
                draw_box_points(crop_img,
                                np.array(gtbox, dtype="int"),
                                color=(0, 255, 0))

                gtbox2 = ((gt[0] * image_size[0], gt[1] * image_size[1]),
                          (gt[2] * normo2, gt[3] * normo2), gt[4] * 180 / 3.14)
                gtbox2 = cv2.boxPoints(gtbox2)
                gtbox2 = np.array(gtbox2, dtype="float")

                draw_box_points(crop_img,
                                np.array(gtbox2, dtype="int"),
                                color=(0, 255, 0))
                cv2.imshow('c2', crop_img)

            gt_out.append(gt)

        #cv2.imshow('crop_img', crop_img)
        #cv2.waitKey(0)
        return (crop_img, gt_out)