Example #1
uidx = 0     # batch counter
loss_s = 0.  # running loss
ud_s = 0     # time spent training the current epoch
validFreq = -1
saveFreq = -1
sampleFreq = -1
dispFreq = 100
if validFreq == -1:
    validFreq = len(train)
if saveFreq == -1:
    saveFreq = len(train)
if sampleFreq == -1:
    sampleFreq = len(train)

# initialize model
WAP_model = Encoder_Decoder(params)
if init_param_flag:
    WAP_model.apply(weight_init)
if multi_gpu_flag:
    WAP_model = nn.DataParallel(WAP_model, device_ids=[0, 1])
WAP_model.cuda()

# print model's parameters
model_params = WAP_model.named_parameters()
for k, v in model_params:
    print(k)

# loss function
criterion = torch.nn.CrossEntropyLoss(reduction='none')  # per-token losses ('reduce=False' is the deprecated spelling)
# optimizer
optimizer = optim.Adadelta(WAP_model.parameters(),
                           lr=lrate,
                           eps=my_eps,
                           weight_decay=decay_c)  # lrate, my_eps, decay_c: hyperparameters assumed defined earlier in the script

Example #2
def main(model_path, dictionary_target, fea, latex, saveto, output, beam_k=5):
    # model architecture
    params = {}
    params['n'] = 256
    params['m'] = 256
    params['dim_attention'] = 512
    params['D'] = 684
    params['K'] = 111
    params['growthRate'] = 24
    params['reduction'] = 0.5
    params['bottleneck'] = True
    params['use_dropout'] = True
    params['input_channels'] = 1

    # load model
    model = Encoder_Decoder(params)
    model.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    model.cuda()

    # load dictionary
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    # load data
    test, test_uid_list = dataIterator(fea,
                                       latex,
                                       worddicts,
                                       batch_size=8,
                                       batch_Imagesize=500000,
                                       maxlen=20000,
                                       maxImagesize=500000)

    # testing
    model.eval()
    with torch.no_grad():
        fpp_sample = open(saveto, 'w')
        test_count_idx = 0
        print('Decoding ... ')
        for x, y in test:
            for xx in x:
                print('%d : %s' %
                      (test_count_idx + 1, test_uid_list[test_count_idx]))
                xx_pad = xx.astype(np.float32) / 255.
                xx_pad = torch.from_numpy(
                    xx_pad[None, :, :, :]).cuda()  # (1,1,H,W)
                sample, score = gen_sample(model,
                                           xx_pad,
                                           params,
                                           False,
                                           k=beam_k,
                                           maxlen=1000)
                # length-normalize the scores and keep the lowest-cost hypothesis
                score = score / np.array([len(s) for s in sample])
                ss = sample[score.argmin()]
                # write decoding results
                fpp_sample.write(test_uid_list[test_count_idx])
                test_count_idx = test_count_idx + 1
                # symbols (without <eos>)
                for vv in ss:
                    if vv == 0:  # <eos>
                        break
                    fpp_sample.write(' ' + worddicts_r[vv])
                fpp_sample.write('\n')
    fpp_sample.close()
    print('test set decode done')
    os.system('python compute-wer.py ' + saveto + ' ' + latex + ' ' + output)
    fpp = open(output)
    stuff = fpp.readlines()
    fpp.close()
    m = re.search('WER (.*)\n', stuff[0])
    test_per = 100. * float(m.group(1))
    m = re.search('ExpRate (.*)\n', stuff[1])
    test_sacc = 100. * float(m.group(1))
    print('Test WER: %.2f%%, ExpRate: %.2f%%' % (test_per, test_sacc))
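
A minimal invocation sketch for this entry point; every path below is a hypothetical placeholder, not taken from the original example:

if __name__ == '__main__':
    main(model_path='./models/WAP_params.pkl',
         dictionary_target='./data/dictionary.txt',
         fea='./data/offline-test.pkl',
         latex='./data/test_caption.txt',
         saveto='./result/test_decode_result.txt',
         output='./result/test.wer',
         beam_k=10)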
Example #3
def test(text_detection_modelpara, ocr_modelpara, dictionary_target):
    # load net
    net = CRAFT()  # initialize

    print('Loading text detection model from checkpoint {}'.format(
        text_detection_modelpara))
    if args.cuda:
        net.load_state_dict(copyStateDict(
            torch.load(text_detection_modelpara)))
    else:
        net.load_state_dict(
            copyStateDict(
                torch.load(text_detection_modelpara, map_location='cpu')))

    if args.cuda:
        net = net.cuda()
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = False

    params = {}
    params['n'] = 256
    params['m'] = 256
    params['dim_attention'] = 512
    params['D'] = 684
    params['K'] = 5748
    params['growthRate'] = 24
    params['reduction'] = 0.5
    params['bottleneck'] = True
    params['use_dropout'] = True
    params['input_channels'] = 3
    params['cuda'] = args.cuda

    # load model
    OCR = Encoder_Decoder(params)
    if args.cuda:
        OCR.load_state_dict(copyStateDict(torch.load(ocr_modelpara)))
    else:
        OCR.load_state_dict(
            copyStateDict(torch.load(ocr_modelpara, map_location='cpu')))
    if args.cuda:
        #OCR = OCR.cuda()
        OCR = torch.nn.DataParallel(OCR)
        cudnn.benchmark = False

    OCR.eval()
    net.eval()

    # load dictionary
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk
    t = time.time()

    fontPIL = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'  # japanese font
    size = 40
    colorBGR = (0, 0, 255)

    paper = ET.Element('paper')
    paper.set('xmlns', "http://codh.rois.ac.jp/modern-magazine/")
    # load data
    for k, image_path in enumerate(image_list[:]):
        print("Test image {:d}/{:d}: {:s}".format(k + 1, len(image_list),
                                                  image_path),
              end='\r')
        res_img_file = result_folder + "res_" + os.path.basename(image_path)

        #print (res_img_file, os.path.basename(image_path), os.path.exists(res_img_file))
        #if os.path.exists(res_img_file): continue
        #image = imgproc.loadImage(image_path)
        '''image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        ret2,image = cv2.threshold(image,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
        height = image.shape[0]
        width = image.shape[1]
        scale = 1000.0/height
        H = int(image.shape[0] * scale)
        W = int(image.shape[1] * scale)
        image = cv2.resize(image , (W, H))
        print(image.shape, image_path)
        cv2.imwrite(image_path, image) 
        continue'''
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        h, w = image.shape[0], image.shape[1]
        print(image_path)
        page = ET.SubElement(paper, "page")
        page.set('file', os.path.basename(image_path).replace('.jpg', ''))
        page.set('height', str(h))
        page.set('width', str(w))
        page.set('dpi', str(100))
        page.set('number', str(1))

        bboxes, polys, score_text = test_net(net, image, args.text_threshold,
                                             args.link_threshold,
                                             args.low_text, args.cuda,
                                             args.poly)
        text = []
        locations = []
        for i, box in enumerate(bboxes):
            poly = np.array(box).astype(np.int32)
            min_x = np.min(poly[:, 0])
            max_x = np.max(poly[:, 0])
            min_y = np.min(poly[:, 1])
            max_y = np.max(poly[:, 1])
            if min_x < 0:
                min_x = 0
            if min_y < 0:
                min_y = 0

            #image = cv2.rectangle(image,(min_x,min_y),(max_x,max_y),(0,255,0),3)
            input_img = image[min_y:max_y, min_x:max_x]

            w = max_x - min_x + 1
            h = max_y - min_y + 1
            line = ET.SubElement(page, "line")
            line.set("x", str(min_x))
            line.set("y", str(min_y))
            line.set("height", str(h))
            line.set("width", str(w))
            if w < h:
                rate = 20.0 / w
                w = int(round(w * rate))
                h = int(round(h * rate / 20.0) * 20)
            else:
                rate = 20.0 / h
                w = int(round(w * rate / 20.0) * 20)
                h = int(round(h * rate))
            #print (w, h, rate)
            input_img = cv2.resize(input_img, (w, h))

            # RGB -> luminance (ITU-R BT.601 weights)
            mat = np.zeros([1, h, w], dtype='uint8')
            mat[0, :, :] = (0.299 * input_img[:, :, 0] +
                            0.587 * input_img[:, :, 1] +
                            0.114 * input_img[:, :, 2])

            xx_pad = mat.astype(np.float32) / 255.
            xx_pad = torch.from_numpy(xx_pad[None, :, :, :])  # (1,1,H,W)
            if args.cuda:
                xx_pad = xx_pad.cuda()  # .cuda() is not in-place; reassign the result
            with torch.no_grad():
                sample, score, alpha_past_list = gen_sample(OCR,
                                                            xx_pad,
                                                            params,
                                                            args.cuda,
                                                            k=10,
                                                            maxlen=600)
            score = score / np.array([len(s) for s in sample])
            ss = sample[score.argmin()]
            alpha_past = alpha_past_list[score.argmin()]
            result = ''
            i = 0
            location = []
            for vv in ss:

                if vv == 0:  # <eol>
                    break
                # attention at step i = cumulative map minus the previous cumulative map
                alpha = alpha_past[i]
                if i != 0:
                    alpha = alpha_past[i] - alpha_past[i - 1]
                (y, x) = np.unravel_index(np.argmax(alpha, axis=None),
                                          alpha.shape)
                #print (int(16* x /rate), int(16* y/rate) , chr(int(worddicts_r[vv],16)))
                location.append(
                    [int(16 * x / rate) + min_x,
                     int(16 * y / rate) + min_y])
                #image = cv2.circle(image,(int(16* x/rate) -  8 + min_x, int(16* y/rate) + 8 + min_y),25, (0,0,255), -1)

                result += chr(int(worddicts_r[vv], 16))
                '''char = ET.SubElement(line, "char") 
                char.set('num_cand', '1') 
                char.set('x', str(int(16* x/rate) -  8 + min_x)) 
                char.set('y', str(int(16* y/rate) + 8 + min_y)) 
                res = ET.SubElement(char, "result") 
                res.set('CC', str(100))
                res.text = chr(int(worddicts_r[vv],16))
                cand = ET.SubElement(char, "cand") 
                cand.set('CC', str(100))
                cand.text = chr(int(worddicts_r[vv],16))'''
                i += 1
            line.text = result
            text.append(result)
            locations.append(location)
            image = cv2_putText_1(img=image,
                                  text=result,
                                  org=(min_x, max_x, min_y, max_y),
                                  fontFace=fontPIL,
                                  fontScale=size,
                                  color=colorBGR)

        print('save image')
        # save score text
        filename, file_ext = os.path.splitext(os.path.basename(image_path))
        mask_file = result_folder + "/res_" + filename + '_mask.jpg'
        #cv2.imwrite(mask_file, score_text)
        file_utils.saveResult(image_path, image, polys, dirname=result_folder)

    xml_string = ET.tostring(paper, 'Shift_JIS')

    fout = codecs.open('./data/result.xml', 'w', 'shift_jis')
    fout.write(xml_string.decode('shift_jis'))
    fout.close()

    print("elapsed time : {}s".format(time.time() - t))
Example #4
def main(model_path, dictionary_target, dictionary_retarget, fea, output_path, k=5):
    # set parameters
    params = {}
    params['n'] = 256
    params['m'] = 256
    params['dim_attention'] = 512
    params['D'] = 684
    params['K'] = 106
    params['growthRate'] = 24
    params['reduction'] = 0.5
    params['bottleneck'] = True
    params['use_dropout'] = True
    params['input_channels'] = 1
    params['Kre'] = 7
    params['mre'] = 256

    maxlen = 300
    params['maxlen'] = maxlen

    # load model
    model = Encoder_Decoder(params)
    model.load_state_dict(
        torch.load(model_path, map_location=lambda storage, loc: storage))
    # enable CUDA
    model.cuda()

    # load source dictionary and invert
    worddicts = load_dict(dictionary_target)
    print('total chars', len(worddicts))
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    reworddicts = load_dict(dictionary_retarget)
    print('total relations', len(reworddicts))
    reworddicts_r = [None] * len(reworddicts)
    for kk, vv in reworddicts.items():
        reworddicts_r[vv] = kk

    valid, valid_uid_list = dataIterator_test(fea,
                                              worddicts,
                                              reworddicts,
                                              batch_size=8,
                                              batch_Imagesize=800000,
                                              maxImagesize=800000)

    # change model's mode to eval
    model.eval()

    valid_out_path = output_path + 'symbol_relation/'
    valid_malpha_path = output_path + 'memory_alpha/'
    if not os.path.exists(valid_out_path):
        os.mkdir(valid_out_path)
    if not os.path.exists(valid_malpha_path):
        os.mkdir(valid_malpha_path)
    valid_count_idx = 0
    print('Decoding ... ')
    ud_epoch = time.time()
    model.eval()
    with torch.no_grad():
        for x in valid:
            for xx in x:  # xx: one sample of the current batch (a numpy array)
                print('%d : %s' % (valid_count_idx + 1, valid_uid_list[valid_count_idx]))
                xx_pad = np.zeros((xx.shape[0], xx.shape[1], xx.shape[2]), dtype='float32')  # (1,height,width)
                xx_pad[:, :, :] = xx / 255.
                xx_pad = torch.from_numpy(xx_pad[None, :, :, :]).cuda()
                score, sample, malpha_list, relation_sample = \
                            gen_sample(model, xx_pad, params, gpu_flag=False, k=k, maxlen=maxlen)
                # sys.exit()
                if len(score) != 0:
                    score = score / np.array([len(s) for s in sample])
                    # relation_score = relation_score / np.array([len(r) for r in relation_sample])
                    min_score_index = score.argmin()
                    ss = sample[min_score_index]
                    rs = relation_sample[min_score_index]
                    mali = malpha_list[min_score_index]
                    fpp_sample = open(
                        valid_out_path + valid_uid_list[valid_count_idx] + '.txt', 'w')
                    file_malpha_sample = (valid_malpha_path +
                                          valid_uid_list[valid_count_idx] + '_malpha.txt')
                    for i, [vv, rv] in enumerate(zip(ss, rs)):
                        if vv == 0:
                            string = worddicts_r[vv] + '\tEnd\n'
                            fpp_sample.write(string)
                            break
                        else:
                            if i == 0:
                                string = worddicts_r[vv] + '\tStart\n'
                            else:
                                string = worddicts_r[vv] + '\t' + reworddicts_r[rv] + '\n'
                            fpp_sample.write(string)
                    np.savetxt(file_malpha_sample, np.array(mali))
                    fpp_sample.close()
                valid_count_idx = valid_count_idx + 1
    print('test set decode done')
    ud_epoch = (time.time() - ud_epoch) / 60.
    print('epoch cost time ... %.2f min' % ud_epoch)
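
The files written above are plain text; a sketch of reading one sample's result back (read_decode_result is a hypothetical helper, assuming the directory layout created above):

import numpy as np

def read_decode_result(uid, output_path):
    # Parse the tab-separated '<symbol>\t<relation/Start/End>' lines written above.
    pairs = []
    with open(output_path + 'symbol_relation/' + uid + '.txt') as f:
        for line in f:
            sym, rel = line.rstrip('\n').split('\t')
            pairs.append((sym, rel))
    # The memory-attention matrix was saved with np.savetxt.
    malpha = np.loadtxt(output_path + 'memory_alpha/' + uid + '_malpha.txt')
    return pairs, malpha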
Example #5
def main(model_path, dictionary_target, fea, latex, saveto, output, beam_k=5):
    # model architecture
    params = {}
    params['n'] = 256
    params['m'] = 256
    params['dim_attention'] = 512
    params['D'] = 684
    params['K'] = 5748
    params['growthRate'] = 24
    params['reduction'] = 0.5
    params['bottleneck'] = True
    params['use_dropout'] = True
    params['input_channels'] = 3

    # load model
    model = Encoder_Decoder(params)
    model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
    model.cuda()

    # load dictionary
    worddicts = load_dict(dictionary_target)
    worddicts_r = [None] * len(worddicts)
    for kk, vv in worddicts.items():
        worddicts_r[vv] = kk

    start_time = time.time()
    channels = 1
    folder = './kokumin/'
    out = './kokuminOut/'
    index = 0

    # testing
    model.eval()
    with torch.no_grad():
        for img_file in os.listdir(folder):
            if '.jpg' in img_file:
                label_file = folder + 'res_' + img_file.replace('jpg', 'txt')
                if not os.path.isfile(label_file):
                    continue
                out_file = out + img_file
                out_txtfile = out + img_file.replace('jpg', 'txt')
                img_file = folder + img_file
                #print img_file, label_file
                im = imread(img_file)
                arr = Image.fromarray(im).convert('RGB')
                draw = ImageDraw.Draw(arr)
   
                #print im.shape
                with open(label_file) as f:
                    BBs = f.readlines()
                BBs = [x.strip().split(',') for x in BBs]
                f = open(out_txtfile, 'w')
                for BB in BBs:
                    x1 = min(int(BB[0]), int(BB[2]), int(BB[4]), int(BB[6]))
                    y1 = min(int(BB[1]), int(BB[3]), int(BB[5]), int(BB[7]))
                    x2 = max(int(BB[0]), int(BB[2]), int(BB[4]), int(BB[6]))
                    y2 = max(int(BB[1]), int(BB[3]), int(BB[5]), int(BB[7]))
                    if x1 < 0: x1 = 0
                    if y1 < 0: y1 = 0

                    draw.rectangle((x1, y1, x2, y2), fill=None, outline=(255, 0, 0))

                    f.write(str(x1) + ',' + str(y1) + ',' + str(x2) + ',' + str(y2) + ',')
                    input_img = im[y1:y2, x1:x2]
                    w = x2 - x1 + 1
                    h = y2 - y1 + 1
                    #print x1, y1, x2, y2
                    #print w, h
                    if w < h:
                        rate = 20.0 / w
                        w = int(round(w * rate))
                        h = int(round(h * rate / 20.0) * 20)
                    else:
                        rate = 20.0 / h
                        w = int(round(w * rate / 20.0) * 20)
                        h = int(round(h * rate))
                    #print w, h
                    input_img = imresize(input_img, (h,w))

                    mat = np.zeros([channels, h, w], dtype='uint8')
                    mat[0, :, :] = input_img  # assumes the crop is already single-channel (grayscale)
                    #mat[0,:,:] =  0.299* input_img[:, :, 0] + 0.587 * input_img[:, :, 1] + 0.114 * input_img[:, :, 2]

                    xx_pad = mat.astype(np.float32) / 255.
                    xx_pad = torch.from_numpy(xx_pad[None, :, :, :]).cuda()  # (1,1,H,W)
                    sample, score, alpha_past_list = gen_sample(model, xx_pad, params, False, k=beam_k, maxlen=600)
                    score = score / np.array([len(s) for s in sample])
                    ss = sample[score.argmin()]
                    result = ''
                    for vv in ss:
                        if vv == 0: # <eol>
                            break
                        result += worddicts_r[vv] + ' '
                    print('result:', index, result)
                    index += 1  # running line counter
                    f.write(result + '\n')
                f.close()
                arr.save(out_file, "JPEG")
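
Note that imread and imresize here are presumably scipy.misc's, which were deprecated and later removed from SciPy; rough modern stand-ins (a sketch using imageio and Pillow):

import numpy as np
import imageio
from PIL import Image

def load_image(path):
    # Stand-in for scipy.misc.imread.
    return imageio.imread(path)

def resize_image(img, h, w):
    # Stand-in for scipy.misc.imresize(img, (h, w)); PIL's resize takes (width, height).
    return np.asarray(Image.fromarray(img).resize((w, h)))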
Example #6
KL_loss_s = 0.  # running KL loss
loss_s = 0.     # running loss
ud_s = 0        # time spent training the current epoch
validFreq = -1
saveFreq = -1
sampleFreq = -1
dispFreq = 100
if validFreq == -1:
    validFreq = len(train)
if saveFreq == -1:
    saveFreq = len(train)
if sampleFreq == -1:
    sampleFreq = len(train)

# initialize model
WAP_model = Encoder_Decoder(params)
if init_param_flag:
    WAP_model.apply(weight_init)
if multi_gpu_flag:
    WAP_model = nn.DataParallel(WAP_model, device_ids=[0, 1, 2, 3])
if reload_flag:
    WAP_model.load_state_dict(
        torch.load(saveto, map_location=lambda storage, loc: storage))
WAP_model.cuda()

# print model's parameters
model_params = WAP_model.named_parameters()
for k, v in model_params:
    print(k)

# loss function
criterion = torch.nn.CrossEntropyLoss(reduction='none')  # per-token losses, as in Example #1