def viterbi_post_process(self, img_arr, results):
    '''Go through all results and attempt to correct invalid syllables'''
    final = [[] for i in range(len(results))]
    for i, line in enumerate(results):
        syllable = []
        for j, char in enumerate(line):
            # A tsek, shad, space, non-word-part char, or end of line closes the syllable
            if char[-1] in u'་། ' or not word_parts_set.intersection(char[-1]) or j == len(line)-1:
                if syllable:
                    syl_str = ''.join(s[-1] for s in syllable)
                    if is_non_std(syl_str) and syl_str not in syllables:
                        print syl_str, 'HAS PROBLEMS. TRYING TO FIX'
                        bx = combine_many_boxes([ch[0:4] for ch in syllable])
                        bx = list(bx)
                        arr = img_arr[bx[1]:bx[1]+bx[3], bx[0]:bx[0]+bx[2]]
                        arr = fadd_padding(arr, 3)
                        try:
                            # Re-recognize the syllable image in isolation
                            temp_dir = tempfile.mkdtemp()
                            tmpimg = os.path.join(temp_dir, 'tmp.tif')
                            Image.fromarray(arr*255).convert('L').save(tmpimg)
                            pgrec = PageRecognizer(tmpimg,
                                Config(line_break_method='line_cut', page_type='book',
                                       postprocess=False, viterbi_postprocessing=True,
                                       clear_hr=False, detect_o=False))
                            prob, hmm_res = pgrec.recognize_page()
                            os.remove(tmpimg)
                            os.removedirs(temp_dir)
                        except TypeError:
                            print 'HMM run exited with an error.'
                            prob = 0
                            hmm_res = ''
                        logging.info(u'VPP Correction: %s\t%s' % (syl_str, hmm_res))
                        if prob == 0 and hmm_res == '':
                            print 'hit problem. using unmodified output'
                            for s in syllable:
                                final[i].append(s)
                        else:
                            bx.append(prob)
                            bx.append(hmm_res)
                            final[i].append(bx)
                    else:
                        for s in syllable:
                            final[i].append(s)
                final[i].append(char)
                syllable = []
            else:
                syllable.append(char)
        if syllable:
            for s in syllable:
                final[i].append(s)
    return final
def viterbi_post_process(img_arr, results):
    '''Go through all results and attempt to correct invalid syllables'''
    final = [[] for i in range(len(results))]
    for i, line in enumerate(results):
        syllable = []
        for j, char in enumerate(line):
            if char[-1] in u'་། ' or not word_parts.intersection(char[-1]) or j == len(line)-1:
                if syllable:
                    syl_str = ''.join(s[-1] for s in syllable)
                    if is_non_std(syl_str) and syl_str not in syllables:
                        print syl_str, 'HAS PROBLEMS. TRYING TO FIX'
                        bx = combine_many_boxes([ch[0:4] for ch in syllable])
                        bx = list(bx)
                        arr = img_arr[bx[1]:bx[1]+bx[3], bx[0]:bx[0]+bx[2]]
                        arr = fadd_padding(arr, 3)
                        try:
                            prob, hmm_res = main(arr,
                                Config(line_break_method='line_cut', page_type='book',
                                       postprocess=False, viterbi_postprocess=True,
                                       clear_hr=False),
                                page_info={'flname': ''})
                        except TypeError:
                            print 'HMM run exited with an error.'
                            prob = 0
                            hmm_res = ''
                        # corrections[syl_str].append(hmm_res)
                        logging.info(u'VPP Correction: %s\t%s' % (syl_str, hmm_res))
                        if prob == 0 and hmm_res == '':
                            print 'hit problem. using unmodified output'
                            for s in syllable:
                                final[i].append(s)
                        else:
                            bx.append(prob)
                            bx.append(hmm_res)
                            final[i].append(bx)
                    else:
                        for s in syllable:
                            final[i].append(s)
                final[i].append(char)
                syllable = []
            else:
                syllable.append(char)
        if syllable:
            for s in syllable:
                final[i].append(s)
    return final
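# --- Illustrative usage sketch (assumption, not part of the original module) ---
# Both viterbi_post_process variants above expect `results` to be a list of
# lines, each line a list of per-character entries whose first four values are
# the bounding box (x, y, w, h) and whose last value is the recognized
# character. `img_arr` is the binarized page (background = 1, ink = 0), as
# implied by Image.fromarray(arr*255) above. The box values and probability
# field below are hypothetical:
#
#     results = [
#         [[10, 12, 28, 40, 0.91, u'ཀ'], [40, 12, 6, 6, 0.88, u'་']],  # one line
#     ]
#     corrected = viterbi_post_process(binary_page_array, results)
#     # entries of invalid syllables are replaced by a single combined box of
#     # the form [x, y, w, h, prob, corrected_string]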
def __init__(self, shapes, k):
    '''Cluster contour boxes into k text lines using KMeans on their top (or
    center) y-coordinates, derive breaklines and baselines from vertical pixel
    sums, and split any character that straddles two lines.'''
    from sklearn.cluster import KMeans
    self.shapes = shapes
    self.k = k
    self.page_array = shapes.img_arr
    if shapes.conf['line_cluster_pos'] == 'top':
        tops = array(shapes.get_tops(), dtype=float64)
    elif shapes.conf['line_cluster_pos'] == 'center':
        tops = array([t[1] + .5*shapes.char_mean for t in shapes.get_boxes()
                      if t[3] > 2*shapes.tsek_mean], dtype=float64)
    else:
        raise ValueError("The line_cluster_pos argument must be either 'top' or 'center'")
    tops.shape = (len(tops), 1)
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(tops)

    ## (debug) mark cluster centroids on the original image and show them
    # img_arr = shapes.img_arr.copy()
    # for centroid in kmeans.cluster_centers_:
    #     img_arr[centroid[0], :] = 0
    # import Image
    # Image.fromarray(img_arr*255).show()

    lines = [[] for i in range(k)]
    ind = shapes.get_indices()

    ### Assign char pointers (ind) to the appropriate line ###
    # [lines[kmeans.labels_[i]].append(ind[i]) for i in range(len(ind))]
    for i in range(len(ind)):
        lines[kmeans.predict(shapes.get_boxes()[ind[i]][1])[0]].append(ind[i])
    lines = [l for l in lines if l]
    self.k = len(lines)
    boxes = shapes.get_boxes()

    ### Sort indices so they are in order from top to bottom,
    ### using y from the first box in each line
    sort_inx = list(argsort([boxes[line[0]][1] for line in lines]))
    lines.sort(key=lambda line: boxes[line[0]][1])

    ### Get breaklines for splitting up lines ###
    ### Uses the topmost box in each line cluster to determine the breakline
    try:
        topmosts = [min([boxes[i][1] for i in line]) for line in lines]
    except ValueError:
        print 'failed to get topmosts...'
        raise
    vsums = self.page_array.sum(axis=1)
    breaklines = []
    delta = 25
    for c in topmosts:
        if c - delta < 0:
            lower = 0
        else:
            lower = c - delta
        e = argmax(vsums[lower:c+delta])
        c = c - delta + e
        if c < 0:
            c = 0
        breaklines.append(c)
    breaklines.append(self.page_array.shape[0])

    self.baselines = []
    for i, br in enumerate(breaklines[:-1]):
        try:
            baseline_area = vsums[br:breaklines[i+1]]
            if baseline_area.any():
                self.baselines.append(br + argmin(baseline_area))
            else:
                print i
                print 'No baseline info'
        except ValueError:
            print 'ValueError. exiting...HERE'
            import traceback; traceback.print_exc()
            raise

    final_ind = dict((i, []) for i in range(len(lines)))
    self.new_contours = {}
    for j, br in enumerate(breaklines[1:-1]):
        topcount = 0
        bottomcount = 0
        for i in lines[j]:
            # If a char extends into the next line, break it.
            # 253 is roughly global line-height avg + 1 std.
            # The following says a box/char must extend over the breakline by
            # a non-trivial amount (e.g. 30 px) and must itself be a tall-ish
            # box (roughly the height of an average line) to be broken.
            # if (bounding[i][1] + bounding[i][3]) - br >= 30 and bounding[i][3] > 205:
            if (boxes[i][1] + boxes[i][3]) - br >= 30 and \
               (boxes[i][1] + boxes[i][3]) - topmosts[j] > self.shapes.char_mean*2.85:
                chars = ones((boxes[i][3]+2, boxes[i][2]+2), dtype=uint8)
                contours = shapes.contours
                cv.drawContours(chars, [contours[i]], -1, 0, thickness=-1,
                                offset=(-boxes[i][0]+1, -boxes[i][1]+1))
                cv.dilate(chars, None, chars)
                y_offset = boxes[i][1]
                new_br = br - y_offset
                prd_cut = []

                ### Iterate through potential cut-points and cut where the
                ### top half has the highest probability that is not a tsek
                # print 'bottom bound cut point', int(.75*shapes.tsek_mean)
                for delta in range(-3, int(.75*shapes.tsek_mean), 1):
                    # for delta in range(-3, 100, 1):
                    cut_point = new_br + delta
                    # chars[cut_point, :] = 0
                    # import Image
                    # Image.fromarray(chars*255).show()
                    tchr = chars[:cut_point, :]
                    tchr = ftrim(tchr)
                    if not tchr.any():
                        continue
                    tchr = normalize_and_extract_features(tchr)
                    probs = cls.predict_proba(tchr)
                    max_prob_ind = argmax(probs)
                    chr = label_chars[max_prob_ind]
                    prd_cut.append((probs[0, max_prob_ind], chr, cut_point))

                prd_cut = [q for q in prd_cut if q[1] != u'་']
                try:
                    cut_point = max(prd_cut)[-1]
                except:
                    print ('No max prob for vertical char break, using default '
                           'breakline. Usually this means the top half of the '
                           'attempted segmentation looks like a tsek blob')
                    cut_point = br - boxes[i][1]

                ####### FOLLOWING NOT WORKING: ATTEMPTS TO GET A BETTER BREAKLINE
                # br2 = br - bounding[i][1]
                # csum = chars.sum(axis=1)
                # bzone = csum[br2-25:br2+40]
                # if bzone.any():
                #     br2 = np.argmax(bzone) + (br-25)
                # chars = chars*255
                # cv.line(chars, (0, br2), (chars.shape[1], br2), 0)
                # Image.fromarray(chars).save('/tmp/outt.tiff')
                # sys.exit()
                #############

                tarr = chars[:cut_point, :]
                tarr, top_offset = ftrim(tarr, new_offset=True)
                tarr = fadd_padding(tarr, 3)
                barr = chars[cut_point:, :]
                barr = ftrim(barr, sides='brt')
                barr = fadd_padding(barr, 3)
                c1, h = cv.findContours(image=tarr, mode=cv.RETR_LIST,
                                        method=cv.CHAIN_APPROX_SIMPLE,
                                        offset=(boxes[i][0]+top_offset['left'], boxes[i][1]))
                c1 = c1[argmax([len(t) for t in c1])]  # use the most complex contour
                bnc1 = cv.boundingRect(c1)
                c2, h = cv.findContours(barr, mode=cv.RETR_LIST,
                                        method=cv.CHAIN_APPROX_SIMPLE,
                                        offset=(boxes[i][0]-3, boxes[i][1]+cut_point-3))
                c2 = c2[argmax([len(t) for t in c2])]
                bnc2 = cv.boundingRect(c2)
                topbox_name = 't%d_%d' % (j, topcount)
                final_ind[j].append(topbox_name)
                self.new_contours[topbox_name] = (bnc1, c1)
                topcount += 1
                if bnc2[-1] > 8:  # only add bottom contour if not trivially small
                    bottombox_name = 'b%d_%d' % (j, bottomcount)
                    final_ind[j+1].append(bottombox_name)
                    self.new_contours[bottombox_name] = (bnc2, c2)
                    bottomcount += 1
            else:
                final_ind[j].append(i)

    # Don't forget to include the last line
    final_ind[len(lines)-1].extend(lines[len(lines)-1])
    self.lines_chars = final_ind

    cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.small_contour_indices]
    char_tops = zip(cctops, self.shapes.small_contour_indices)
    char_tops.sort(key=lambda x: x[0])
    sorted_indices = [i[1] for i in char_tops]
    _line_insert_indxs = []
    _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines])
    self.small_cc_lines_chars = []
    if not _line_insert_indxs:
        sys.exit()
    for i, l in enumerate(_line_insert_indxs[:-1]):
        self.small_cc_lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]])
    self.small_cc_lines_chars.append(sorted_indices[_line_insert_indxs[-1]:])
    self.small_cc_lines_chars = [self.small_cc_lines_chars[i] for i in
                                 range(len(self.lines_chars)) if self.lines_chars[i]]

    cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.emph_symbols]
    char_tops = zip(cctops, self.shapes.emph_symbols)
    char_tops.sort(key=lambda x: x[0])
    empred = [kmeans.predict(shapes.get_boxes()[i][1])[0] for i in self.shapes.emph_symbols]
    self.emph_lines = [[] for i in range(k)]
    for nn, e in enumerate(empred):
        self.emph_lines[sort_inx.index(e)].append(self.shapes.emph_symbols[nn])

    if self.shapes.detect_o:
        cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.naros]
        char_tops = zip(cctops, self.shapes.naros)
        char_tops.sort(key=lambda x: x[0])
        sorted_indices = [i[1] for i in char_tops]
        _line_insert_indxs = []
        _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines])
        if not _line_insert_indxs:
            sys.exit()
        self.line_naros = []
        for i, l in enumerate(_line_insert_indxs[:-1]):
            self.line_naros.append(sorted_indices[l:_line_insert_indxs[i+1]])
        self.line_naros.append(sorted_indices[_line_insert_indxs[-1]:])
        self.line_naros = [self.line_naros[i] for i in range(len(self.lines_chars))
                           if self.lines_chars[i]]
        self.line_naro_spans = []
        for ll, mm in enumerate(self.line_naros):
            thisline = []
            for nn, naro in enumerate(mm):
                box = self.get_box(naro)
                thisline.append(box)
            thisline.sort(key=lambda x: x[0])
            self.line_naros[ll].sort(key=lambda x: self.get_box(x)[0])
            self.line_naro_spans.append(thisline)

    if self.shapes.low_ink:
        cctops = [lib[1] for lib in self.shapes.low_ink_boxes]
        char_tops = zip(cctops, self.shapes.low_ink_boxes)
        char_tops.sort(key=lambda x: x[0])
        sorted_indices = [i[1] for i in char_tops]
        _line_insert_indxs = []
        _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines])
        self.low_ink_boxes = []
        if not _line_insert_indxs:
            sys.exit()
        for i, l in enumerate(_line_insert_indxs[:-1]):
            self.low_ink_boxes.append(sorted_indices[l:_line_insert_indxs[i+1]])
        self.low_ink_boxes.append(sorted_indices[_line_insert_indxs[-1]:])
        self.low_ink_boxes = [self.low_ink_boxes[i] for i in range(len(self.lines_chars))
                              if self.lines_chars[i]]
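# --- Sketch: how breaklines partition sorted components into lines ---
# The small-contour, naro, and low-ink bookkeeping above all use the same
# pattern: sort components by their top y, then bisect_right against each
# breakline y-value to find where that line's slice begins. A minimal,
# self-contained illustration (the y-values below are hypothetical):
#
#     from bisect import bisect_right
#     char_tops = [(5, 'a'), (8, 'b'), (52, 'c'), (60, 'd'), (110, 'e')]
#     breaklines = [0, 50, 100, 150]   # page divided into 3 horizontal bands
#     cuts = [bisect_right(char_tops, (br - 1,)) for br in breaklines]
#     # cuts == [0, 2, 4, 5]: components 0-1 fall in line 1,
#     # components 2-3 in line 2, component 4 in line 3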
def construct_vector_set_experimental(self):
    '''Build per-line feature vectors and boxes. Boxes wider than roughly one
    character are broken at candidate cut points and the best-scoring
    segmentation (classifier + Viterbi) is kept.'''
    NINF = -np.inf
    final_box_info = CombineBoxesForPage(self.line_info)
    self.final_box_info = final_box_info
    final_boxes = final_box_info.final_boxes
    final_indices = final_box_info.final_indices
    scales = final_box_info.transitions
    self.vectors = [[] for i in range(self.line_info.k)]
    self.new_boxes = [[] for i in range(self.line_info.k)]
    # cur_mean = self.final_box_info.char_mean
    cur_std = self.final_box_info.char_std
    BREAKWIDTH = self.breakwidth
    rbfcls = self.line_info.rbfcls
    for l in range(len(final_indices)):  # for each line
        try:
            scale_l = scales[l]
        except:
            print 'ERROR AT ', l, len(scales)
            raise
        char_mean_int = floor(final_box_info.char_mean)
        char_std_int = ceil(final_box_info.char_std)
        try:
            lb = range(len(final_indices[l]))
        except IndexError:
            print 'index error'
            continue
        segmented = 0
        for i in lb:  # for each line box
            ## New draw, takes into account tree hierarchy of contours
            x, y, w, h = final_boxes[l][i]
            letter = ones((h, w), dtype=uint8)
            for k in final_indices[l][i]:
                if not isinstance(k, str):
                    letter = self.line_info.shapes.draw_contour_and_children(
                        k, char_arr=letter, offset=(-x, -y))
                else:
                    cv.drawContours(letter, [self.line_info.get_contour(k)], -1, 0,
                                    thickness=-1, offset=(-x, -y))
            letter = cv.resize(letter, dsize=(0, 0), fx=scale_l, fy=scale_l)
            if letter.shape[1] >= (final_box_info.char_mean +
                                   BREAKWIDTH * final_box_info.char_std):
                # if a box is too large, break it
                # segmented += 1
                sw = w * scale_l
                sh = h * scale_l
                vsum = letter.sum(axis=0)
                chars = sw // (final_box_info.char_mean -
                               1.5 * final_box_info.char_std)  # important, floor division
                if 10.0 > chars > 1.0:  # Assume chars-to-be-broken don't span > 10
                    # if chars:
                    w = sw
                    h = sh
                    best_box_dim = []
                    best_prob = 0.0
                    best_seq = None
                    ## Iterate through a range of variable chars if chars is
                    ## greater than 2. This allows potential breaks for
                    ## chars-1, chars-2
                    # all_choices = []
                    for chars in range(int(chars), 1, -1):
                        for z in range(0, 21, 2):
                            segs = []
                            prev_breakline = 0
                            for pos in range(int(chars - 1)):
                                if char_mean_int - z >= 0:
                                    upper_range = [
                                        int(np.round((pos + 1) * (char_mean_int - z))),
                                        int(np.round((pos + 1) * (char_mean_int + z)))]
                                    vsum_range = vsum[upper_range[0]:upper_range[1]]
                                    if vsum_range.any():
                                        breakline = int(np.round(
                                            (pos + 1) * (char_mean_int - z) +
                                            argmax(vsum_range)))
                                    else:
                                        breakline = None
                                    if breakline:
                                        sg = letter[:, prev_breakline:breakline]
                                        prev_breakline = breakline
                                    else:
                                        sg = letter[:,
                                            int(np.round(pos * (char_mean_int - z))):
                                            int(np.round((pos + 1) * (char_mean_int - z)))]
                                        prev_breakline = int(np.round(
                                            (pos + 1) * (char_mean_int - z)))
                                    segs.append(sg)
                            segs.append(letter[:,
                                int(np.round((chars - 1) * (char_mean_int - z))):])
                            segs = [fadd_padding(sg, 2) for sg in segs]
                            seg_ctrs = [cv.findContours(sg.copy(), mode=cv.RETR_CCOMP,
                                                        method=cv.CHAIN_APPROX_SIMPLE)
                                        for sg in segs]
                            try:
                                seg_bxs = [[cv.boundingRect(k) for k in sgc[0]]
                                           for sgc in seg_ctrs]
                            except:
                                print sgc
                                raise
                            bxs = []
                            nsegs = []
                            prev_w = 0
                            for zi, ltb in enumerate(seg_bxs):
                                seg = segs[zi]
                                for b in ltb:
                                    # blank out tsek-sized noise inside the segment
                                    if b[2] < (final_box_info.tsek_mean +
                                               4 * final_box_info.tsek_std) or \
                                       b[3] < final_box_info.tsek_mean + \
                                               4 * final_box_info.tsek_std:
                                        seg[b[1]-1:b[1]+b[3]+1, b[0]-1:b[0]+b[2]+1] = True
                                seg, ofst = ftrim(seg, new_offset=True)
                                bx = [x + prev_w + (ofst['left'] / scale_l),
                                      y + (ofst['top'] / scale_l),
                                      seg.shape[1] / scale_l,
                                      seg.shape[0] / scale_l]
                                prev_w += seg.shape[1] / scale_l
                                bxs.append(bx)
                                nsegs.append(seg)
                            xt = [normalize_and_extract_features(sg)
                                  for sg in nsegs if 0 not in sg.shape]
                            prd_probs = cls.predict_log_proba(xt)
                            prd_probs = prd_probs.astype(np.float32)
                            prob, prds = viterbi_cython(prd_probs.shape[0], n_states,
                                                        start_p, trans_p, prd_probs)
                            prob = np.exp(prob)
                            if prob > best_prob:
                                best_prob = prob
                                best_seq = prds
                                best_box_dim = bxs
                                best_xt = xt
                            if not best_box_dim:
                                best_prob = prob
                                best_seq = prds
                                best_box_dim = bxs
                                best_xt = xt
                    for u in range(len(best_seq)):
                        self.vectors[l].append(label_chars[best_seq[u]])
                        best_box = best_box_dim[u]
                        best_box = [int(np.round(ii)) for ii in best_box]
                        best_box.append(best_prob)
                        best_box.append(label_chars[best_seq[u]])
                        self.new_boxes[l].append(best_box)
                        try:
                            self.line_info.shapes.img_arr[
                                best_box[1]:best_box[1] + best_box[3],
                                best_box[0]:best_box[0] + best_box[2]] = 1
                        except:
                            pass
                else:
                    self.new_boxes[l].append([x, y, w, h])
                    vect = normalize_and_extract_features(letter)
                    self.vectors[l].append(vect)
            else:
                self.new_boxes[l].append([x, y, w, h])
                vect = normalize_and_extract_features(letter)
                self.vectors[l].append(vect)
    if not any(self.vectors):
        print 'no vectors'
        return
    else:
        if self.line_info.shapes.detect_o:
            # Attach naro (vowel o) marks to the boxes they overlap
            for i, l in enumerate(self.new_boxes):
                for n in self.line_info.line_naros[i]:
                    box = self.line_info.get_box(n)
                    x, y, w, h = box
                    r0 = x + w
                    for k, b in enumerate(l):
                        if ((b[2] + w) - abs(b[0] - x) - abs((b[0] + b[2]) - r0)) / \
                                (2 * float(min(w, b[2]))) > .8:
                            try:
                                nbox = list(combine_many_boxes([box, b]))
                            except:
                                print nbox[3]
                                raise
                            if isinstance(self.vectors[i][k], unicode):
                                self.vectors[i][k] += u'ོ'
                                nbox = b
                                nbox[-1] = self.vectors[i][k]
                            else:
                                probs = cls.predict_log_proba(self.vectors[i][k])
                                mx = np.argmax(probs)
                                prob = probs[0][mx]
                                mx = rbfcls.predict(self.vectors[i][k])[0]
                                ch = label_chars[mx] + u'ོ'
                                self.vectors[i][k] = ch
                                nbox.append(prob)
                                nbox.append(ch)
                            self.new_boxes[i][k] = nbox
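# --- Sketch: scoring a candidate segmentation with Viterbi ---
# For each candidate set of cut points, the segments' classifier
# log-probabilities (one row per segment, one column per class) are scored
# jointly with the character-transition model, and the segmentation with the
# highest path probability wins. A rough pure-Python equivalent of what
# viterbi_cython computes (toy dimensions, hypothetical model arrays):
#
#     import numpy as np
#     def viterbi(emissions, start_p, trans_p):   # emissions: log-probs, shape (T, S)
#         T, S = emissions.shape
#         score = np.log(start_p) + emissions[0]
#         path = np.zeros((T, S), dtype=int)
#         for t in range(1, T):
#             cand = score[:, None] + np.log(trans_p) + emissions[t]
#             path[t] = cand.argmax(axis=0)
#             score = cand.max(axis=0)
#         best = [score.argmax()]
#         for t in range(T - 1, 0, -1):
#             best.append(path[t][best[-1]])
#         return score.max(), best[::-1]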
def _sample_widths_method(self, chars, letter, letter_box, oo_scale_l, line_num=None):
    '''Segment a wide letter blob by repeatedly sampling candidate character
    widths from a Gaussian and keeping the segmentation with the best combined
    Viterbi + width log-probability.'''
    x, y, w, h = letter_box
    ################ defaults
    cur_mean = self.final_box_info.char_mean * .97
    cur_std = .295 * self.final_box_info.char_std
    #################
    best_prob = -np.inf
    if chars > 1:
        letter = cv.dilate(letter.copy(), None, iterations=1)
        padding_amount = 3
        for n in range(15):
            widths = [gauss(cur_mean, cur_std) for i in range(chars)]
            prev = 0
            vecs = []
            wdthprobs = 0
            boxes = []
            for i, val in enumerate(widths):
                if i == chars - 1:
                    end = letter.shape[1]
                else:
                    end = prev + val
                wdthprobs += gausslogprob(cur_mean, cur_std, end - prev)
                s = fadd_padding(letter[:, int(prev):int(end)], padding_amount)
                _, ctrs, hier = cv.findContours(s.copy(), mode=cv.RETR_TREE,
                                                method=cv.CHAIN_APPROX_NONE)
                bounding = map(boundingRect, ctrs)
                for k, b in enumerate(bounding):
                    if (b[2] < 23 or b[3] < 23) and hier[0][k][3] == 0:
                        s[b[1]-1:b[1]+b[3]+1, b[0]-1:b[0]+b[2]+1] = 1
                s = s[padding_amount:-padding_amount, padding_amount:-padding_amount]
                s, ofst = ftrim(s, new_offset=True)
                if 0 not in s.shape:
                    nnbox = [x + (prev + ofst['left']) * oo_scale_l,
                             y + (ofst['top'] * oo_scale_l),
                             s.shape[1] * oo_scale_l,
                             s.shape[0] * oo_scale_l]
                    if line_num is not None:
                        naro = self.line_info.check_naro_overlap(line_num, nnbox)
                        if naro != False:
                            naro_box = self.line_info.get_box(naro)
                            nnbox = combine_many_boxes([nnbox, naro_box])
                            ss = cv.resize(s, dsize=(0, 0), fx=oo_scale_l, fy=oo_scale_l)
                            ss = np.vstack((ones((nnbox[3] - ss.shape[0], ss.shape[1]),
                                                 dtype=ss.dtype), ss))
                            ss = hstack((ss, ones((ss.shape[0], nnbox[2] - ss.shape[1]),
                                                  dtype=ss.dtype)))
                            cv.drawContours(ss, [self.line_info.get_contour(naro)], -1, 0,
                                            thickness=-1,
                                            offset=(-naro_box[0], -naro_box[1]))
                            s = ss
                    vecs.append(normalize_and_extract_features(s))
                    boxes.append(nnbox)
                else:
                    break
                prev += val
            if not vecs:
                continue
            xn = len(vecs)
            vecs = np.array(vecs).reshape(xn, 346)  # 346 is len(vecs[0])
            probs = predict_log_proba(vecs)
            probs = probs.astype(np.float32)
            if n % 10 == 0 and n != 0:
                cur_mean = self.final_box_info.char_mean * (.97 - (3 * n / 1000.0))
            prob, prds = viterbi_cython(xn, n_states, start_p, trans_p, probs)
            prob = prob + wdthprobs
            if prob > best_prob:
                best_prob = prob
                best_prd = prds
                best_boxes = boxes
    else:
        best_boxes = [letter_box]
        probs = predict_log_proba(normalize_and_extract_features(letter))
        amx = probs[0].argmax()
        try:
            startprob = start_p[amx]
        except IndexError:
            startprob = 1e-10
        best_prob = probs[0][amx] + gausslogprob(cur_mean, cur_std,
                                                 letter_box[2] / oo_scale_l) + startprob
        best_prd = [amx]
    final_prob = best_prob
    res = []
    for i, val in enumerate(best_prd):
        best_boxes[i] = [int(np.round(k)) for k in best_boxes[i]]
        best_boxes[i].extend([float(np.exp(final_prob)), label_chars[val]])
        res.append(best_boxes[i])
    return (final_prob, res)
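# --- Sketch: the width-sampling idea in _sample_widths_method ---
# When a merged blob is believed to contain `chars` characters, candidate
# widths are drawn from a Gaussian around the page's mean character width.
# Each draw is scored by classifier + Viterbi probability plus the
# log-likelihood of the sampled widths, and the best-scoring draw is kept.
# Minimal illustration of the sampling step (the numbers are hypothetical):
#
#     from random import gauss
#     char_mean, char_std = 38.0 * .97, 11.0 * .295
#     chars = 3
#     widths = [gauss(char_mean, char_std) for _ in range(chars)]
#     # e.g. widths == [35.2, 39.8, 36.1]; cut points are the running sums,
#     # and the final segment always extends to the right edge of the blob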