def test_accuracy(t, clsf=None): '''Get accuracy score for a testset t''' if clsf: cls = clsf else: global cls y = tsets[t][:,0] x = tsets[t][:,1:] x3 = [] for j in x: j = ftrim(j.reshape((32,16)).astype(np.uint8)) x3.append(normalize_and_extract_features(j)) pred = cls.predict(x3) s = 0 for i, p in enumerate(pred): if float(p) == y[i]: s += 1.0 else: pass print 'correct', label_chars[y[i]], '||', label_chars[p], t #, max(cls.predict_proba(x3[i])[0]) score = s / len(y) return score
def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \
             page_type=None, flpath=None, detect_o=True,\
             clear_hr = False):
    '''Extract the contours of a page image and pre-sort them by kind.

    The contours returned by ``self._contours()`` are first restricted to
    the children of the "content parent" (the most frequent parent index in
    the contour hierarchy). Their widths are fed to ``self.char_gaussians``
    to obtain char/tsek width statistics, and each retained contour is then
    run through ``fast_cls.predict_proba`` and filed under one of
    ``emph_symbols``, ``naros``, ``small_contour_indices`` or ``indices``.

    Parameters:
    --------------------
    img_arr: page image array; stored as ``self.img_arr``. ``shape[1]`` is
        used as the page width and ``shape[0]`` as the page height.
    fast_cls: classifier providing ``predict`` and ``predict_proba``
    small_coef: stored as ``self.small_coef`` (original note: lower coef
        means more filtering, use 3 for nying gyud)
    low_ink: when true, small punctuation is not split off into
        ``small_contour_indices`` and ``self._low_ink_setting()`` is called
        at the end
    page_type: ``'pecha'`` selects the pecha layout path
    flpath: stored as ``self.flpath``
    detect_o: when true, contours predicted as the na-ro vowel are collected
        in ``self.naros`` instead of ``self.indices``
    clear_hr: when true, wide boxes near the top of the page cause the rows
        above their bottom edge to be overwritten with 1 (for pecha pages
        ``self.force_clear_hr()`` is called instead)
    '''
    #lower coef means more filtering USE 3 for nying gyud
    self.img_arr = img_arr
    self.page_type = page_type
    self.flpath = flpath
    self.low_ink = low_ink
    self.detect_o = detect_o
    # Feature vectors and (argmax, probabilities) per contour index, kept in
    # insertion order so later stages can reuse them without re-classifying.
    self.cached_features = OrderedDict()
    self.cached_pred_prob = OrderedDict()
    # RETR_CCOMP was previously tried for pecha pages; RETR_TREE is now used
    # for every page type.
    self._contour_mode = cv.RETR_TREE

    ### repeatedly called functions
    ones = np.ones
    uint8 = np.uint8
    # NOTE(review): `predict` is bound here but not called below.
    predict = fast_cls.predict
    predict_proba = fast_cls.predict_proba

    _, self.contours, self.hierarchy = self._contours()
    self.boxes = []
    self.indices = []
    self.small_coef = small_coef

    # Punctuation that is split off into small_contour_indices when narrow
    FILTERED_PUNC = (u'།', u'་', u']', u'[')

    self._set_shape_measurements()
    if page_type == 'pecha':
        if clear_hr:
            print 'Warning: clear_hr called on pecha format. For clearing text'
            self.force_clear_hr()
        self.set_pecha_layout()
        if self.indices:
            content_parent = int(
                statsmode([self.hierarchy[0][i][3] for i in self.indices])[0])
        else:
            # content_parent stays unbound here; it is only read inside the
            # loop over self.indices below, which is empty in this case.
            print 'no content found'
    else:
        ### Find the parent with the most children. Call it 'content_parent'
        content_parent = int(
            statsmode([hier[3] for hier in self.hierarchy[0]])[0])
        self.indices = self.get_indices()

    outer_contours = []
    outer_widths = []

    ## Iterate through all contours
    for i in self.indices:
        cbox = self.get_boxes()[i]
        x, y, w, h = cbox
        ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. Recently
        # added the len(indices) < 40 as a way to prevent exaggerated
        # filtering of small lines where gaussian width measures
        # are meaningless due to small sample size (too few contours).
        # Alternatives tried for the width limit: 3*self.char_mean and
        # .075*self.img_arr.shape[1].
        if self.hierarchy[0][i][3] == content_parent and (
                cbox[2] < .1 * self.img_arr.shape[1] or
                len(self.indices) < 40):
            outer_contours.append(i)
            outer_widths.append(cbox[2])
        else:
            # Report very wide rejected boxes as a fraction of page width
            if cbox[2] > .66 * self.img_arr.shape[1]:
                print cbox[2] / float(self.img_arr.shape[1])
            # A box spanning 66%-99.5% of the page width whose top lies in
            # the upper quarter of the page: blank every row from the top of
            # the page down to the bottom of that box.
            if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \
                    .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]:
                self.img_arr[0:cbox[1] + cbox[3], :] = 1

    width_measures = self.char_gaussians(outer_widths)

    # Publish the four width statistics as attributes of the same name
    for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                    width_measures):
        setattr(self, i, j)

    self.small_contour_indices = []
    self.indices = []  # Need to reset!
    self.emph_symbols = []
    self.naros = []

    for i in outer_contours:
        cbox = self.get_boxes()[i]
        # if small and has no children, put in small list (this could
        # backfire with false interiors e.g. from salt and pepper noise)
        ## NOTE (historical): previously small was defined as less than
        ## tsek_mean + 3 x tsek_std, later as char_mean - 2 x char_std; the
        ## condition currently in force is the tsek_mean * 3 test below.
        x, y, w, h = cbox
        tmparr = ones((h, w), dtype=uint8)
        tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))
        features = normalize_and_extract_features(tmparr)
        self.cached_features[i] = features
        prprob = predict_proba(features)

        mxinx = prprob.argmax()
        quick_prd = label_chars[mxinx]
        self.cached_pred_prob[i] = (mxinx, prprob[0])

        # u'—' was previously part of this set as well
        is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽'])

        if is_emph_symbol:
            self.emph_symbols.append(i)
            print 'EMPHSYMBOLFOUND', quick_prd
        elif quick_prd == u'ོ' and self.detect_o:
            self.naros.append(i)
        elif cbox[2] < 7:
            # Narrower than 7 px: dropped entirely (9 was tried before)
            continue
        elif (cbox[2] <= self.tsek_mean * 3 and
              quick_prd in FILTERED_PUNC ) and not self.low_ink: # default!!!
            # The "has no children" test (self.hierarchy[0][i][2] < 0) that
            # used to be part of this condition is disabled.
            self.small_contour_indices.append(i)
        else:
            self.indices.append(i)

    if self.detect_o:
        print 'pre-filtered na-ro vowel', len(self.naros), 'found'

    if self.low_ink:
        self._low_ink_setting()
def recognize_chars_probout(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data

    Small contours (e.g. tsek) and emphasis markers that were held back
    during segmentation are first re-inserted into each line by horizontal
    position; every entry is then classified with ``prd_prob`` unless it
    already carries a prediction.

    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    tsek_insert_method: only ``'baseline'`` triggers re-insertion of small
        contours; with any other value they are left out

    Returns:
    --------------
    results: list of lists containing [x,y,width, height, prob, unicode],
    specifying the coordinates of the bounding box of stack, it probability,
    and its unicode characters -- on each line of the page. A word gap is
    represented by the entry [-1,-1,-1,-1, 1.0, u' ']. Lines without
    vectors are skipped and produce no entry.

    Note that ``segmentation.vectors`` and ``segmentation.new_boxes`` are
    modified in place.
    '''
    results = []
    tsek_mean = segmentation.final_box_info.tsek_mean
    cached_features = segmentation.line_info.shapes.cached_features
    cached_pred_prob = segmentation.line_info.shapes.cached_pred_prob

    for l, vectors in enumerate(segmentation.vectors):
        if not vectors:
            print 'no vectors...'
            continue

        tmp_result = []
        new_boxes = segmentation.new_boxes[l]
        # NOTE(review): scale_w is not used below.
        scale_w = segmentation.final_box_info.transitions[l]
        small_chars = segmentation.line_info.small_cc_lines_chars[l]

        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]

        img_arr = segmentation.line_info.shapes.img_arr
        # Left x of every box, kept in step with new_boxes so that bisect
        # can find the insertion position of a small contour.
        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []

        # consider small char from end of line going backward. backward
        # useful for misplaced tsek often and maybe for TOC though should check
        for s in small_chars[::-1]:
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            try:
                # Reuse the prediction cached when the page shapes were built
                feature_vect = cached_features[s]
                inx, probs = cached_pred_prob[s]
                prob = probs[inx]
                prd = dig_to_char[inx]
            except:
                # Anything going wrong with the cache lookup: redraw the
                # contour and classify it from scratch.
                cnt = segmentation.line_info.shapes.contours[s]
                char_arr = np.ones((h,w), dtype=np.uint8)
                offset = (-x, -y)
                drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
                feature_vect = normalize_and_extract_features(char_arr)
                prd, prob = prd_prob(feature_vect)

            insertion_pos = bisect(left_edges, x)

            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                # insertion is at or near end of line and needs more left
                # neighbors to compensate for there being less chars to define the baseline
                left_items = 12
            elif insertion_pos <= len(new_boxes):
                # same as above except at front of line
                # NOTE(review): this condition holds for every position not
                # caught by the branch above, so right_items becomes 12
                # everywhere except at the end of the line -- confirm that
                # a "front of line" test was not intended.
                right_items = 12

            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0

                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)

                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right

                # Vertical extent of the neighbouring boxes
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    if j[1] + j[3] > bottom:
                        bottom = j[1] + j[3]
                local_span = bottom - top

                top, bottom, left, right, mid = [int(np.round(ff)) for ff in [top, bottom, left, right, mid]]

                if prd == u'་' and local_span > 0:
                    # Local baseline = row with the smallest pixel sum on
                    # each side of the insertion point
                    left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                    right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                    local_baseline_left = top + left_sum.argmin()
                    if mid != right:
                        local_baseline_right = top + right_sum.argmin()
                    else:
                        local_baseline_right = local_baseline_left

                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or
                        (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])) or (insertion_pos == len(vectors)): #or
                        ### Account for fact that the placement of a tsek could be
                        # before or after its indicated insertion pos
                        ### experimental.. only need with certain fonts e.g. "book 6"
                        ## in samples
                        if insertion_pos <= len(new_boxes):
                            # NOTE(review): for insertion_pos == 0 this
                            # reads new_boxes[-1], i.e. the last box.
                            prev_box = new_boxes[insertion_pos-1]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1

                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        new_boxes[insertion_pos].append(prob)
                        new_boxes[insertion_pos].append(prd)
                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])

                    elif (bx[1] >= top -.25*local_span and bx[1] + bx[3] <=
                          bottom + local_span*.25) or (insertion_pos == len(vectors)):
                        # Tsek not on the baseline but within the line's
                        # vertical span (with 25% slack): still inserted
                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        new_boxes[insertion_pos].append(prob)
                        new_boxes[insertion_pos].append(prd)
                        left_edges.insert(insertion_pos, bx[0])
                else:
                    # Anything that is not a tsek is always inserted
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    new_boxes[insertion_pos].append(prob)
                    new_boxes[insertion_pos].append(prd)
                    left_edges.insert(insertion_pos, bx[0])

        # Emphasis markers are inserted by left edge using their cached
        # prediction; their boxes already carry [prob, char].
        for em in emph_markers:
            bx = segmentation.line_info.shapes.get_boxes()[em]
            mkinx = segmentation.line_info.shapes.cached_pred_prob[em][0]
            marker = dig_to_char[mkinx]
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1][mkinx]

            bx = list(bx)
            x,y,w,h = bx
            bx.append(marker_prob)
            bx.append(marker)
            insertion_pos = bisect(left_edges, x)
            vectors.insert(insertion_pos, marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])

        # NOTE(review): `i` is rebound by the loop below and not read after
        # it in this function.
        if len(vectors) == 1: i = -1

        # NOTE(review): skip_next_n is never set to a non-zero value here.
        skip_next_n = 0
        for i, v in enumerate(vectors[:-1]):

            if skip_next_n:
                skip_next_n -= 1
                continue

            # Gap to the next box of at least two tsek widths: emit the box
            # followed by a space entry.
            if new_boxes[i+1][0] - (new_boxes[i][0] + new_boxes[i][2]) >= 2*tsek_mean:
                if not len(new_boxes[i]) == 6 and not isinstance(v, unicode):
                    prd, prob = prd_prob(v)
                else:
                    if len(new_boxes[i]) == 6:
                        prob, prd = new_boxes[i][4:]
                    else:
                        ## v is unicode stack, likely from segmentation step
                        prd = v
                        prob = .95 # NEED ACTUAL PROB

                # NOTE(review): a box that already had 6 entries gets prob
                # and prd appended a second time here -- confirm intended.
                new_boxes[i].append(prob)
                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
                tmp_result.append([-1,-1,-1,-1, 1.0, u' '])
            else:
                if hasattr(v, 'dtype'):
                    # Feature vector: classify it now
                    try:
                        prd, prob = prd_prob(v)
                    except:
                        # NOTE(review): on failure prd/prob keep the values
                        # from the previous iteration.
                        print v

                    new_boxes[i].append(prob)
                    new_boxes[i].append(prd)
                else:
                    if len(new_boxes[i]) == 6:
                        prob, prd = new_boxes[i][4:]
                    else:
                        prd = v

                if len(new_boxes[i]) < 6:
                    try:
                        new_boxes[i].append(prob)
                    except:
                        new_boxes[i].append(1)
                    new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])

        # The last entry of the line is handled separately (no right gap)
        if hasattr(vectors[-1], 'dtype'):
            prd, prob = prd_prob(vectors[-1])
            new_boxes[-1].append(prob)
            new_boxes[-1].append(prd)
        tmp_result.append(new_boxes[-1])
        results.append(tmp_result)
    return results
def recognize_chars_hmm(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data

    Small contours and emphasis markers are re-inserted into each line by
    horizontal position. Runs of consecutive feature vectors are then
    decoded together: a run of length one takes the argmax of
    ``predict_log_proba``, longer runs are decoded with ``viterbi_cython``
    using the module-level ``start_p`` and ``trans_p``.

    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    tsek_insert_method: only ``'baseline'`` triggers re-insertion of small
        contours; with any other value they are left out

    Returns:
    --------------
    results: list of lists containing [x,y,width, height, prob, unicode],
    specifying the coordinates of the bounding box of stack, it probability,
    and its unicode characters -- on each line of the page. A word gap is
    represented by the entry [-1,-1,-1,-1, 1.0, u' ']. Lines without
    vectors are skipped and produce no entry.

    Note that ``segmentation.vectors`` and ``segmentation.new_boxes`` are
    modified in place.
    '''
    n_states = trans_p.shape[0]

    results = []
    tsek_mean = segmentation.final_box_info.tsek_mean
    cached_features = segmentation.line_info.shapes.cached_features
    cached_pred_prob = segmentation.line_info.shapes.cached_pred_prob

    for l, vectors in enumerate(segmentation.vectors):
        if not vectors:
            print 'no vectors...'
            continue

        tmp_result = []
        new_boxes = segmentation.new_boxes[l]
        small_chars = segmentation.line_info.small_cc_lines_chars[l]

        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]

        img_arr = segmentation.line_info.shapes.img_arr
        # Left x of every box, kept in step with new_boxes so that bisect
        # can find the insertion position of a small contour.
        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []

        # consider small char from end of line going backward. backward
        # useful for misplaced tsek often and maybe for TOC though should check
        for s in small_chars[::-1]:
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            try:
                # Reuse the prediction cached when the page shapes were built
                feature_vect = cached_features[s]
                inx, probs = cached_pred_prob[s]
                prob = probs[inx]
                prd = dig_to_char[inx]
            except:
                # Anything going wrong with the cache lookup: redraw the
                # contour and classify it from scratch.
                cnt = segmentation.line_info.shapes.contours[s]
                char_arr = np.ones((h,w), dtype=np.uint8)
                offset = (-x, -y)
                drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
                feature_vect = normalize_and_extract_features(char_arr)
                prd, prob = prd_prob(feature_vect)

            insertion_pos = bisect(left_edges, x)

            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                # at or near the end of the line: look further left
                left_items = 12
            elif insertion_pos <= len(new_boxes):
                # same as above except at front of line
                # NOTE(review): this condition holds for every position not
                # caught by the branch above, so right_items becomes 12
                # everywhere except at the end of the line -- confirm that
                # a "front of line" test was not intended.
                right_items = 12

            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0

                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)
                ####

                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right

                # Vertical extent of the neighbouring boxes
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    try:
                        if j[1] + j[3] > bottom:
                            bottom = j[1] + j[3]
                    except IndexError:
                        # Dump the offending boxes before re-raising
                        print new_boxes[lower:upper]
                        print j
                        raise

                local_span = bottom - top

                # Local baseline = row with the smallest pixel sum on each
                # side of the insertion point
                left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                try:
                    local_baseline_left = top + left_sum.argmin()
                except:
                    # argmin failed (presumably an empty slice): fall back
                    # to the top of the local span
                    local_baseline_left = top

                if mid != right:
                    local_baseline_right = top + right_sum.argmin()
                else:
                    local_baseline_right = local_baseline_left

                if prd == u'་' and local_span > 0:
                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or
                        (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])) or (insertion_pos == len(vectors)): #or
                        if insertion_pos <= len(new_boxes):
                            # NOTE(review): for insertion_pos == 0 this
                            # reads new_boxes[-1], i.e. the last box.
                            prev_box = new_boxes[insertion_pos-1]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1

                        # Here the box itself (a list already carrying
                        # [prob, char]) is stored in `vectors`; the HMM
                        # phase below treats list entries as already
                        # recognized.
                        new_boxes.insert(insertion_pos, bx)
                        bx.append(prob)
                        bx.append(prd)
                        vectors.insert(insertion_pos, bx)
                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])
                    elif ((bx[1] >= top -.25*local_span and bx[1] + bx[3] <=
                           bottom + local_span*.25) or
                          (insertion_pos == len(vectors))) and bx[1] - local_baseline_left < 2*tsek_mean:
                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        new_boxes[insertion_pos].append(prob)
                        new_boxes[insertion_pos].append(prd)
                        left_edges.insert(insertion_pos, bx[0])
                    else:
                        # Tsek neither on the baseline nor within the line
                        # span: dropped
                        print 'small contour reject at', l, s, 'local height span', local_span, 'box height', bx[3]
                else:
                    # Anything that is not a tsek is always inserted
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    new_boxes[insertion_pos].append(prob)
                    new_boxes[insertion_pos].append(prd)
                    left_edges.insert(insertion_pos, bx[0])

        # Emphasis markers are inserted by left edge using their cached
        # prediction; their boxes already carry [prob, char].
        for em in emph_markers:
            mkinx = segmentation.line_info.shapes.cached_pred_prob[em][0]
            marker = dig_to_char[mkinx]
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1][mkinx]
            bx = segmentation.line_info.shapes.get_boxes()[em]
            bx = list(bx)
            x,y,w,h = bx
            insertion_pos = bisect(left_edges, x)
            vectors.insert(insertion_pos, marker)
            bx.append(marker_prob)
            bx.append(marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])

        # NOTE(review): neither `i` nor skip_next_n is read below.
        if len(vectors) == 1: i = -1
        skip_next_n = 0

        ###HMM PHASE
        # Split the line into runs of consecutive feature vectors. Entries
        # that are unicode or list (already recognized) end the current run.
        allstrs = []
        curstr = []
        allinx = []
        curinx = []

        for j, v in enumerate(vectors):
            islist = isinstance(v, list)
            if isinstance(v, unicode) or islist:
                allstrs.append(curstr)
                allinx.append(curinx)
                curstr = []
                curinx = []
            else:
                curstr.append(v)
                curinx.append(j)
        if curstr:
            allstrs.append(curstr)
            allinx.append(curinx)

        for f, group in enumerate(allstrs):
            if not group: continue
            try:
                probs = predict_log_proba(group)
            except:
                # NOTE(review): the error is swallowed, so `probs` below is
                # whatever the previous run left behind (or undefined on
                # the first run).
                print v,

            LPROB = len(probs)
            if LPROB == 1:
                inx = probs[0].argmax()
                prb = probs[0][inx]
                prds = [inx]
            else:
                probs = probs.astype(np.float32)
                prb, prds = viterbi_cython(LPROB, n_states, start_p, trans_p, probs)
                prb = np.exp(prb)

            # Positions (in vectors / new_boxes) of the members of this run
            inx = allinx[f]
            for vv, c in enumerate(range(len(prds))):
                ind = inx[c]
                # Reported probability is the per-position maximum, not the
                # probability of the decoded state
                cprob = probs[c].max()

                # A low-probability replacement pass using an rbf svm
                # classifier used to live here; it is disabled because it
                # may undo decisions made by the hmm classifier.
                new_boxes[ind].append(np.exp(cprob))
                try:
                    new_boxes[ind].append(dig_to_char[prds[c]])
                except KeyError:
                    new_boxes[ind].append('PROB')

        for ind, b in enumerate(new_boxes):
            tmp_result.append(new_boxes[ind])
            # Every box should carry exactly 6 entries at this point
            if not len(new_boxes[ind]) == 6:
                print l, ind, new_boxes[ind], '<-----'
            # Gap to the next box of at least 1.5 tsek widths: add a space
            if ind + 1 < len(new_boxes) and new_boxes[ind+1][0] - (new_boxes[ind][0] + new_boxes[ind][2]) >= 1.5*tsek_mean:
                tmp_result.append([-1,-1,-1,-1, 1.0, u' '])

        results.append(tmp_result)
    return results
def recognize_chars(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data

    Small contours and emphasis markers are re-inserted into each line by
    horizontal position; every entry that is not already unicode is then
    classified with ``classify``.

    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    tsek_insert_method: only ``'baseline'`` triggers re-insertion of small
        contours; with any other value they are left out

    Returns:
    --------------
    results: list with one entry per non-empty line; each entry is a list
    of boxes with the predicted unicode appended as last element. A word
    gap is represented by the entry [-1,-1,-1,-1, u' '].

    Note that ``segmentation.vectors`` and ``segmentation.new_boxes`` are
    modified in place.
    '''
    results = []
    tsek_mean = segmentation.final_box_info.tsek_mean
    # Box widths collected per predicted char.
    # NOTE(review): filled below but not returned or read in this function.
    width_dists = {}

    for l, vectors in enumerate(segmentation.vectors):
        if not vectors:
            print 'no vectors...'
            continue

        tmp_result = []
        new_boxes = segmentation.new_boxes[l]
        small_chars = segmentation.line_info.small_cc_lines_chars[l]

        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]

        img_arr = segmentation.line_info.shapes.img_arr
        # Left x of every box, kept in step with new_boxes so that bisect
        # can find the insertion position of a small contour.
        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []

        # consider small char from end of line going backward. backward
        # useful for misplaced tsek often and maybe for TOC though should check
        for s in small_chars[::-1]:
            cnt = segmentation.line_info.shapes.contours[s]
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            char_arr = np.ones((h,w), dtype=np.uint8)
            offset = (-x, -y)
            drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
            feature_vect = normalize_and_extract_features(char_arr)
            prd = classify(feature_vect)

            insertion_pos = bisect(left_edges, x)

            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                # insertion is at or near end of line and needs more left
                # neighbors to compensate for there being less chars to define the baseline
                left_items = 12
            elif insertion_pos <= len(new_boxes):
                # same as above except at front of line
                # NOTE(review): this condition holds for every position not
                # caught by the branch above, so right_items becomes 12
                # everywhere except at the end of the line -- confirm that
                # a "front of line" test was not intended.
                right_items = 12

            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0

                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)
                ####

                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right

                # Vertical extent of the neighbouring boxes
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    if j[1] + j[3] > bottom:
                        bottom = j[1] + j[3]
                local_span = bottom - top

                if prd == u'་' and local_span > 0:
                    # Local baseline = row with the smallest pixel sum on
                    # each side of the insertion point
                    left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                    right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                    local_baseline_left = top + left_sum.argmin()
                    if mid != right:
                        local_baseline_right = top + right_sum.argmin()
                    else:
                        local_baseline_right = local_baseline_left

                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or
                        (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])): #or
                        ### Account for fact that the placement of a tsek could be
                        # before or after its indicated insertion pos
                        ### experimental.. only need with certain fonts e.g. "book 6"
                        ## in samples
                        if insertion_pos <= len(new_boxes):
                            # NOTE(review): for insertion_pos == 0 this
                            # reads new_boxes[-1], i.e. the last box.
                            prev_box = new_boxes[insertion_pos-1]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1

                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])
                    elif bx[1] >= top -.25*local_span and bx[1] + bx[3] <= bottom + local_span*.25:
                        # Tsek not on the baseline but within the line's
                        # vertical span (with 25% slack): still inserted
                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        left_edges.insert(insertion_pos, bx[0])
                else:
                    # Anything that is not a tsek is always inserted
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    left_edges.insert(insertion_pos, bx[0])

        # Replace the page-level tsek width by the mean width of the tseks
        # inserted on this line.
        # NOTE(review): tsek_widths is empty when no tsek was inserted;
        # confirm what np.mean returns in that case and how the gap test
        # below behaves. The new value also carries over to later lines.
        tsek_mean = np.mean(tsek_widths)

        for em in emph_markers:
            marker = dig_to_char[segmentation.line_info.shapes.cached_pred_prob[em][0]]
            # NOTE(review): this stores the whole cached probability entry,
            # whereas the probout/hmm variants index it with the argmax.
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1]
            bx = segmentation.line_info.shapes.get_boxes()[em]
            bx = list(bx)
            x,y,w,h = bx
            insertion_pos = bisect(left_edges, x)
            bx.append(marker_prob)
            bx.append(marker)
            vectors.insert(insertion_pos, marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])

        # NOTE(review): `i` is rebound by the loop below and not read after
        # it in this function.
        if len(vectors) == 1: i = -1

        for i, v in enumerate(vectors[:-1]):
            # Gap to the next box of at least two tsek widths: emit the box
            # followed by a space entry.
            if new_boxes[i+1][0] - (new_boxes[i][0] + new_boxes[i][2]) >= 2*tsek_mean:
                if not isinstance(v, unicode):
                    prd = classify(v, pca_trans=PCA_TRANS, multi=False)
                else:
                    prd = v

                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
                tmp_result.append([-1,-1,-1,-1, u' '])
            else:
                if not isinstance(v, unicode):
                    prd = classify(v, pca_trans=PCA_TRANS, multi=False)
                    ### Assume that a tsek shouldn't show up at this point
                    ### (a fallback to the second most probable class was
                    ### tried here and is disabled)
                else:
                    prd = v

                if not width_dists.get(prd):
                    width_dists[prd] = [new_boxes[i][2]]
                else:
                    width_dists[prd].append(new_boxes[i][2])

                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])

        # The last entry of the line is handled separately (no right gap)
        if not isinstance(vectors[-1], unicode):
            prd = classify(vectors[-1], pca_trans=PCA_TRANS, multi=False)
        else:
            prd = vectors[-1]
        new_boxes[-1].append(prd)
        tmp_result.append(new_boxes[-1])
        results.append(tmp_result)
    return results
def __init__(self, shapes, k): from sklearn.cluster import KMeans self.shapes = shapes self.k = k self.page_array = shapes.img_arr if shapes.conf['line_cluster_pos'] == 'top': tops = array(shapes.get_tops(), dtype=float64) elif shapes.conf['line_cluster_pos'] == 'center': tops = array( [t[1] + .5*shapes.char_mean for t in shapes.get_boxes() if t[3] > 2* shapes.tsek_mean], dtype=float64 ) else: raise ValueError, "The line_cluster_pos argument must be either 'top' or 'center'" tops.shape = (len(tops), 1) kmeans = KMeans(n_clusters=k) # print tops kmeans.fit(tops) ################## ######## mark cluster centroids on original image and show them # img_arr = shapes.img_arr.copy() # for centroid in kmeans.cluster_centers_: ## print centroid[0] # img_arr[centroid[0],:] = 0 # # import Image # Image.fromarray(img_arr*255).show() #######################3 lines = [[] for i in range(k)] ind = shapes.get_indices() ### Assign char pointers (ind) to the appropriate line ### # [lines[kmeans.labels_[i]].append(ind[i]) for i in range(len(ind))] [lines[kmeans.predict(shapes.get_boxes()[ind[i]][1])[0]].append(ind[i]) for i in range(len(ind))] lines = [l for l in lines if l] self.k = len(lines) boxes = shapes.get_boxes() ### Sort indices so they are in order from top to bottom using y from the first box in each line sort_inx = list(argsort([boxes[line[0]][1] for line in lines])) lines.sort(key=lambda line: boxes[line[0]][1]) ### Get breaklines for splitting up lines ### Uses the topmost box in each line cluster to determine breakline try: topmosts = [min([boxes[i][1] for i in line]) for line in lines] except ValueError: print 'failed to get topmosts...' 
raise vsums = self.page_array.sum(axis=1) breaklines = [] delta = 25 for c in topmosts: if c - delta < 0: lower = 0 else: lower = c-delta e = argmax(vsums[lower:c+delta]) c = c - delta + e if c < 0: c = 0 breaklines.append(c) breaklines.append(self.page_array.shape[0]) self.baselines = [] for i, br in enumerate(breaklines[:-1]): try: baseline_area = vsums[br:breaklines[i+1]] if baseline_area.any(): self.baselines.append(br + argmin(baseline_area)) else: print i print 'No baseline info' except ValueError: print 'ValueError. exiting...HERE' import traceback;traceback.print_exc() raise final_ind = dict((i, []) for i in range(len(lines))) self.new_contours = {} for j, br in enumerate(breaklines[1:-1]): topcount = 0 bottomcount = 0 for i in lines[j]: # if char extends into next line, break it # 253 is roughly global line height avg + 1 std # The following lines says that a box/char must be extending over # breakline by a non trivial amount eg. 30 px and must itself # be a tall-ish box (roughly the height of average line) in order # for it to be broken. 
# if (bounding[i][1] + bounding[i][3]) - br >= 30 and bounding[i][3] > 205: if (boxes[i][1] + boxes[i][3]) - br >= 30 and \ (boxes[i][1] + boxes[i][3]) - topmosts[j] > self.shapes.char_mean*2.85: chars = ones((boxes[i][3]+2, boxes[i][2]+2), dtype=uint8) contours = shapes.contours cv.drawContours(chars, [contours[i]], -1,0, \ thickness = -1, offset=(-boxes[i][0]+1,-boxes[i][1]+1)) cv.dilate(chars, None, chars) y_offset = boxes[i][1] new_br = br - y_offset prd_cut = [] ### Iterate through potential cut-points and ### and cut where top half has the highest probability ### that is not a tsek # print 'bottom bound cut point', int(.75*shapes.tsek_mean) for delta in range(-3, int(.75*shapes.tsek_mean), 1): # for delta in range(-3, 100, 1): cut_point = new_br + delta # chars[cut_point, :] = 0 # import Image # Image.fromarray(chars*255).show() tchr = chars[:cut_point,:] tchr = ftrim(tchr) if not tchr.any(): continue tchr = normalize_and_extract_features(tchr) probs = cls.predict_proba(tchr) max_prob_ind = argmax(probs) chr = label_chars[max_prob_ind] prd_cut.append((probs[0,max_prob_ind], chr, cut_point)) prd_cut = [q for q in prd_cut if q[1] != u'་'] try: cut_point = max(prd_cut)[-1] except: print 'No max prob for vertical char break, using default breakline. 
Usually this means the top half of the attempted segmentation looks like a tsek blob' cut_point = br-boxes[i][1] #######FOLLWNG NOT WORKING ATTEMPTS TO GET A BETTER BREAK LINE # br2 = br-bounding[i][1] # # csum = chars.sum(axis=1) # bzone = csum[br2-25:br2+40] # if bzone.any(): # br2 = np.argmax(bzone) + (br-25) ## print br, 'br' # chars = chars*255 # nbr = br # cv.line(chars, (0, br2), (chars.shape[1], br2), 0) # Image.fromarray(chars).save('/tmp/outt.tiff') # sys.exit() ############# tarr = chars[:cut_point,:] tarr, top_offset = ftrim(tarr, new_offset=True) tarr = fadd_padding(tarr, 3) barr = chars[cut_point:,:] barr = ftrim(barr, sides='brt') barr = fadd_padding(barr, 3) c1, h = cv.findContours(image=tarr, mode=cv.RETR_LIST, method=cv.CHAIN_APPROX_SIMPLE, offset=(boxes[i][0]+top_offset['left'],boxes[i][1])) c1 = c1[argmax([len(t) for t in c1])] # use the most complex contour bnc1 = cv.boundingRect(c1) c2, h = cv.findContours(barr, mode=cv.RETR_LIST, method=cv.CHAIN_APPROX_SIMPLE, offset=(boxes[i][0]-3,boxes[i][1]+cut_point-3)) c2 = c2[argmax([len(t) for t in c2])] bnc2 = cv.boundingRect(c2) topbox_name = 't%d_%d' % (j, topcount) final_ind[j].append(topbox_name) self.new_contours[topbox_name] = (bnc1, c1) topcount += 1 if bnc2[-1] > 8: #only add bottom contour if not trivially small bottombox_name = 'b%d_%d' % (j, bottomcount) final_ind[j+1].append(bottombox_name) self.new_contours[bottombox_name] = (bnc2, c2) bottomcount += 1 else: final_ind[j].append(i) # Don't forget to include the last line map(final_ind[len(lines)-1].append, lines[len(lines)-1]) self.lines_chars = final_ind cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.small_contour_indices] char_tops = zip(cctops, self.shapes.small_contour_indices) char_tops.sort(key=lambda x: x[0]) sorted_indices = [i[1] for i in char_tops] _line_insert_indxs = [] _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines]) self.small_cc_lines_chars = [] if not _line_insert_indxs: 
sys.exit() for i, l in enumerate(_line_insert_indxs[:-1]): self.small_cc_lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]]) self.small_cc_lines_chars.append(sorted_indices[_line_insert_indxs[-1]:]) self.small_cc_lines_chars = [self.small_cc_lines_chars[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]] cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.emph_symbols] char_tops = zip(cctops, self.shapes.emph_symbols) char_tops.sort(key=lambda x: x[0]) empred = [kmeans.predict(shapes.get_boxes()[i][1])[0] for i in self.shapes.emph_symbols] self.emph_lines = [[] for i in range(k)] for nn, e in enumerate(empred): self.emph_lines[sort_inx.index(e)].append(self.shapes.emph_symbols[nn]) if self.shapes.detect_o: cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.naros] char_tops = zip(cctops, self.shapes.naros) char_tops.sort(key=lambda x: x[0]) sorted_indices = [i[1] for i in char_tops] _line_insert_indxs = [] _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines]) if not _line_insert_indxs: sys.exit() self.line_naros = [] for i, l in enumerate(_line_insert_indxs[:-1]): # self.line_naros.append(sorted_indices[l:_line_insert_indxs[i+1]]) self.line_naros.append(sorted_indices[_line_insert_indxs[-1]:]) self.line_naros = [self.line_naros[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]] self.line_naro_spans = [] for ll, mm in enumerate(self.line_naros): thisline = [] for nn, naro in enumerate(mm): box = self.get_box(naro) thisline.append(box) thisline.sort(key=lambda x: x[0]) self.line_naros[ll].sort(key=lambda x: self.get_box(x)[0]) self.line_naro_spans.append(thisline) if self.shapes.low_ink: cctops = [lib[1] for lib in self.shapes.low_ink_boxes] char_tops = zip(cctops, self.shapes.low_ink_boxes) char_tops.sort(key=lambda x: x[0]) sorted_indices = [i[1] for i in char_tops] _line_insert_indxs = [] _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,)) for i in breaklines]) 
self.low_ink_boxes = [] if not _line_insert_indxs: sys.exit() for i, l in enumerate(_line_insert_indxs[:-1]): self.low_ink_boxes.append(sorted_indices[l:_line_insert_indxs[i+1]]) self.low_ink_boxes.append(sorted_indices[_line_insert_indxs[-1]:]) self.low_ink_boxes = [self.low_ink_boxes[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
def construct_vector_set_experimental(self):
    """Fill ``self.vectors`` and ``self.new_boxes`` for every line.

    Each combined box from ``CombineBoxesForPage`` is rendered and rescaled by
    the per-line factor in ``final_box_info.transitions``. A box whose scaled
    width reaches ``char_mean + self.breakwidth * char_std`` is cut into
    several candidate segmentations (varying character count and a search
    window ``z`` around multiples of the mean character width); every
    candidate is scored with ``cls.predict_log_proba`` + ``viterbi_cython``
    and the most probable label sequence is stored. All other boxes are
    stored as ``[x, y, w, h]`` together with their feature vector.

    When ``shapes.detect_o`` is set, boxes that horizontally overlap a
    pre-filtered na-ro contour (ratio > .8) get the vowel sign appended to
    their label.

    Returns None; results are left on ``self.vectors``, ``self.new_boxes``
    and ``self.final_box_info``.

    NOTE(review): the source text of this method was whitespace-collapsed;
    the nesting below is the most plausible reading and should be confirmed
    against version control.
    """
    NINF = -np.inf  # not referenced again in this method
    final_box_info = CombineBoxesForPage(self.line_info)
    self.final_box_info = final_box_info
    final_boxes = final_box_info.final_boxes
    final_indices = final_box_info.final_indices
    scales = final_box_info.transitions
    self.vectors = [[] for i in range(self.line_info.k)]
    self.new_boxes = [[] for i in range(self.line_info.k)]
    # cur_mean / cur_std are read here but not used further down.
    cur_mean = self.final_box_info.char_mean
    cur_std = self.final_box_info.char_std
    BREAKWIDTH = self.breakwidth
    rbfcls = self.line_info.rbfcls
    for l in range(len(final_indices)):  # for each line
        try:
            scale_l = scales[l]
        except:
            print 'ERROR AT ', l, len(scales)
            raise
        char_mean_int = floor(final_box_info.char_mean)
        char_std_int = ceil(final_box_info.char_std)  # unused below
        try:
            lb = range(len(final_indices[l]))
        except IndexError:
            print 'index error'
            continue
        segmented = 0  # unused; its increment below is commented out
        for i in lb:  # for each line box
            ## New draw, takes into account tree hierarchy of contours
            x, y, w, h = final_boxes[l][i]
            letter = ones((h, w), dtype=uint8)
            for k in final_indices[l][i]:
                # Non-string indices are drawn with their child contours;
                # string indices are drawn as a single filled contour.
                if not isinstance(k, str):
                    letter = self.line_info.shapes.draw_contour_and_children(
                        k, char_arr=letter, offset=(-x, -y))
                else:
                    cv.drawContours(letter, [self.line_info.get_contour(k)],
                                    -1, 0, thickness=-1, offset=(-x, -y))
            letter = cv.resize(letter, dsize=(0, 0), fx=scale_l, fy=scale_l)
            if letter.shape[1] >= (final_box_info.char_mean +
                                   BREAKWIDTH * final_box_info.char_std
                                   ):  # if a box is too large, break it
                # segmented += 1
                sw = w * scale_l
                sh = h * scale_l
                vsum = letter.sum(axis=0)  # per-column sum used to pick cut columns
                chars = sw // (final_box_info.char_mean -
                               1.5 * final_box_info.char_std
                               )  # important, floor division
                if 10.0 > chars > 1.0:  # Assume chars-to-be-broken don't span > 10
                    # if chars:
                    w = sw
                    h = sh
                    best_box_dim = []
                    best_prob = 0.0
                    best_seq = None
                    ## Iterate through a range of variable chars if
                    ## chars is greater than 2. This allows potential
                    ## breaks for chars-1, chars-2
                    # all_choices = []
                    for chars in range(int(chars), 1, -1):
                        # z widens the window around each expected cut position.
                        for z in range(0, 21, 2):
                            segs = []
                            prev_breakline = 0
                            for pos in range(int(chars - 1)):
                                if char_mean_int - z >= 0:
                                    upper_range = [
                                        int(
                                            np.round((pos + 1) *
                                                     (char_mean_int - z))),
                                        int(
                                            np.round((pos + 1) *
                                                     (char_mean_int + z)))
                                    ]
                                    vsum_range = vsum[
                                        upper_range[0]:upper_range[1]]
                                    if vsum_range.any():
                                        # Cut at the column with the largest
                                        # column sum inside the window.
                                        breakline = int(
                                            np.round((pos + 1) *
                                                     (char_mean_int - z) +
                                                     argmax(vsum_range)))
                                    else:
                                        breakline = None
                                    if breakline:
                                        sg = letter[:, prev_breakline:
                                                    breakline]
                                        prev_breakline = breakline
                                    else:
                                        # No usable cut column: fall back to
                                        # an evenly spaced slice.
                                        sg = letter[:, int(
                                            np.round(pos * (
                                                char_mean_int - z))
                                        ):int(
                                            np.
                                            round((pos + 1) * (
                                                char_mean_int - z)))]
                                        prev_breakline = int(
                                            np.round((pos + 1) *
                                                     (char_mean_int - z)))
                                    segs.append(sg)
                            # Remainder of the image is the last segment.
                            segs.append(
                                letter[:, int(
                                    np.round((chars - 1) *
                                             (char_mean_int - z))):])
                            segs = [fadd_padding(sg, 2) for sg in segs]
                            seg_ctrs = [
                                cv.findContours(
                                    sg.copy(),
                                    mode=cv.RETR_CCOMP,
                                    method=cv.CHAIN_APPROX_SIMPLE)
                                for sg in segs
                            ]
                            try:
                                # sgc[0] is treated as the contour list here.
                                # NOTE(review): _sample_widths_method unpacks
                                # three values from cv.findContours instead --
                                # confirm which OpenCV return layout applies.
                                seg_bxs = [[
                                    cv.boundingRect(k) for k in sgc[0]
                                ] for sgc in seg_ctrs]
                            except:
                                # Relies on the comprehension variable leaking
                                # into this scope (Python 2 behaviour).
                                print sgc
                                raise
                            bxs = []
                            nsegs = []
                            prev_w = 0
                            for zi, ltb in enumerate(seg_bxs):
                                seg = segs[zi]
                                for b in ltb:
                                    # Blank out components that are small
                                    # relative to the tsek size statistics.
                                    if b[2] < (
                                            final_box_info.tsek_mean +
                                            4 * final_box_info.tsek_std
                                    ) or b[3] < final_box_info.tsek_mean + 4 * final_box_info.tsek_std:
                                        seg[b[1] - 1:b[1] + b[3] + 1,
                                            b[0] - 1:b[0] + b[2] + 1] = True
                                seg, ofst = ftrim(seg, new_offset=True)
                                # Box expressed in unscaled page coordinates.
                                bx = [
                                    x + prev_w + (ofst['left'] / scale_l),
                                    y + (ofst['top'] / scale_l),
                                    seg.shape[1] / scale_l,
                                    seg.shape[0] / scale_l
                                ]
                                prev_w += seg.shape[1] / scale_l
                                bxs.append(bx)
                                nsegs.append(seg)
                            xt = [
                                normalize_and_extract_features(sg)
                                for sg in nsegs if 0 not in sg.shape
                            ]
                            prd_probs = cls.predict_log_proba(xt)
                            prd_probs = prd_probs.astype(np.float32)
                            prob, prds = viterbi_cython(
                                prd_probs.shape[0], n_states, start_p,
                                trans_p, prd_probs)
                            prob = np.exp(prob)
                            if prob > best_prob:
                                best_prob = prob
                                best_seq = prds
                                best_box_dim = bxs
                                best_xt = xt
                    # Fallback: nothing beat the initial 0.0, so keep the
                    # last candidate that was evaluated.
                    if not best_box_dim:
                        best_prob = prob
                        best_seq = prds
                        best_box_dim = bxs
                        best_xt = xt
                    for u in range(len(best_seq)):
                        # For segmented boxes the "vector" stored is the
                        # predicted label itself, not a feature vector.
                        self.vectors[l].append(label_chars[best_seq[u]])
                        best_box = best_box_dim[u]
                        best_box = [int(np.round(ii)) for ii in best_box]
                        best_box.append(best_prob)
                        best_box.append(label_chars[best_seq[u]])
                        self.new_boxes[l].append(best_box)
                        try:
                            # Write a column of 1s at the box's right edge
                            # in the page image.
                            self.line_info.shapes.img_arr[
                                best_box[1]:best_box[1] + best_box[3],
                                best_box[0] + best_box[2]] = 1
                        except:
                            # Best effort: any failure of the write is ignored.
                            pass
                else:
                    self.new_boxes[l].append([x, y, w, h])
                    vect = normalize_and_extract_features(letter)
                    self.vectors[l].append(vect)
            else:
                self.new_boxes[l].append([x, y, w, h])
                vect = normalize_and_extract_features(letter)
                self.vectors[l].append(vect)
    if not any(self.vectors):
        print 'no vectors'
        return
    else:
        if self.line_info.shapes.detect_o:
            for i, l in enumerate(self.new_boxes):
                for n in self.line_info.line_naros[i]:
                    box = self.line_info.get_box(n)
                    x, y, w, h = box
                    r0 = x + w
                    for k, b in enumerate(l):
                        # Horizontal overlap of the two spans, relative to
                        # the narrower one.
                        if ((b[2] + w) - abs(b[0] - x) - abs(
                            (b[0] + b[2]) - r0)) / (
                                2 * float(min(w, b[2]))) > .8:
                            try:
                                nbox = list(combine_many_boxes([box, b]))
                            except:
                                # NOTE(review): nbox may be unbound here if
                                # the very first combine fails.
                                print nbox[3]
                                raise
                            if isinstance(self.vectors[i][k], unicode):
                                # Already a label: append the vowel sign and
                                # keep the existing box.
                                self.vectors[i][k] += u'ོ'
                                nbox = b
                                nbox[-1] = self.vectors[i][k]
                            else:
                                # Still a feature vector: classify it now.
                                probs = cls.predict_log_proba(
                                    self.vectors[i][k])
                                mx = np.argmax(probs)
                                prob = probs[0][mx]
                                mx = rbfcls.predict(self.vectors[i][k])[0]
                                ch = label_chars[mx] + u'ོ'
                                self.vectors[i][k] = ch
                                nbox.append(prob)
                                nbox.append(ch)
                            self.new_boxes[i][k] = nbox
def construct_vector_set_stochastic(self): # separate attached tsek # note this may note go here exactly, but somewhere in this function if self.line_info.shapes.conf.get('detach_tsek'): self._detach_tsek() final_box_info = CombineBoxesForPage(self.line_info) self.final_box_info = final_box_info final_boxes = final_box_info.final_boxes final_indices = final_box_info.final_indices scales = final_box_info.transitions self.vectors = [[] for i in range(self.line_info.k)] self.new_boxes = [[] for i in range(self.line_info.k)] # BREAKWIDTH = self.breakwidth for l in range(len(final_indices)): # for each line try: scale_l = scales[l] oo_scale_l = 1.0 / scale_l except: print 'ERROR AT ', l, len(scales) raise try: lb = range(len(final_indices[l])) except IndexError: continue segmented = 0 for i in lb: # for each line box ## New draw, takes into account tree hierarchy of contours x, y, w, h = final_boxes[l][i] letter = ones((h, w), dtype=uint8) lindices = final_indices[l][i] len_lindices = len(lindices) for k in lindices: if not isinstance(k, str): letter = self.line_info.shapes.draw_contour_and_children( k, char_arr=letter, offset=(-x, -y)) else: cv.drawContours(letter, [self.line_info.get_contour(k)], -1, 0, thickness=-1, offset=(-x, -y)) if w * scale_l >= 1 and h * scale_l >= 1: letter = cv.resize(letter, dsize=(0, 0), fx=scale_l, fy=scale_l) if letter.shape[1] >= (final_box_info.char_mean + BREAKWIDTH * final_box_info.char_std ): # if a box is too large, break it sw = w * scale_l sh = h * scale_l chars = sw // (final_box_info.char_mean - 1.5 * final_box_info.char_std ) # important, floor division chars = min(chars, 4) if chars > 1.0: w = sw h = sh all_choices = [] for chars in range(int(chars), 0, -1): # if l == 1: if self.line_info.shapes.detect_o: line_num = l else: line_num = None all_choices.append( self._sample_widths_method(chars, letter, final_boxes[l][i], oo_scale_l, line_num=line_num)) ## Append complete recognization results to vector list mx = max(all_choices) 
for v in mx[-1]: self.new_boxes[l].append(v) self.vectors[l].append(v) self.line_info.shapes.img_arr[v[1]:v[1] + v[3], v[0] + v[2]] = 1 else: self.new_boxes[l].append([x, y, w, h]) if len_lindices == 1: try: vect = self.cached_features[lindices[0]] except: #FIXME: should really check key used vect = normalize_and_extract_features(letter) else: vect = normalize_and_extract_features(letter) self.vectors[l].append(vect) else: self.new_boxes[l].append([x, y, w, h]) if len_lindices == 1: try: vect = self.cached_features[lindices[0]] except KeyError: vect = normalize_and_extract_features(letter) else: vect = normalize_and_extract_features(letter) self.vectors[l].append(vect) if not any(self.vectors): print 'no vectors' return else: if self.line_info.shapes.detect_o: for i, line in enumerate(self.new_boxes): used_boxes = set() for n in self.line_info.line_naros[i]: if n in used_boxes: continue box = self.line_info.get_box(n) x, y, w, h = box for k, box1 in enumerate(line): assert isinstance( box1, (list, tuple)), 'error - {}-{}-{}'.format( str(box1), i, k) assert isinstance(box, (list, tuple)), box try: overlap = check_for_overlap(box1, box) except: print i, k, box1, 'BOX problem' if overlap: used_boxes.add(n) try: nbox = list(combine_many_boxes([box, box1])) except: print nbox, 'slkfjlkfj' raise if isinstance(self.vectors[i][k], unicode): self.vectors[i][k] += u'ོ' nbox = box1 nbox[-1] = self.vectors[i][k] elif isinstance(self.vectors[i][k], list): if not self.vectors[i][k][-1][-1] == u'ོ': pchar = self.vectors[i][k][-1] + u'ོ' self.vectors[i][k][-1] = pchar nbox = self.vectors[i][k] else: probs = cls.predict_log_proba( self.vectors[i][k]) mx = np.argmax(probs) prob = probs[0][mx] ch = label_chars[mx] + u'ོ' self.vectors[i][k] = ch nbox.append(prob) nbox.append(ch) self.new_boxes[i][k] = nbox
def _sample_widths_method(self, chars, letter, letter_box, oo_scale_l,
                          line_num=None):
    """Segment ``letter`` into ``chars`` pieces by sampling random widths.

    For ``chars > 1`` the image is dilated once and 15 random segmentations
    are tried. Widths are drawn with ``gauss(cur_mean, cur_std)`` (the last
    segment always runs to the right edge). Each segment is padded, small
    top-level components (under 23 px in either dimension) are blanked,
    the segment is trimmed and converted to features; the sequence is scored
    with ``predict_log_proba`` + ``viterbi_cython`` plus the log-probability
    of the sampled widths. The best-scoring segmentation is kept.

    For ``chars <= 1`` the whole image is classified as a single character.

    Args:
        chars: number of characters to split the image into.
        letter: scaled image of the box.
        letter_box: ``(x, y, w, h)`` of the box; ``x``/``y`` are used as the
            origin of the returned boxes.
        oo_scale_l: factor applied to segment coordinates/sizes when
            converting them back to box coordinates.
        line_num: when not None, each segment box is checked against the
            line's na-ro contours via ``line_info.check_naro_overlap`` and an
            overlapping na-ro is merged into the segment.

    Returns:
        ``(final_prob, res)`` where ``final_prob`` is the best log score and
        ``res`` is a list of ``[x, y, w, h, exp(final_prob), label]`` lists.
        If no sampled segmentation produced any usable segment, returns
        ``(-inf, [])``.
    """
    x, y = letter_box[0], letter_box[1]

    ################default
    cur_mean = self.final_box_info.char_mean * .97
    cur_std = .295 * self.final_box_info.char_std
    #################

    best_prob = -np.inf
    # Initialised up front so that a run in which every sample is rejected
    # yields an empty result instead of an UnboundLocalError below.
    best_prd = []
    best_boxes = []

    if chars > 1:
        letter = cv.dilate(letter.copy(), None, iterations=1)
        padding_amount = 3
        for n in range(15):
            widths = [gauss(cur_mean, cur_std) for i in range(chars)]
            prev = 0
            vecs = []
            wdthprobs = 0
            boxes = []
            for i, val in enumerate(widths):
                if i == chars - 1:
                    end = letter.shape[1]
                else:
                    end = prev + val
                wdthprobs += gausslogprob(cur_mean, cur_std, end - prev)

                s = fadd_padding(letter[:, int(prev):int(end)],
                                 padding_amount)
                _, ctrs, hier = cv.findContours(
                    s.copy(), mode=cv.RETR_TREE, method=cv.CHAIN_APPROX_NONE)
                bounding = map(boundingRect, ctrs)
                for k, b in enumerate(bounding):
                    # Blank small components whose parent is contour 0.
                    if (b[2] < 23 or b[3] < 23) and hier[0][k][3] == 0:
                        s[b[1] - 1:b[1] + b[3] + 1,
                          b[0] - 1:b[0] + b[2] + 1] = 1
                s = s[padding_amount:-padding_amount,
                      padding_amount:-padding_amount]
                s, ofst = ftrim(s, new_offset=True)

                if 0 in s.shape:
                    # Empty segment: abandon the rest of this sample.
                    break

                nnbox = [
                    x + (prev + ofst['left']) * oo_scale_l,
                    y + (ofst['top'] * oo_scale_l),
                    s.shape[1] * oo_scale_l,
                    s.shape[0] * oo_scale_l
                ]
                if line_num is not None:
                    naro = self.line_info.check_naro_overlap(line_num, nnbox)
                    # NOTE(review): ``!= False`` also rejects a contour
                    # index of 0 -- confirm that cannot occur.
                    if naro != False:
                        naro_box = self.line_info.get_box(naro)
                        nnbox = combine_many_boxes([nnbox, naro_box])
                        # Rescale the segment, pad it on top and on the
                        # right up to the combined box size, then draw the
                        # na-ro contour into it.
                        ss = cv.resize(s, dsize=(0, 0), fx=oo_scale_l,
                                       fy=oo_scale_l)
                        ss = np.vstack((ones(
                            (nnbox[3] - ss.shape[0], ss.shape[1]),
                            dtype=ss.dtype), ss))
                        ss = hstack(
                            (ss, ones(
                                (ss.shape[0], nnbox[2] - ss.shape[1]),
                                dtype=ss.dtype)))
                        cv.drawContours(
                            ss, [self.line_info.get_contour(naro)],
                            -1, 0, thickness=-1,
                            offset=(-naro_box[0], -naro_box[1]))
                        s = ss
                vecs.append(normalize_and_extract_features(s))
                boxes.append(nnbox)
                prev += val

            if not vecs:
                continue

            xn = len(vecs)
            # One row per segment; the column count is the feature length
            # (346 in the original code).
            vecs = np.array(vecs).reshape(xn, -1)
            probs = predict_log_proba(vecs)
            probs = probs.astype(np.float32)

            # Slightly lower the sampling mean for later iterations.
            if n % 10 == 0 and n != 0:
                cur_mean = self.final_box_info.char_mean * (
                    .97 - (3 * n / 1000.0))

            prob, prds = viterbi_cython(xn, n_states, start_p, trans_p,
                                        probs)
            prob = prob + wdthprobs
            if prob > best_prob:
                best_prob = prob
                best_prd = prds
                best_boxes = boxes
    else:
        best_boxes = [letter_box]
        probs = predict_log_proba(normalize_and_extract_features(letter))
        amx = probs[0].argmax()
        try:
            startprob = start_p[amx]
        except IndexError:
            startprob = 1e-10
        best_prob = probs[0][amx] + gausslogprob(
            cur_mean, cur_std, letter_box[2] / oo_scale_l) + startprob
        best_prd = [amx]

    final_prob = best_prob
    res = []
    for i, val in enumerate(best_prd):
        entry = [int(np.round(k)) for k in best_boxes[i]]
        entry.extend([float(np.exp(final_prob)), label_chars[val]])
        res.append(entry)
    return (final_prob, res)
def construct_vector_set_simple(self): self.too_big = [[] for i in range(self.line_info.k)] self.too_big_box = [[] for i in range(self.line_info.k)] self.too_big_loc = [] char_mean = self.line_info.shapes.char_mean for i in range(self.line_info.k): line = self.line_info.lines_chars[i] for j, c in enumerate(line): x, y, w, h = self.line_info.get_box(c) if w > 1.75 * char_mean or h > 2.5 * char_mean: letter = ones((h, w), dtype=uint8) if not isinstance(c, str): letter = self.line_info.shapes.draw_contour_and_children( c, char_arr=letter, offset=(-x, -y)) else: cv.drawContours(letter, [self.line_info.get_contour(c)], -1, 0, thickness=-1, offset=(-x, -y)) self.too_big[i].append(letter) self.too_big_loc.append((i, j)) self.too_big_box[i].append([x, y, w, h]) for loc in self.too_big_loc: self.line_info.lines_chars[loc[0]][loc[1]] = 'xx' for k in self.line_info.lines_chars: self.line_info.lines_chars[k] = [ xx for xx in self.line_info.lines_chars[k] if xx != 'xx' ] final_box_info = CombineBoxesForPage(self.line_info) scales = final_box_info.transitions self.final_box_info = final_box_info final_boxes = final_box_info.final_boxes char_mean = self.final_box_info.char_mean final_indices = final_box_info.final_indices self.vectors = [[] for i in range(self.line_info.k)] self.new_boxes = [[] for i in range(self.line_info.k)] # for l in range(self.line_info.k): # for each line try: lb = range(len(final_indices[l])) except IndexError: continue try: scale_l = scales[l] oo_scale_l = 1.0 / scale_l except: print 'ERROR AT ', l, len(scales) raise for ii, i in enumerate(lb): # for each line box ## New draw, takes into account tree hierarchy of contours x, y, w, h = final_boxes[l][i] letter = ones((h, w), dtype=uint8) for k in final_indices[l][i]: if not isinstance(k, str): letter = self.line_info.shapes.draw_contour_and_children( k, char_arr=letter, offset=(-x, -y)) else: cv.drawContours(letter, [self.line_info.get_contour(k)], -1, 0, thickness=-1, offset=(-x, -y)) letter = 
cv.resize(letter, dsize=(0, 0), fx=scale_l, fy=scale_l) self.new_boxes[l].append([x, y, w, h]) vect = normalize_and_extract_features(letter) self.vectors[l].append(vect) if not any(self.vectors): print 'no vectors' return
def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \
             page_type=None, flpath=None, detect_o=True,\
             clear_hr = False):
    """Extract contours from ``img_arr`` and sort them into categories.

    After construction the instance holds (among others):
      * ``self.indices`` -- contours kept as regular characters,
      * ``self.small_contour_indices`` -- narrow contours pre-classified as
        one of the punctuation marks in ``FILTERED_PUNC``,
      * ``self.naros`` -- contours pre-classified as the na-ro vowel sign
        (only when ``detect_o`` is true),
      * ``self.emph_symbols`` -- contours pre-classified as one of four
        emphasis/bracket symbols,
      * ``self.char_mean``, ``self.char_std``, ``self.tsek_mean``,
        ``self.tsek_std`` -- width statistics from ``self.char_gaussians``,
      * ``self.cached_features`` / ``self.cached_pred_prob`` -- per-contour
        features and quick-classifier output, keyed by contour index.

    Args:
        img_arr: page image array; may be modified in place when
            ``clear_hr`` is set.
        fast_cls: classifier providing ``predict`` and ``predict_proba``.
        small_coef: stored on the instance; lower coef means more
            filtering (USE 3 for nying gyud).
        low_ink: when true, punctuation pre-filtering is skipped and
            ``self._low_ink_setting()`` is called at the end.
        page_type: ``'pecha'`` selects the pecha layout path.
        flpath: stored on the instance.
        detect_o: enable pre-filtering of na-ro contours.
        clear_hr: for non-pecha handling, blank everything above a wide
            contour found in the top quarter of the page; for pecha pages
            it triggers ``self.force_clear_hr()`` instead.

    NOTE(review): the source text of this method was whitespace-collapsed;
    nesting below is the most plausible reading.
    """
    self.img_arr = img_arr
    self.page_type = page_type
    self.flpath = flpath
    self.low_ink = low_ink
    self.detect_o = detect_o
    self.cached_features = OrderedDict()
    self.cached_pred_prob = OrderedDict()
    self._contour_mode = cv.RETR_TREE
    ### repeatedly called functions
    ones = np.ones
    uint8 = np.uint8
    predict = fast_cls.predict  # bound but not used in this method
    predict_proba = fast_cls.predict_proba
    self.contours, self.hierarchy = self._contours()
    self.boxes = []
    self.indices = []
    self.small_coef = small_coef
    FILTERED_PUNC = (u'།', u'་', u']', u'[')
    self._set_shape_measurements()
    if page_type == 'pecha':
        if clear_hr:
            print 'Warning: clear_hr called on pecha format. For clearing text'
            self.force_clear_hr()
        self.set_pecha_layout()
        # presumably set_pecha_layout() fills self.indices -- TODO confirm
        if self.indices:
            content_parent = int(
                statsmode([self.hierarchy[0][i][3]
                           for i in self.indices])[0])
        else:
            # content_parent stays unbound here; the loop below does not
            # run because self.indices is empty.
            print 'no content found'
    else:
        # The most common parent index among all contours is taken as the
        # parent of the page content.
        content_parent = int(
            statsmode([hier[3] for hier in self.hierarchy[0]])[0])
        self.indices = self.get_indices()
    outer_contours = []
    outer_widths = []
    ## Iterate through all contours
    for i in self.indices:
        cbox = self.get_boxes()[i]
        x, y, w, h = cbox
        ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. Recently
        # added the len(indices) < 40 as a way to prevent exaggerated
        # filtering of small lines where gaussian width measures
        # are meaningless due to small sample size (too few contours)
        if self.hierarchy[0][i][3] == content_parent and (
                cbox[2] < .1 * self.img_arr.shape[1]
                or len(self.indices) < 40):
            # if self.hierarchy[0][i][3] == content_parent and cbox[2] < 3*self.char_mean:
            ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS
            outer_contours.append(i)
            outer_widths.append(cbox[2])
        else:
            if cbox[2] > .66 * self.img_arr.shape[1]:
                # Debug output: width of the box as a fraction of page width.
                print cbox[2] / float(self.img_arr.shape[1])
            # A contour spanning 66%-99.5% of the page width that starts in
            # the top quarter: blank the image from the top down to the
            # bottom of that contour.
            if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \
                    .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]:
                self.img_arr[0:cbox[1] + cbox[3], :] = 1
            # print 'rejected box. too wide?', cbox[2] >= .1*self.img_arr.shape[1]
    width_measures = self.char_gaussians(outer_widths)
    for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                    width_measures):
        setattr(self, i, j)
    self.small_contour_indices = []
    self.indices = []  # Need to reset! It is refilled by the loop below.
    self.emph_symbols = []
    self.naros = []
    for i in outer_contours:
        cbox = self.get_boxes()[i]
        # if small and has no children, put in small list (this could backfire with false interiors e.g. from salt and pepper noise)
        ## NOTE: previously small was defined as less than tsek_mean + 3xtsek std
        ## however, this wasn't always working. changing to less than charmean
        ## minus 2xchar std however should watch to see if is ok for many different inputs...
        x, y, w, h = cbox
        tmparr = ones((h, w), dtype=uint8)
        tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))
        features = normalize_and_extract_features(tmparr)
        self.cached_features[i] = features
        prprob = predict_proba(features)
        mxinx = prprob.argmax()
        quick_prd = label_chars[mxinx]
        self.cached_pred_prob[i] = (mxinx, prprob[0])
        is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽'])
        if is_emph_symbol:
            self.emph_symbols.append(i)
            print 'EMPHSYMBOLFOUND', quick_prd
        elif quick_prd == u'ོ' and self.detect_o:
            self.naros.append(i)
        elif cbox[2] < 7:
            # Contours narrower than 7 px are dropped entirely.
            continue
        elif (cbox[2] <= self.tsek_mean * 3 and
              quick_prd in FILTERED_PUNC) and not self.low_ink:  # default!!!
            self.small_contour_indices.append(i)
        else:
            self.indices.append(i)
    if self.detect_o:
        print 'pre-filtered na-ro vowel', len(self.naros), 'found'
    if self.low_ink:
        self._low_ink_setting()