from root_based_finder import is_non_std, word_parts
from segment import Segmenter, combine_many_boxes
import shelve
import signal
import simplejson as json
from sklearn.externals import joblib
import sys
from termset import syllables
from tparser import parse_syllables
from utils import local_file
from viterbi_cython import viterbi_cython
# from viterbi_search import viterbi_search, word_bigram
import warnings

# Module-level classifier, loaded once when this module is imported.
# NOTE(review): load_cls is not imported in this span; it is presumably
# defined or imported earlier in the file -- confirm.
cls = load_cls('logistic-cls')

## Ignore warnings. This is mostly in response to incessant sklearn
## warnings about passing in 1d arrays
# NOTE: no category or module is given, so this filter suppresses every
# warning, not only the sklearn ones.
warnings.filterwarnings("ignore")
# Parenthesized single-argument form: prints the same text under the
# Python 2 print statement and is also valid Python 3 syntax.
print('ignoring all warnings')

### rbfcls = load_cls('rbf-cls')

# Module-level aliases for the classifier's probability methods.
predict_log_proba = cls.predict_log_proba
predict_proba = cls.predict_proba

# Trained characters are labeled by number. Open the shelve that contains
# the mappings between the Unicode character and its number label.
# NOTE(review): the shelf is never closed in this span -- confirm whether
# `allchars` is still needed later in the file before adding a close().
allchars = shelve.open(local_file('allchars_dict2'))
char_to_dig = allchars['allchars']
# encoding: utf-8 '''Line breaking''' from numpy import array, float64, argmax, argmin, uint8, ones, floor, mean, std, where, argsort import cv2 as cv from utils import check_for_overlap from fast_utils import ftrim, fadd_padding import sys from bisect import bisect, bisect_right from feature_extraction import normalize_and_extract_features from classify import load_cls, label_chars cls = load_cls('logistic-cls') class LineCut(object): '''Line Cutting object - breaks lines in a page where lines are separated by empty whitespace Parameters: -------------------- shapes: page_element object, (see page_elements.py) thresh_scale: float, default=.9995 A threshold value for determining the breakline in the event that there is black pixel noise between lines. Should be set high to avoid setting line breaks through characters themselves. Attributes: ----------- lines_chars: list of lists, length=number of lines on page. Each sub-list contains the indices for the bounding boxes/contours assigned to its corresponding line.