def __check_yacc(self, check_lex_data=None):
    """Run the syntactic (yacc) grammar check.

    Returns the dict produced by CheckSintax.check_correct_grammar():
    'latex', 'latex_list', 'latex_string', 'yacc_errors_history',
    'lex_errors_history', 'yacc_pure_errors', 'lex_pure_errors'.
    """
    helpers.debug('[check_grammar.py] __check_yacc()')
    helpers.debug('[check_grammar.py] __check_yacc() | \
before CheckSintax() ')
    checker = check_grammar_yacc.CheckSintax()
    checker.attempts = self.__attempts_grammar
    if check_lex_data:
        checker.set_lex_data(check_lex_data)
    return checker.check_correct_grammar()
def resize_full_image(self, image):
    """Shrink very wide images (width > 4000 px) down to 20% of their
    original dimensions; smaller images pass through untouched."""
    helpers.debug('[preprocessing.py] resize_full_image()')
    height, full_width = image.shape[:2]
    if full_width > 4000:
        new_width = int(full_width * 20 / 100)
        scale = new_width / float(full_width)
        image = cv2.resize(image, (new_width, int(height * scale)))
    return image
def binarization(self, image):
    """Binarize with an adaptive mean threshold.

    The image is inverted before and after thresholding so the adaptive
    threshold operates on white-on-black content.
    """
    helpers.debug('[preprocessing.py] binarization()')
    inverted = self.invert(image.copy())
    # Gaussian variant kept for reference:
    # cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    #                       cv2.THRESH_BINARY, 11, 2)
    thresholded = cv2.adaptiveThreshold(inverted, 255,
                                        cv2.ADAPTIVE_THRESH_MEAN_C,
                                        cv2.THRESH_BINARY, 9, 2)
    return self.invert(thresholded)
def to_gray_denoise(self, image):
    """Convert to grayscale (when 3-channel) and apply non-local-means
    denoising; returns the result as a numpy array."""
    helpers.debug('[preprocessing.py] to_gray_denoise()')
    working = image.copy()
    if working.ndim == 3:
        working = cv2.cvtColor(working, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(working, None, 5, 9)
    return np.array(denoised)
def get_new_index(pred):
    """Zero out the current top prediction and return the runner-up.

    Returns (new_index, new_pred) where new_pred is a copy of ``pred``
    with the previous argmax position zeroed.
    Assumes ``pred`` has shape (1, n) — TODO confirm with callers.
    """
    helpers.debug("[base_grammar.py] correct_grammar_lex() | \
Reset prediction of current symbol")
    reduced = pred.copy()
    reduced[0][np.argmax(pred)] = 0
    helpers.debug("[base_grammar.py] correct_grammar_lex() | \
Gets new index and prediction from \
next index with higher prediction")
    return np.argmax(reduced), reduced
def __tree_to_list(self, tree, node=None):
    """Flatten the parse tree into a linear latex token list.

    Walks the tree depth-first starting at ``node`` (or ``tree.root_node``
    when ``node`` is falsy). Symbol dicts get their numeric label mapped
    through the module-level ``labels`` table (braces escaped); each
    RegionNode wraps its children in '{' ... '}'. The synthetic
    'Expression' root marker and the outer brace pair are stripped before
    returning.
    """
    helpers.debug('[parser.py] __tree_to_list()')
    latex = []

    def recur(root_node):
        # Fall back to the tree root when no explicit node is given.
        current = tree.root_node if not root_node else root_node
        if current is None:
            return
        if isinstance(current.data, str):
            # Plain string nodes (structural markers) go in verbatim.
            latex.append(current.data)
        else:
            try:
                # Map the numeric label to its display symbol, escaping
                # literal braces so they don't clash with grouping.
                real_label = labels[current.data['label']]
                if real_label == '{':
                    real_label = '\\{'
                if real_label == '}':
                    real_label = '\\}'
                current.data['label'] = real_label
                latex.append(current.data)
            except BaseException as e:
                # NOTE(review): broad catch — a missing/unknown label is
                # only printed and the symbol is silently dropped.
                print('Exception: ', e)
        # Region nodes group their children between braces.
        if current.node_type == 'RegionNode':
            latex.append('{')
        for node in current.children:
            recur(node)
        if current.node_type == 'RegionNode':
            latex.append('}')

    recur(node)
    # Strip the synthetic 'Expression' marker and the outermost braces
    # it introduced (one '}' at the end, one '{' at the front).
    if latex[0] == 'Expression':
        latex.remove('Expression')
    if latex[-1] == "}":
        latex.pop()
    if latex[0] == '{':
        # Drop the first element: reverse, pop the (now last) item,
        # reverse back.
        latex.reverse()
        latex.pop()
        latex.reverse()
    return latex
def __list_to_latex_obj(self, tlist):
    """Turn each token of ``tlist`` into a latex dict and run the
    grammar token substitutions on the result.

    Each entry has the shape::

        {'label': ..., 'prediction': [...], 'type': ...}

    Plain (non-dict) tokens become 'context'-typed entries.
    """
    helpers.debug('[parser.py] __list_to_latex_obj()')
    latex = []
    for symbol in tlist:
        if isinstance(symbol, dict):
            entry = {
                'label': symbol['label'],
                'prediction': symbol.get('prediction', []),
                'type': symbol['type'] or ''
            }
        else:
            entry = {
                'label': symbol,
                'prediction': [],
                'type': 'context'
            }
        latex.append(entry)
    # Raw labels -> substitution-table keys.
    grammar = {
        '-': 'frac',
        'below': 'below',
        'sqrt': 'sqrt',
        'super': 'super',
        '*': 'mult',
        'subsc': 'subsc',
        'neq': 'neq'
    }
    subst = helpers.subst
    print('\n\n')
    latex = self.__token_substitution(latex, grammar, subst)
    print('\n\n')
    return latex
def check(self, latex_data):
    """Run the lexical then syntactic grammar checks over ``latex_data``.

    Expects the keys 'latex', 'latex_list', 'latex_string' and 'lstring'.
    Returns the dict from __check_yacc() with 'latex_string_original'
    added. Grammar exceptions are re-raised with that key added to their
    ``data`` payload; all other exceptions propagate unchanged.
    """
    helpers.debug("[check_grammar.py] check()")
    helpers.debug("[check_grammar.py] latex_data: {0}".format(latex_data))
    latex = latex_data['latex']
    latex_list = latex_data['latex_list']
    latex_string = latex_data['latex_string']
    # NOTE(review): 'lstring' was read and conditionally updated but never
    # used or returned; the read is kept so a missing key still raises
    # KeyError as before.
    lstring = latex_data['lstring']
    helpers.debug("[check_grammar.py] check() | Latex List:")
    helpers.debug(latex_list)
    helpers.debug("[check_grammar.py] check() | \
Latex String: %s " % latex_string)
    try:
        check_lex_data = self.__check_lex(latex_string, latex, latex_list)
        check_yacc_data = self.__check_yacc(check_lex_data)
        if check_lex_data['latex_string'] != -1 and \
                check_lex_data['latex_string'] is not None:
            lstring = check_lex_data['latex_string']
        check_yacc_data.update({'latex_string_original': latex_string})
        return check_yacc_data
    except (GrammarError, SintaticError, LexicalError) as e:
        e.data.update({'latex_string_original': latex_string})
        # Bare raise preserves the original traceback.
        raise
    # FIX: the previous `except BaseException as e: raise e` was a no-op
    # re-raise (and reset the traceback frame) — removed; any other
    # exception now propagates naturally.
def __start(self, listin, R):
    """Find the leftmost unchecked symbol whose centroid lies inside
    region ``R`` ([[left, top], [right, bottom]]).

    Returns -1 when no symbol qualifies, otherwise delegates to
    __overlap() to resolve dominating horizontal lines.
    """
    helpers.debug('\n[parser.py] __start()')
    print('[parser.py] __start() | region [R]: ', R)
    left, top = R[0][0], R[0][1]
    right, bottom = R[1][0], R[1][1]
    helpers.debug('[parser.py] __start() | region [R]: \
left: %d, right: %d,\
top: %d, bottom: %d' % (left, right, top, bottom))
    leftmost = -1
    for idx, symbol in enumerate(listin):
        helpers.debug('[parser.py] __start() | ... \
symbol index: %d' % idx)
        helpers.debug('[parser.py] __start() | ... \
symbol label: %s' % symbol['label'])
        print(
            '[parser.py]__start() | ... \
symbol centroid: ', symbol['centroid'])
        cx = symbol['centroid'][0]
        cy = symbol['centroid'][1]
        if not symbol['checked'] and \
                left <= cx <= right and top <= cy <= bottom:
            leftmost = idx
            break
    helpers.debug('[parser.py] __start() | \
leftmostIndex: %d' % leftmost)
    if leftmost == -1:
        return leftmost
    return self.__overlap(leftmost, top, bottom, listin)
def to_parse(self):
    """Full parse pipeline: structural analysis, latex organisation,
    then grammar checking.

    Returns the grammar-check dict augmented with 'latex_before_cg',
    'tree' and 'tlist', or None when structural analysis yields nothing.
    """
    helpers.debug('[parser.py] to_parse()')
    try:
        structured_data = sa.StructuralAnalysis(self.symbols).analyze()
        if not structured_data:
            return
        latex_data = self.organize_latex_data(structured_data['latex'])
        check_grammar_data = cg.CheckGrammar().check(latex_data)
        data = dict(check_grammar_data)
        data['latex_before_cg'] = structured_data['latex']
        data['tree'] = structured_data['tree']
        data['tlist'] = structured_data['tlist']
        return data
    except Exception as e:
        print('error')
        raise e
def treatment(self, img):
    """Load (path or in-memory image), resize, normalize and segment.

    ``img`` may be a filesystem path (read with cv2.imread) or an
    already-loaded image. Returns the segmentation result, or [] when
    anything fails.
    """
    helpers.debug('[preprocessing.py] treatment()')
    try:
        # Accept either a filesystem path or an already-loaded image.
        if type(img) is str:
            image = cv2.imread(img)
        else:
            image = img
        # FIX: removed the unused `original = image.copy()` — it copied
        # the full image and was never read.
        image = self.resize_full_image(image)
        normalized = self.normalize(image)
        self.image = normalized.copy()
        symbols = self.segment(normalized)
        return symbols
    except BaseException as e:
        # Broad catch kept deliberately: treatment() is best-effort and
        # callers expect an empty list on any failure.
        print(e)
        return []
def normalize(self, image):
    """Grayscale+denoise the image, invert to the configured polarity,
    then apply the optional dilate/erode morphology from self.configs."""
    helpers.debug('[preprocessing.py] normalize()')
    # FIX: removed the dead `img = image.copy()` that was immediately
    # overwritten — to_gray_denoise() already copies its input.
    img = self.to_gray_denoise(image)
    if not self.configs['black']:
        img = self.invert(img)
    kernel = np.ones((2, 2), np.uint8)
    # .get() collapses the previous nested "key present AND truthy"
    # checks into one expression with identical behavior.
    if self.configs.get('dilate'):
        img = cv2.dilate(img, kernel, iterations=2)
    if self.configs.get('erode'):
        img = cv2.erode(img, kernel, iterations=1)
    return img
def __locate_grammar_error(self, yacc_error_list):
    """Build structured error records for each reported yacc error.

    For every error carrying a token value, scans ``self.latex``
    accumulating label lengths until the token at the reported ``lexpos``
    is reached, then records its string position, list position, label,
    prediction and an initial attempts list (seeded with the label
    itself). Returns (yacc_errors, yacc_errors_history).
    """
    helpers.debug("\n[check_grammar_sintax.py] __locate_grammar_error() | \
Locating all errors and creating a data structure.")
    yacc_error_list = yacc_error_list.copy()
    latex = self.latex.copy()
    yacc_errors = []
    yacc_errors_history = self.yacc_errors_history.copy()
    helpers.debug("[check_grammar_sintax.py] __locate_grammar_error() | \
Errors: {0}".format(yacc_error_list))
    for error in yacc_error_list:
        if error['value'] is not None:
            helpers.debug("[check_grammar_sintax.py] \
__locate_grammar_error() | ...for() value not none")
            count = 0       # running character offset into the latex string
            count_list = 0  # running index into the latex symbol list
            latex_error_pos = error['lexpos']
            latex_error_token = error['value']
            for symbol in latex:
                if symbol['label'] == latex_error_token and \
                        count == latex_error_pos:
                    yacc_errors.append({
                        'pos': latex_error_pos,
                        'pos_list': count_list,
                        'label': symbol['label'],
                        'prediction': symbol['prediction'],
                        # It adds itself as a attempt of solution
                        'attempts': [symbol['label']]
                    })
                    # NOTE(review): extending with the whole accumulated
                    # list re-adds earlier records when more than one
                    # error is located — confirm duplicates are intended.
                    yacc_errors_history.extend(yacc_errors)
                    break
                count += len(symbol['label'])
                count_list += 1
        else:
            # No token value: locating by position is impossible here.
            helpers.debug("Use automata to fix")
            continue
    return yacc_errors, yacc_errors_history
def __find_lexical_errors(self):
    """Run the lexical checker and fold its corrected results back into
    this instance's latex state and error histories."""
    helpers.debug("\n................FIND LEXICAL ERRORS................")
    checker = check_grammar_lex.CheckLex()
    checker.latex_string = self.latex_string
    checker.latex = self.latex
    checker.latex_list = self.latex_list
    checker.attempts = 0
    check_lex_data = checker.check_correct_lex()
    # Reaching this point means every lexical error was solved.
    self.latex = check_lex_data['latex']
    self.latex_list = check_lex_data['latex_list']
    self.latex_string = check_lex_data['latex_string']
    self.pure_lex_errors = check_lex_data['pure_errors']
    self.lex_errors_history.extend(check_lex_data['errors_history'])
    helpers.debug("...................................................")
def __attempt_to_fix_error(self, lex_errors):
    """Attempt to fix the FIRST lexical error, then re-run the check.

    Updates the error list, error history, latex string and attempt
    counter before recursing into check_correct_lex().
    """
    helpers.debug("[check_grammar_lex.py] self.__attempt_to_fix_error() \
| Tries to fix the error.")
    corrector = correct_grammar.CorrectGrammar()
    # lex_errors: current error; self.lex_errors_history: all errors.
    corrected_data = corrector.correct_grammar_lex(
        lex_errors, self.latex, self.latex_list, 0,
        self.lex_errors_history)
    updated_string = corrected_data['latex_string']
    # Error list updated with the new attempt.
    self.lex_error_list = corrected_data['errors']
    self.index = corrected_data['index']
    if self.lex_error_list:
        # Errors remain: snapshot them as the new history.
        self.lex_errors_history = self.lex_error_list.copy()
    helpers.debug("[check_grammar_lex.py] self.__attempt_to_fix_error() | \
Updated lex error: {0}".format(self.lex_error_list))
    helpers.debug("[check_grammar_lex.py] self.__attempt_to_fix_error() | \
Updated lex error history: {0}".format(self.lex_errors_history))
    self.latex_string = updated_string
    self.attempts += 1
    return self.check_correct_lex()
def recur_get_new_index(pred):
    """Pick the next-best prediction index whose identification has not
    been tried yet, recursing past previously attempted ones.

    Returns (new_index, pred, new_identification) with braces escaped.
    Relies on the enclosing scope for helpers_labels, json_label, labels,
    errors, index and previous_attemptions.
    """
    new_index, pred = get_new_index(pred)
    recog_key = helpers_labels[json_label][str(new_index)]
    resolved_label = helpers_labels["labels_recognition"][recog_key]
    new_identification = labels[resolved_label]
    already_tried = (
        new_identification in errors[index]['attempts']
        or new_identification in previous_attemptions
    )
    if already_tried:
        helpers.debug("[base_grammar.py] \
correct_grammar_lex() | \
New index is in previous attempts. Getting next.")
        return recur_get_new_index(pred)
    # Escape literal braces so they survive the latex grammar.
    if new_identification == '{':
        new_identification = '\\{'
    if new_identification == '}':
        new_identification = '\\}'
    return new_index, pred, new_identification
def __change_label(i, label, substitution_list):
    """Try each substitution alternative starting at position ``i``;
    when one fully matches, commit its labels into ``latex``.

    Closes over ``latex`` and ``__list_substitution`` from the
    enclosing scope. Returns the (possibly advanced) index.
    """
    for substitution_index in range(0, len(substitution_list)):
        nomatch = False
        aux = []
        helpers.debug('[parser.py] __list_to_latex_obj() | \
...substitutions: ')
        helpers.debug(substitution_list[substitution_index])
        initial_index = i
        i, nomatch = __list_substitution(i, nomatch, aux, initial_index,
                                         label, substitution_list,
                                         substitution_index)
        if nomatch:
            continue
        helpers.debug('[parser.py] __list_to_latex_obj() \
| match - updating value ')
        for matched in aux:
            latex[matched['index']]['label'] = matched['label']
    return i
def __attempt_to_fix_error(self, yacc_errors):
    """Attempt to fix the FIRST syntactic error, then re-run the check.

    Combines lex and yacc attempt histories so the corrector never
    repeats a fix already tried, then updates state and recurses into
    check_correct_grammar().
    """
    helpers.debug('[check_grammar_sintax.py] __attempt_to_fix_error()')
    corrector = correct_grammar.CorrectGrammar()
    # Join Lex and Yacc solution attempts.
    fix_attempts = self.lex_errors_history.copy()
    fix_attempts.extend(self.yacc_errors_history)
    corrected_data = corrector.correct_grammar_lex(
        yacc_errors, self.latex, self.latex_list, 0, fix_attempts)
    updated_latex_string = corrected_data['latex_string']
    # Error list updated with the new attempt.
    self.yacc_error_list = corrected_data['errors']
    self.index = corrected_data['index']
    if self.yacc_error_list:
        # Errors remain: snapshot them as the new history.
        self.yacc_errors_history = self.yacc_error_list.copy()
    helpers.debug("[check_grammar_yacc.py] \
self.__attempt_to_fix_error() | \
Updated yacc error: {0}".format(self.yacc_error_list))
    helpers.debug("[check_grammar_yacc.py] \
self.__attempt_to_fix_error() | \
Updated yacc error history: {0}".format(self.yacc_errors_history))
    self.latex_string = updated_latex_string
    self.attempts += 1
    return self.check_correct_grammar()
def __main_parsing(self, symbols):
    """Baseline-structure parsing of the recognized symbols.

    A queue ``Q`` drives main-baseline extraction: each dequeued symbol
    is chained rightwards via __hor(), with wall limits propagated and
    tightened along the chain. A stack ``S``, delimited by "EOBL"
    (end-of-baseline) markers, then drives region analysis: for each
    baseline symbol the above/below and contains/super/subsc regions are
    searched with __start(), and hits are enqueued as new (secondary)
    baselines. Returns the resulting tree ``T``, or None when no
    starting symbol exists.
    """
    helpers.debug('\n[parser.py] __main_parsing()')
    listin = symbols
    T = DS.Tree()    # result tree
    Q = DS.Queue()   # pending (symbol index, parent node) pairs
    S = DS.Stack()   # baseline symbols awaiting region analysis
    temp1 = 0
    temp2 = 0
    # Whole-image region: every symbol is a candidate for the start.
    R = [[0, 0], [9999999999, 9999999999]]
    sstart = self.__sp(listin, R)
    if sstart == -1:
        return
    helpers.debug('\n[parser.py] __main_parsing() | \
STARTING symbol index: %d ' % sstart)
    helpers.debug('[parser.py] __main_parsing() | \
STARTING symbol label: %s ' % listin[sstart]['label'])
    s = listin[sstart]
    Q.enqueue(sstart)
    Q.enqueue(T.root_node)
    listin[sstart]['checked'] = True
    while not Q.is_empty():
        '''
        abc^{2-1}
        =============    ->
        | EOBL | a | c |
        =============
        | b |
        | a |
        '''
        '''
        abc^{2-1]
        =============    ->
        |  | 2 |  |
        =============
        | - |
        | 2 |
        '''
        helpers.debug('\n[parser.py] __main_parsing() \
| find main baseline')
        while not Q.is_empty():
            # Pop a (symbol index, parent tree node) pair.
            temp1 = Q.dequeue()  # a, 2
            ParentNode = Q.dequeue()
            SymbolNode = DS.SymbolNode(listin[temp1])
            T.insert(SymbolNode, ParentNode, 'Node')
            S.push(temp1)  # a, 2
            S.push(SymbolNode)
            print(
                '\n[parser.py] __main_parsing() | \
find baseline of symbol: ', temp1, listin[temp1]['label'])
            helpers.debug('\n[parser.py] __main_parsing() | \
temp2 hor...')
            # Next symbol along the same horizontal baseline.
            temp2 = self.__hor(listin, temp1)  # b, -
            print(
                '\n[parser.py] __main_parsing() | \
temp2: ', temp2, listin[temp2]['label'])
            while temp2 != -1:
                print('[parser.py] __main_parsing() | \
... while temp2')
                listin[temp2]['checked'] = True
                print(
                    '[parser.py] __main_parsing() | \
... wall attributes of temp1: ', listin[temp1]['wall'])
                # The chained symbol inherits its predecessor's walls.
                listin[temp2]['wall'] = listin[temp1]['wall'].copy()
                '''
                a.wall = -1 -1 9999 9999
                b.checked = true
                b.wall = a.wall (-1 -1 9999 9999)
                c.checked = true
                c.wall = b.wall
                ---------------------------------------------
                -.checked = true
                -.wall = 2.wall (wall da região super?)
                '''
                print(
                    '[parser.py] __main_parsing() | \
...wall attributes of temp2: ', listin[temp2]['wall'])
                SymbolNode = DS.SymbolNode(listin[temp2])
                T.insert(SymbolNode, ParentNode, 'Node')
                S.push(temp2)
                S.push(SymbolNode)
                # Close the predecessor's right wall at the new symbol.
                listin[temp1]['wall']['right'] = listin[temp2]['xmin']
                '''
                a.wall.right = b.xmin
                b.wall.right = c.xmin
                '''
                print(
                    '[parser.py] __main_parsing() | \
...updated wall attributes \
of temp1: ', listin[temp1]['wall'])
                temp1 = temp2  # b
                temp2 = self.__hor(listin, temp1)  # c - 1
                print(
                    '[parser.py] __main_parsing() | \
new temp2: ', temp2)
            # End-of-baseline marker consumed by the stack phase below.
            S.push("EOBL")
        helpers.debug('\n[parser.py] __main_parsing() \
| find secondary baseline')
        '''
        abc^2
        =============    ->
        | EOBL | 2 | c |
        =============
        | b |
        | a |
        '''
        while not S.is_empty():
            if S.peek() == "EOBL":
                S.pop()
            SymbolNode = S.pop()
            temp1 = S.pop()  # c
            helpers.debug('[parser.py] __main_parsing() \
| symbol: %s ' % temp1)
            label = int(listin[temp1]['label'])
            helpers.debug('[parser.py] __main_parsing() \
| temp1 label: %s' % label)
            # 1/6
            upperThreshold = listin[temp1]['ymin'] + \
                ((1/6.5) * listin[temp1]['h'])
            # 5/6
            lowerThreshold = listin[temp1]['ymin'] + \
                ((5.5/6.5) * listin[temp1]['h'])
            '''
            Changes in xmin and xmax because of the 'a'
            When it overlaps the fraction
            '''
            # Fraction bars (label 10) keep their full width; other
            # symbols shrink the search band by 1/6 on each side.
            leftThreshold = (
                listin[temp1]['xmin'] + ((1/6) * listin[temp1]['w'])
            ) \
                if label != 10 else listin[temp1]['xmin']
            rightThreshold = (
                listin[temp1]['xmax'] - ((1/6) * listin[temp1]['w'])
            ) \
                if label != 10 else listin[temp1]['xmax']
            R = [{
                'above': [[leftThreshold, listin[temp1]['wall']['top']],
                          [rightThreshold, upperThreshold]]
            }, {
                'below': [[leftThreshold, lowerThreshold],
                          [rightThreshold,
                           listin[temp1]['wall']['bottom']]]
            }]
            for region in R:
                # For each region, it looks for the initial symbol
                reg = region[list(region.keys())[0]]
                region_name = list(region.keys())[0]
                helpers.debug('\n[parser.py] __main_parsing() | \
região: %s' % region_name)
                # ( ) [ ] { } . * = neq + sqrt
                operators = bool(label in range(11, 17)
                                 or label in range(27, 31)
                                 or label == 17
                                 or label == 23)
                if (region_name == 'above' and not operators) or \
                        (region_name == 'below' and not operators):
                    temp2 = self.__start(listin, reg)
                    if temp2 != -1:
                        if not listin[temp2]['checked']:
                            listin[temp2]['checked'] = True
                            listin[temp2]['wall']['left'] = reg[0][0]
                            listin[temp2]['wall']['right'] = reg[1][0]
                            listin[temp2]['wall']['top'] = reg[0][1]
                            listin[temp2]['wall']['bottom'] = reg[1][1]
                            RelationNode = DS.RegionNode(
                                list(region.keys())[0])
                            T.insert(RelationNode, SymbolNode, 'Node')
                            Q.enqueue(temp2)
                            Q.enqueue(RelationNode)
            '''
            Changes in xmin and xmax because of the 'a'
            When it overlaps the fraction
            '''
            R = [
                {
                    'contains': [
                        # left, top
                        [listin[temp1]['xmin'], listin[temp1]['ymin']],
                        # right, bottom
                        [listin[temp1]['xmax'], listin[temp1]['ymax']]
                    ]
                },
                {
                    'super': [
                        # left, top
                        [rightThreshold, listin[temp1]['wall']['top']],
                        # right, bottom
                        [listin[temp1]['wall']['right'], upperThreshold]
                    ]
                },
                {
                    'subsc': [
                        # left, top
                        [rightThreshold, lowerThreshold],
                        # right, bottom
                        [
                            listin[temp1]['wall']['right'],
                            listin[temp1]['wall']['bottom']
                        ]
                    ]
                }
            ]
            for region in R:
                # For each region, it looks for the initial symbol
                reg = region[list(region.keys())[0]]
                region_name = list(region.keys())[0]
                helpers.debug('\n[parser.py] __main_parsing() | \
região: %s' % region_name)
                # - ( ) [ ] { } . * = neq +
                operators = bool(label == 10
                                 or label in range(27, 31)
                                 or label == 17)
                # 'contains' only applies to sqrt (label 23).
                if (region_name == 'super' and not operators) or \
                        (region_name == 'subsc' and not operators) or \
                        (region_name == 'contains' and
                         int(listin[temp1]['label']) == 23):
                    temp2 = self.__start(listin, reg)
                    if temp2 != -1:
                        if not listin[temp2]['checked']:
                            listin[temp2]['checked'] = True
                            listin[temp2]['wall']['left'] = reg[0][0]
                            listin[temp2]['wall']['right'] = reg[1][0]
                            listin[temp2]['wall']['top'] = reg[0][1]
                            listin[temp2]['wall']['bottom'] = reg[1][1]
                            RelationNode = DS.RegionNode(
                                list(region.keys())[0])
                            T.insert(RelationNode, SymbolNode, 'Node')
                            Q.enqueue(temp2)
                            Q.enqueue(RelationNode)
    return T
def __overlap(self, symbolIndex, top, bottom, listin):
    """Resolve horizontal overlap for a candidate starting symbol.

    Among the unchecked horizontal-line symbols (label '10') whose
    x-range overlaps ``listin[symbolIndex]`` and whose centroid lies in
    the [top, bottom] band, picks the widest one as the dominating main
    line (e.g. a fraction bar under a numerator). Returns that line's
    index, or ``symbolIndex`` itself when no such line exists.
    """
    helpers.debug('\n\n[parser.py] __overlap()')
    listIndex = symbolIndex
    stop = False
    n = len(listin)
    helpers.debug('[parser.py] __overlap() | listIndex: %d ' % listIndex)
    # A horizontal line starts as its own best candidate width.
    if listin[symbolIndex]['label'] == '10':
        maxLength = listin[symbolIndex]['xmax'] - listin[symbolIndex][
            'xmin']
    else:
        maxLength = -1
    mainLine = -1
    helpers.debug('[parser.py] __overlap() | \
mainLine: %d ' % mainLine)
    helpers.debug('[parser.py] __overlap() | \
maxLength: %d ' % maxLength)
    # Walk left while predecessors start at or before this symbol
    # (listin is sorted by xmin), so overlapping neighbours are scanned.
    while listIndex > 0 and stop == False:
        print('[parser.py] __overlap() | xmin, xmin',
              listin[listIndex - 1]['xmin'], listin[symbolIndex]['xmin'])
        if listin[listIndex - 1]['xmin'] <= listin[symbolIndex]['xmin']:
            listIndex = listIndex - 1
            # stop = True
        else:
            stop = True
            # listIndex = listIndex - 1
    helpers.debug('[parser.py] __overlap() | \
listIndex: %d ' % listIndex)
    helpers.debug('[parser.py] __overlap() | \
n: %d ' % n)
    helpers.debug('[parser.py] __overlap() | \
top: %d ' % top)
    helpers.debug('[parser.py] __overlap() | \
bottom: %d ' % bottom)
    line1x = range(listin[symbolIndex]['xmin'],
                   listin[symbolIndex]['xmax'] + 1)
    len_line1x = len(line1x)
    # Scan rightwards over every symbol whose x-range can still overlap.
    while listIndex < n and \
            listin[listIndex]['xmin'] < listin[symbolIndex]['xmax']:
        line2x = range(listin[listIndex]['xmin'],
                       listin[listIndex]['xmax'] + 1)
        len_line2x = len(line2x)
        # Build the set from the shorter range to keep the
        # intersection cheap.
        x_set = set(line1x) if len_line1x < len_line2x else set(line2x)
        x_intersection = x_set.intersection(
            line1x if len_line1x >= len_line2x else line2x)
        min_line = min(len_line1x, len_line2x)
        print(
            '\n[parser.py] __overlap() | ... \
listIndex: ', listIndex)
        print('[parser.py] __overlap() | ... \
label: ', listin[listIndex]['label'])
        print('[parser.py] __overlap() | ... \
centroid: ', listin[listIndex]['centroid'])
        print('[parser.py] __overlap() | ... \
xmin: ', listin[listIndex]['xmin'])
        print('[parser.py] __overlap() | ... \
xmax: ', listin[listIndex]['xmax'])
        print(
            '[parser.py] __overlap() | ... \
max length: ', (listin[listIndex]['xmax'] -
                listin[listIndex]['xmin']))
        print(
            '[parser.py] __overlap() | ... \
len(x_intersection): ', len(x_intersection))
        print(
            '[parser.py] __overlap() | ... \
min_line/2: ', min_line / 2)
        # Candidate must: be unchecked, be a horizontal line, sit inside
        # the vertical band, start within 8px of this symbol, overlap
        # more than half of the shorter range, and be the widest so far.
        if not listin[listIndex]['checked'] and \
                listin[listIndex]['label'] == '10' and \
                listin[listIndex]['centroid'][1] >= top and \
                listin[listIndex]['centroid'][1] <= bottom and \
                listin[listIndex]['xmin'] <= (listin[symbolIndex]['xmin'] + 8) and \
                len(x_intersection) > (min_line/2) and \
                (listin[listIndex]['xmax'] - listin[listIndex]['xmin']) > maxLength:
            maxLength = (listin[listIndex]['xmax'] -
                         listin[listIndex]['xmin'])
            mainLine = listIndex
        listIndex += 1
    helpers.debug('[parser.py] __overlap() | listIndex: %d ' % listIndex)
    helpers.debug('[parser.py] __overlap() | mainLine: %d ' % mainLine)
    helpers.debug('[parser.py] __overlap() | maxLength: %d ' % maxLength)
    if mainLine == -1:
        return symbolIndex
    else:
        return mainLine
def invert(self, image):
    """Return the photometric negative (255 - pixel) of ``image``."""
    helpers.debug('[preprocessing.py] invert()')
    negative = 255 - image.copy()
    return negative
def __sp(self, listin, R):
    """Thin alias for __start(): find the starting symbol inside R."""
    helpers.debug('\n[parser.py] __sp()')
    result = self.__start(listin, R)
    return result
def __hor(self, listin, index):
    """Find the next symbol on the same horizontal baseline as
    ``listin[index]``.

    Builds a search band from the symbol's walls and bounding box
    (special-cased for lines/brackets and sqrt), then resolves the
    first hit through __overlap(). Returns the resolved index, or -1
    when nothing follows on the baseline.
    """
    print('\n[parser.py] __hor()')
    print('[parser.py] __hor() | symbol index: ', index)
    print('[parser.py] __hor() | symbol label: ', listin[index]['label'])
    # FIX: `stop` and `a` were declared `global` although they are reset
    # on every call and only read inside this function — they are now
    # plain locals, removing accidental module-level state.
    stop = False
    a = -1
    label = int(listin[index]['label'])
    right = listin[index]['wall']['right']  # to avoid get symbols behind
    left = listin[index]['xmin']
    # to treat expoent and subscript
    # 1/6
    top = listin[index]['ymin'] + (listin[index]['h'] * (1 / 6.5))
    # 5/6
    bottom = listin[index]['ymin'] + (listin[index]['h'] * (5.5 / 6.5))
    # it doesn't have expoent and subscript
    if label == 10 or label in [27, 28, 29, 30]:
        top = listin[index]['wall']['top']
        bottom = listin[index]['wall']['bottom']
    # if it is square root, the left wall is xmax
    if label == 23:
        left = listin[index]['xmax']
    # if it is horizontal line or brackets
    if label in range(10, 17):
        R = [[listin[index]['xmax'], top], [right, bottom]]
        print('[parser.py] __hor() | R', R)
        a = self.__start(listin, R)
        stop = True
    else:
        helpers.debug('[parser.py] __hor() | top: %d, bottom: %d, \
left: %d, right: %d' % (top, bottom, left, right))
        for s in range(0, len(listin)):
            checked = listin[s]['checked']
            if not checked:
                symbol = listin[s]
                helpers.debug('[parser.py] __hor() | ... \
symbol: %s' % symbol['label'])
                helpers.debug('[parser.py] __hor() | ... \
symbol centroid: %s ' % symbol['centroid'])
                helpers.debug('[parser.py] __hor() | ... \
symbol coordinates: xmin: %s xmax: %s \
ymin: %s ymax: %s' % (symbol['xmin'], symbol['xmax'],
                      symbol['ymin'], symbol['ymax']))
                if symbol['centroid'][0] >= left and \
                        symbol['centroid'][0] <= right and \
                        symbol['centroid'][1] <= bottom and \
                        symbol['centroid'][1] >= top:
                    helpers.debug('[parser.py] __hor() | \
......... founded: %s' % s)
                    a = s
                    stop = True
                    break
    helpers.debug('[parser.py] __hor() | a: %d ' % a)
    if a != -1:
        helpers.debug('[parser.py] __hor() | \
a label: %s ' % listin[a]['label'])
    if stop and a != -1:
        helpers.debug('[parser.py] __hor() | ... \
before overlap')
        return self.__overlap(a, listin[a]['wall']['top'],
                              listin[a]['wall']['bottom'], listin)
    else:
        return -1
def segment(self, img):
    """Segment the normalized image into individual symbol crops.

    Each external contour above the noise threshold is masked out,
    cropped, resized to 28x28, binarized and scaled to 0–1, producing a
    dict with the image and bounding-box geometry.
    Returns (symbols, self.image).
    """
    helpers.debug('[preprocessing.py] segment()')
    image = img.copy()
    symbols = []
    # NOTE(review): 2-tuple unpacking matches OpenCV 4.x
    # (contours, hierarchy); OpenCV 3.x returned 3 values — confirm the
    # pinned cv2 version.
    cnts, somethingElse = cv2.findContours(image.copy(),
                                           cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
    helpers.debug('[preprocessing.py] segment() | contours founded:')
    helpers.debug(len(cnts))
    for i in range(len(cnts)):
        # It was 0 and was changed to 10 to try reducing the noise
        if (cv2.contourArea(cnts[i]) < 10):
            continue
        # Dataset mode expects exactly one symbol per image.
        if (self.configs['dataset'] and len(cnts) > 1):
            continue
        try:
            # Draw contour in new image (mask)
            mask = np.zeros_like(image)
            cv2.drawContours(mask, cnts, i, (255, 255, 255), -50)
            out = np.zeros_like(image)
            '''
            At the position where the mask is 255 it paints the
            normalized position with the same positions from mask
            were it is 255
            I.e. mask has the inner part of "2" painted, but the
            normalized doesn't have it.
            It was changed to > 0 instead of 255
            this prevents the image from having to be binarized
            '''
            out[mask > 0] = image[mask > 0]
            # # Get bounding box coordinates
            _x, _y, _w, _h = cv2.boundingRect(cnts[i])
            # For now, it's worthless
            # ALL points where the mask == 255
            # list_y, list_x = np.where(out > 0)
            # (topx, topy) = (np.min(list_x), np.min(list_y))
            # (bottomx, bottomy) = (np.max(list_x), np.max(list_y))
            # Crop the image
            ycrop = _y + _h + 1
            xcrop = _x + _w + 1
            cropped = out[_y:ycrop, _x:xcrop]
            resized = self.resize(cropped)
            # Test - It was not here during validation
            binarized = self.binarization(resized)
            result_image = self._255_to_1(binarized)
            # result_image = self._255_to_1(resized)
            helpers.show_image(result_image)
            attributes = {
                'index': i,
                'image': result_image.copy(),
                'xmin': _x,
                'xmax': _x + _w,
                'ymin': _y,
                'ymax': _y + _h,
                'w': _w,
                'h': _h,
                'centroid': [(_x + (_x + _w)) / 2, (_y + (_y + _h)) / 2]
            }
            symbols.append(attributes)
            # Drop references to the large intermediates before the
            # next iteration.
            mask = None
            out = None
            cropped = None
            resized = None
            binarized = None
            result_image = None
            # self.image = self.print_bounding_box(image, (_x, _y, _w, _h))
            self.image = image
        except BaseException as e:
            # Best-effort: a bad contour is logged and skipped.
            print(e)
            continue
    return (symbols, self.image)
def __token_substitution(self, latex, grammar, subst):
    """Rewrite runs of tokens in ``latex`` according to ``subst``.

    ``grammar`` maps raw labels to substitution-table keys; each key in
    ``subst`` gives a list of candidate multi-token patterns. The first
    pattern that fully matches from the current position has its
    replacement labels written back into ``latex`` in place.
    Returns ``latex``.
    """

    def __list_substitution(i, nomatch, aux, initial_index, label,
                            substitution_list, substitution_index):
        # Try to match one candidate pattern token-by-token from
        # position ``i``; collected replacements accumulate in ``aux``.
        for substitution in substitution_list[substitution_index]:
            helpers.debug('[parser.py] __list_to_latex_obj() | \
......substitution: ')  # subsc
            helpers.debug('[parser.py] __list_to_latex_obj() | \
......latex index: %d ' % i)  # subsc
            helpers.debug(substitution)  # subsc
            try:
                helpers.debug('[parser.py] __list_to_latex_obj() | \
......current latex: %s ' % latex[i]['label'])
                # latex[i]['label'] = subst[subs][substitution]
                if latex[i]['label'] == substitution:
                    helpers.debug('[parser.py] __list_to_latex_obj() | \
......match ')
                    helpers.debug('[parser.py] __list_to_latex_obj() | \
......from: %s %s ' % (latex[i]['label'], substitution))
                    helpers.debug(
                        '[parser.py] __list_to_latex_obj() | \
......to: %s '
                        % substitution_list[substitution_index][substitution]
                    )
                    aux.append({
                        "index": i,
                        "label":
                            substitution_list[substitution_index][substitution]
                    })
                    i += 1
                else:
                    helpers.debug('[parser.py] __list_to_latex_obj() | \
......no match ')
                    i -= 1
                    nomatch = True
            except IndexError as e:
                # Ran past the end of ``latex``: pattern cannot match.
                helpers.debug('[parser.py] __list_to_latex_obj() | \
......no match: IndexError ')
                nomatch = True
                break
            helpers.debug('[parser.py] __list_to_latex_obj() | \
...... no match value: %s ' % nomatch)
            if nomatch:
                # Rewind and give up on this candidate pattern.
                i = initial_index
                helpers.debug('[parser.py] __list_to_latex_obj() | \
...... continue ......')
                break
            helpers.debug('[parser.py] __list_to_latex_obj() | \
...... next ......')
        return i, nomatch

    def __change_label(i, label, substitution_list):
        # Try every candidate pattern for ``label``; commit the matched
        # replacements into ``latex`` when one succeeds.
        for substitution_index in range(0, len(substitution_list)):
            nomatch = False
            aux = []
            helpers.debug('[parser.py] __list_to_latex_obj() | \
...substitutions: ')
            helpers.debug(substitution_list[substitution_index])
            initial_index = i
            i, nomatch = __list_substitution(i, nomatch, aux,
                                             initial_index, label,
                                             substitution_list,
                                             substitution_index)
            if not nomatch:
                helpers.debug('[parser.py] __list_to_latex_obj() \
| match - updating value ')
                for matched in aux:
                    latex[matched['index']]['label'] = matched['label']
        return i

    for i in range(0, len(latex)):
        if latex[i]['label'] in grammar:
            label = grammar[latex[i]['label']]
            if label in subst:
                substitution_list = subst[label]  # list of substitutions
                helpers.debug('[parser.py] __list_to_latex_obj() \
| substitution_list: ')
                helpers.debug(substitution_list)
                # NOTE(review): this assignment is overwritten by the
                # next range() iteration, so the advanced position
                # returned by __change_label is effectively discarded —
                # confirm whether re-scanning matched tokens is intended.
                i = __change_label(i, label, substitution_list)
    return latex
def __preprocessing(self, symbols):
    """Sort symbols left-to-right and annotate each with its type
    (Open/Close/Normal), an adjusted centroid with its class
    (Ascending/Descending/Centred), a checked flag and initial walls."""
    helpers.debug('[parser.py] preprocessing()')
    symbols = sorted(symbols, key=lambda item: item['xmin'])
    open_labels = ['11', '13', '15']
    close_labels = ['12', '14', '16']
    for s in symbols:
        s['centroid'] = list(s['centroid'])
        s['checked'] = False
        if s['label'] in open_labels:
            s['type'] = 'Open'
        elif s['label'] in close_labels:
            s['type'] = 'Close'
        else:
            s['type'] = 'Normal'
        # The raw centroid sat too low or too high for some symbol
        # classes, so it is shifted to 1/3 or 2/3 of the box height.
        # Until validation: [0-9], b. After validation: (), {}, [], sqrt.
        is_digit = re.search("^[0-9]$", str(s['label'])) is not None
        if is_digit or s['label'] == '19' or s['label'] == '23' or \
                s['type'] == 'Open' or s['type'] == 'Close':
            s['centroid_class'] = 'Ascending'
            s['centroid'][1] = s['ymin'] + (2 / 3) * (s['h'])  # 3/5
        elif s['label'] == '25':
            # Until validation: y sqrt ( { [ — after validation: y sqrt.
            s['centroid_class'] = 'Descending'
            s['centroid'][1] = s['ymin'] + (1 / 3) * s['h']
        else:
            s['centroid_class'] = 'Centred'
            s['centroid'][1] = s['ymin'] + ((s['h']) / 2)
        s['wall'] = {
            'top': -1,
            'bottom': 9999999999999,
            'left': -1,
            'right': 9999999999999
        }
    return symbols
def _255_to_1(self, image):
    """Scale a 0-255 image into the 0-1 range."""
    helpers.debug('[preprocessing.py] _255_to_1()')
    scaled = image.copy() / 255
    return scaled
def __list_substitution(i, nomatch, aux, initial_index, label,
                        substitution_list, substitution_index):
    """Try to match one candidate substitution pattern token-by-token
    starting at position ``i``.

    Closes over ``latex`` from the enclosing scope. Matched replacement
    labels accumulate in ``aux``; on any mismatch or IndexError the
    index is rewound to ``initial_index`` and ``nomatch`` is set.
    Returns (i, nomatch).
    """
    for substitution in substitution_list[substitution_index]:
        helpers.debug('[parser.py] __list_to_latex_obj() | \
......substitution: ')  # subsc
        helpers.debug('[parser.py] __list_to_latex_obj() | \
......latex index: %d ' % i)  # subsc
        helpers.debug(substitution)  # subsc
        try:
            helpers.debug('[parser.py] __list_to_latex_obj() | \
......current latex: %s ' % latex[i]['label'])
            # latex[i]['label'] = subst[subs][substitution]
            if latex[i]['label'] == substitution:
                helpers.debug('[parser.py] __list_to_latex_obj() | \
......match ')
                helpers.debug('[parser.py] __list_to_latex_obj() | \
......from: %s %s ' % (latex[i]['label'], substitution))
                helpers.debug(
                    '[parser.py] __list_to_latex_obj() | \
......to: %s '
                    % substitution_list[substitution_index][substitution]
                )
                aux.append({
                    "index": i,
                    "label":
                        substitution_list[substitution_index][substitution]
                })
                i += 1
            else:
                helpers.debug('[parser.py] __list_to_latex_obj() | \
......no match ')
                i -= 1
                nomatch = True
        except IndexError as e:
            # Ran past the end of ``latex``: the pattern cannot match.
            helpers.debug('[parser.py] __list_to_latex_obj() | \
......no match: IndexError ')
            nomatch = True
            break
        helpers.debug('[parser.py] __list_to_latex_obj() | \
...... no match value: %s ' % nomatch)
        if nomatch:
            # Rewind and give up on this candidate pattern.
            i = initial_index
            helpers.debug('[parser.py] __list_to_latex_obj() | \
...... continue ......')
            break
        helpers.debug('[parser.py] __list_to_latex_obj() | \
...... next ......')
    return i, nomatch
def resize(self, image):
    """Resize a segmented symbol so it fits in a 26x26 box, with special
    handling for horizontal lines (fraction bars) and sqrt glyphs, then
    pad with a black border to the final 28x28 classifier input size.

    Args:
        image: single-channel symbol crop (values in [0, 1] after
            binarization — centre-pixel tests compare against 0.0).

    Returns:
        The 28x28 processed image.
    """
    helpers.debug('[preprocessing.py] resize()')
    old_size = image.shape[:2]  # (height, width)
    height, width = old_size[0], old_size[1]
    ratio = float(26) / max(old_size)
    size = tuple([int(x * ratio) for x in old_size])
    size_height, size_width = size[0], size[1]
    # Guard against a dimension collapsing to 0 after scaling.
    size_height = size_height if size_height > 0 else 1
    size_width = size_width if size_width > 0 else 1
    division_height = int(height / 2)
    division_width = int(width / 2)
    around_w = round(width * 20 / 100)
    around_h = round(height * 20 / 100)
    # Sample a vertical strip of pixels through the centre column and a
    # horizontal strip through the centre row; a lit strip on a short,
    # wide crop indicates a horizontal line symbol.
    middle_height = []
    for a in range(division_height - around_h, division_height + around_h):
        middle_height.append(image[a][division_width])
    # BUG FIX: the original appended image[division_height][division_width]
    # (the same centre pixel) on every iteration, so the row was never
    # actually sampled; index the row with the loop variable instead.
    middle_width = []
    for b in range(division_width - around_w, division_width + around_w):
        middle_width.append(image[division_height][b])
    helpers.debug('[preprocessing.py] resize() | \
        before line and sqrt processing')
    if size_height <= 15 and size_width >= 20 and \
            (any(i > 0.0000 for i in middle_height) or
             any(i > 0.0000 for i in middle_width)):
        # Horizontal line: clamp the resized height into [4, 10].
        # BUG FIX: the original's second assignment
        # (`nsize = 10 if size_height > 10 else size_height`) clobbered
        # the first, so the lower clamp to 4 was dead code.
        nsize = max(4, min(10, size_height))
        helpers.debug('[preprocessing.py] resize() | hor %s' % size_height)
        new_size = tuple([int(nsize), 26])
    else:
        if size_width / size_height >= 2:
            # Wide rectangle (sqrt): thicken strokes, then crop away the
            # overline so only the radical glyph remains.
            kernel = np.ones((2, 2), np.uint8)
            image = cv2.dilate(image, kernel, iterations=7)
            # xinit = int(width * 2 / 100)  # validation
            # xend = int(width * 65 / 100)  # validation
            xinit = int(width * 5 / 100)
            xend = int(width * 85 / 100)
            image = image[0:height, xinit:xend]
        new_size = size
    helpers.debug('[preprocessing.py] resize() | \
        after line and sqrt processing')
    helpers.debug('[preprocessing.py] resize() | \
        before resize')
    # cv2.resize takes (width, height), hence the reversed tuple.
    if self.configs['resize'] == 'smaller':
        image = cv2.resize(image.copy(), (new_size[1], new_size[0]),
                           interpolation=cv2.INTER_AREA)
    elif self.configs['resize'] == 'bigger':
        image = cv2.resize(image.copy(), (new_size[1], new_size[0]),
                           interpolation=cv2.INTER_LINEAR)
    # NOTE(review): any other configs['resize'] value skips the resize but
    # still uses new_size for the border math below — confirm intended.
    helpers.debug('[preprocessing.py] resize() | after resize')
    # Add a black border around the symbol and normalize to 28x28 px.
    helpers.debug('[preprocessing.py] resize() | before border')
    delta_w = 28 - new_size[1]
    delta_h = 28 - new_size[0]
    top, bottom = delta_h // 2, delta_h - (delta_h // 2)
    left, right = delta_w // 2, delta_w - (delta_w // 2)
    color = [0, 0, 0]
    image = cv2.copyMakeBorder(image.copy(), top, bottom, left, right,
                               cv2.BORDER_CONSTANT, value=color)
    helpers.debug('[preprocessing.py] resize() | after border')
    return image
def check_correct_lex(self):
    """Check and correct lex errors

    Runs the latex lexer over ``self.latex_string`` and, across repeated
    calls (bounded by ``self.attempts``), tries to locate and fix lexical
    errors, accumulating every raw error in ``self.pure_lex_errors``.

    Args:
        latex_string (str): Latex string.
        latex (list): First latex structure.
        latex_list (list): [description]

    Returns:
        {
            'latex': self.latex,
            'latex_list': self.latex_list,
            'latex_string': self.latex_string,
            'lex_errors_history': self.lex_errors_history,
            'lex_pure_errors': self.pure_lex_errors
        }

    Raises:
        LexicalError: when errors persist after 3 attempts, or when
            there is no latex string to lex at all.
    """
    helpers.debug("\n[check_grammar_lex.py] check_correct_lex() | \
        attempts: %s" % self.attempts)
    second_lex_error_list = None
    lex_errors = []
    # Branch 1: first pass — no error recorded yet, attempts remain.
    if not self.lex_error_list and \
            self.__first_error and \
            self.attempts < 3 and \
            self.latex_string:
        helpers.debug("\n[check_grammar_lex.py] check_correct_lex() | \
            There's no previous error. Searching the first one.")
        lex_error_list = lex.LatexLexer(self.latex_string)
        if lex_error_list:
            self.pure_lex_errors.extend(lex_error_list)
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                pure_lex_errors: {0}".format(self.pure_lex_errors))
            # Map raw lexer errors back onto latex positions.
            lex_errors, lex_errors_history = self.__locate_lex_error(
                lex_error_list)
            self.lex_error_list = lex_errors
            self.lex_errors_history = lex_errors_history
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                lex_error_list: {0}".format(self.lex_error_list))
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                lex_errors_history: {0}".format(self.lex_errors_history))
            self.__first_error = False
            self.__attempt_to_fix_error(lex_errors)
    # Branch 2: follow-up pass — a previous error exists; re-lex to see
    # whether the fix worked or produced new errors.
    elif self.lex_error_list and \
            not self.__first_error and \
            self.attempts < 3 and \
            self.latex_string:
        helpers.debug("\n[check_grammar_lex.py] check_correct_lex() | \
            There's previous error. Searching for new errors.")
        second_lex_error_list = lex.LatexLexer(self.latex_string)
        if second_lex_error_list:
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                New errors found.")
            self.pure_lex_errors.extend(second_lex_error_list)
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                pure_lex_errors: {0}".format(self.pure_lex_errors))
            '''
            Hmm... double-check this.
            If new error is EOF error
            Remove the error from the list.
            Takes the next one.
            '''
            # Position -1 marks an EOF error; drop the first entry
            # (reverse/pop/reverse removes index 0).
            if second_lex_error_list[0][1] == -1:
                second_lex_error_list.reverse()
                second_lex_error_list.pop()
                second_lex_error_list.reverse()
            lex_errors, lex_errors_history = self.__locate_lex_error(
                second_lex_error_list)
            self.lex_error_list = lex_errors
            self.lex_errors_history = lex_errors_history
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                lex_error_list: {0}".format(self.lex_error_list))
            helpers.debug("[check_grammar_lex.py] check_correct_lex() | \
                lex_errors_history: {0}".format(self.lex_errors_history))
            self.__attempt_to_fix_error(lex_errors)
    # Branch 3: give up — attempts exhausted with errors remaining, or
    # there is nothing to lex.
    elif (self.lex_error_list and self.attempts >= 3) or \
            not self.latex_string:
        raise LexicalError({
            'latex': self.latex,
            'latex_list': self.latex_list,
            'latex_string': self.latex_string,
            'error': self.lex_error_list,  # Current error
            'errors_history': self.lex_errors_history,
            'pure_errors': self.pure_lex_errors
        })
    return {
        'latex': self.latex,
        'latex_list': self.latex_list,
        'latex_string': self.latex_string,
        'errors_history': self.lex_errors_history,
        'pure_errors': self.pure_lex_errors
    }