def add_adjacent_tuple_information(self, tuples, tuples_with_adjacent_info):
    final_tuple_list = []
    change = False

    for tuple_index, current_tuple in enumerate(tuples):
        tuplec = current_tuple[:]
        tuplec_with_adjacent_info = tuples_with_adjacent_info[tuple_index][:]
        tuplec_low_end = tuplec[0]
        tuplec_high_end = tuplec[len(tuplec) - 1]
        tupleca_low_end = tuplec_with_adjacent_info[0]
        tupleca_high_end = tuplec_with_adjacent_info[len(tuplec_with_adjacent_info) - 1]

        # fill a blank low end with the adjacent info, if that info is usable
        if tuplec_low_end == ' ':
            if tupleca_low_end is not None and tupleca_low_end != ' ':
                tuplec = Random.replace_value_in_tuple(tuplec, tupleca_low_end, 0)
                change = True
        # same for a blank high end
        if tuplec_high_end == ' ':
            if tupleca_high_end is not None and tupleca_high_end != ' ':
                tuplec = Random.replace_value_in_tuple(tuplec, tupleca_high_end, len(tuplec) - 1)
                change = True

        final_tuple_list.append(tuplec)

    return final_tuple_list, change
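# Hedged sketch (not part of the original module): illustrates what the adjacent-info merge
# above is assumed to do, with a local _replace_value_in_tuple stand-in for
# Random.replace_value_in_tuple. Names and behavior here are assumptions for illustration.
def _replace_value_in_tuple(tup, value, index):
    # rebuild the tuple with one position swapped out
    items = list(tup)
    items[index] = value
    return tuple(items)

def _demo_adjacent_fill():
    base = (' ', 'a', 'b', ' ')        # both ends are blanks
    adjacent = ('x', 'a', 'b', 'y')    # the adjacent window carries the missing end characters
    filled = _replace_value_in_tuple(base, adjacent[0], 0)
    filled = _replace_value_in_tuple(filled, adjacent[-1], len(filled) - 1)
    assert filled == ('x', 'a', 'b', 'y')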
def obtain_line_info(self, best_index, other_indices):
    line_1 = self._set_lines[other_indices[0]]
    line_2 = self._set_lines[best_index]  # should be best
    line_3 = self._set_lines[other_indices[1]]
    text_1 = self.get_line_content(line_1)
    text_2 = self.get_line_content(line_2)  # should be best
    text_3 = self.get_line_content(line_3)
    self._cpr.print("ocr_set:")
    self._cpr.print("text_A", text_1)
    self._cpr.print("text_B", text_2)
    self._cpr.print("text_C", text_3)

    line_1_ok = not Random.is_false_true_or_none(line_1)
    line_2_ok = not Random.is_false_true_or_none(line_2)
    line_3_ok = not Random.is_false_true_or_none(line_3)
    ok_lines = [line_1_ok, line_2_ok, line_3_ok]

    ok_indices = []
    for ok_index, ok in enumerate(ok_lines):
        if ok is True:
            ok_indices.append(ok_index)
    ok_len = len(ok_indices)

    texts_return = [text_1, text_2, text_3]
    lines_return = [line_1, line_2, line_3]
    lines_return_ok = [line_1_ok, line_2_ok, line_3_ok]
    return texts_return, lines_return, lines_return_ok, ok_len
def calculate_ld_information_tesseract(self, tesseract_page):
    # accumulators for the overall results
    overall_ldist = 0
    overall_gap_height = 0
    overall_line_height = 0
    # overall length counters
    overall_y_gaps_len = 0
    overall_line_height_len = 0

    for c_area in tesseract_page.areas:
        for c_paragraph in c_area.paragraphs:
            paragraph_line_count = len(c_paragraph.lines)
            # only count paragraphs with at least two lines; a single line has no gap information
            if paragraph_line_count >= 2:
                ldist_paragraph, paragraph_gap_height, paragraph_line_height, \
                    paragraph_y_gaps_len, paragraph_lh_len = \
                    self.calculate_line_distance_information(c_paragraph.lines, False)

                overall_ldist, unused = Random.add_to_mean(
                    overall_ldist, ldist_paragraph,
                    overall_line_height_len, paragraph_lh_len)
                overall_gap_height, overall_y_gaps_len = Random.add_to_mean(
                    overall_gap_height, paragraph_gap_height,
                    overall_y_gaps_len, paragraph_y_gaps_len)
                overall_line_height, overall_line_height_len = Random.add_to_mean(
                    overall_line_height, paragraph_line_height,
                    overall_line_height_len, paragraph_lh_len)

    final_ldist = TypeCasts.round_to_int(overall_ldist)
    final_gap_height = TypeCasts.round_to_int(overall_gap_height)
    final_line_height = TypeCasts.round_to_int(overall_line_height)
    # just for verifying final_ldist, slightly less inaccurate:
    # final_ldist_2 = TypeCasts.round_to_int(final_gap_height + final_line_height)
    final_y_gaps_len = TypeCasts.round_to_int(overall_y_gaps_len)
    final_line_height_length = TypeCasts.round_to_int(overall_line_height_len)

    return final_ldist, final_gap_height, final_line_height, final_y_gaps_len, final_line_height_length
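# Hedged sketch (assumption): Random.add_to_mean is used above as a weighted running-mean merge
# of per-paragraph averages into page-level averages. A minimal stand-in could look like this;
# the real signature and behavior in the project may differ.
def _add_to_mean(current_mean, new_mean, current_count, new_count):
    total = current_count + new_count
    if total == 0:
        return 0, 0
    merged = (current_mean * current_count + new_mean * new_count) / total
    return merged, total

# e.g. merging a paragraph mean gap of 12 px over 4 gaps into an overall mean of 10 px over 6 gaps:
# _add_to_mean(10, 12, 6, 4) -> (10.8, 10)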
def create_spaced_string(self, text, diff_tuples, size_filo, search_range_filo):
    PADDING_CHAR = '¦'
    MID_FILL_CHAR = '¯'

    final_text = text
    for current_tuple in diff_tuples:
        current_tuple_list = list(current_tuple)
        middle_index_list = Random.find_middle(len(current_tuple_list), True)
        current_tuple_list[middle_index_list] = MID_FILL_CHAR
        stringed_tuple = TypeCasts.list_to_string(current_tuple_list)
        stringed_tuple = stringed_tuple.strip()  # trim surrounding spaces
        stringed_tuple = stringed_tuple.replace(PADDING_CHAR, '')
        stringed_tuple_final = stringed_tuple.replace(MID_FILL_CHAR, '')
        stringed_replacement = stringed_tuple.replace(MID_FILL_CHAR, ' ')
        # replace the unspaced occurrence in the text with the spaced variant
        new_text = final_text.replace(stringed_tuple_final, stringed_replacement)
        final_text = new_text

    return final_text
def create_non_spaced_string(self, text, diff_tuples, size_filo, search_range_filo):
    PADDING_CHAR = '¦'

    # pad the text because of the filo look-behind/look-ahead window
    text_padded = Random.append_pad_values(text, size_filo, PADDING_CHAR)
    text_split = list(text_padded)
    current_chars_filo = Ranged_Filo(size_filo, search_range_filo, True)
    filo_mid_index = current_chars_filo.get_middle_index()

    final_text = ""
    for char_index, char in enumerate(text_split):
        current_chars_filo.push(char)
        # if the current middle char is ' ' and there is a diff tuple for it,
        # do not push it to the final string
        current_tuple = current_chars_filo.get_middle_items(True, True)
        current_middle_char = current_tuple[filo_mid_index]

        its_a_diff_tuple = False
        for diff_tuple_index, diff_tuple in enumerate(diff_tuples):
            if current_tuple == diff_tuple:
                diff_tuples[diff_tuple_index] = "done"  # mark this tuple as corrected
                its_a_diff_tuple = True
                break  # escape the inner loop

        if current_middle_char != PADDING_CHAR:  # do not append padded chars
            if not its_a_diff_tuple and current_middle_char is not None:
                final_text += current_middle_char

    return final_text
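# Hedged sketch (illustration only): the two functions above either insert or drop a space at
# the middle of flagged character windows ("diff tuples"). This stand-alone sketch shows the
# non-spaced direction with a plain padded slice instead of Ranged_Filo; the window size and
# pad character are assumptions.
def _drop_flagged_spaces(text, flagged_windows, window_size=5):
    pad = '¦'
    half = window_size // 2
    padded = pad * half + text + pad * half
    out = []
    for i in range(len(text)):
        window = tuple(padded[i:i + window_size])
        middle = window[half]  # equals text[i]
        if window in flagged_windows and middle == ' ':
            continue  # a space the voter flagged as spurious: skip it
        out.append(middle)
    return "".join(out)

# _drop_flagged_spaces("Auf sichtsrat", {tuple("uf si")}) -> "Aufsichtsrat"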
def get_confidence_count(self, char1, char2, char3, cconf1, cconf2, cconf3, wildcard_char='¦'):

    def get_other_char(char_first, char_sec, char_thrd, co1, co2, co3):
        if char_first != char_sec:
            return char_sec, float(co2)
        elif char_first != char_thrd:
            return char_thrd, float(co3)

    same_ctr = 0
    cconf_ctr = float(cconf1)
    if char1 == char2:
        same_ctr += 1
        cconf_ctr += float(cconf2)
    if char1 == char3:
        same_ctr += 1
        cconf_ctr += float(cconf3)

    # special cases: space ' ', ' ', 'x' and wildcard character '¦', '¦', '¦'
    if char1 == ' ' and same_ctr == 1:
        # if the confidence of the other character is below SPACE_TRESH,
        # the space gets the high put-in confidence value
        return 1, 95.0  # todo j4t; the threshold logic below is currently unreachable
        SPACE_TRESH = 50.0
        SPACE_PUT_IN_VALUE = 99.0
        otherchar, otherconf = get_other_char(char1, char2, char3, cconf1, cconf2, cconf3)
        if otherconf < SPACE_TRESH:
            return 1, SPACE_PUT_IN_VALUE
    elif char1 == wildcard_char and same_ctr == 1:
        # todo: differentiate type of character?
        # if there are two wildcards and one character, the character's confidence
        # has to be higher than wildcard_tresh to be taken
        wildcard_tresh = 98.5
        if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
            wildcard_tresh -= 10  # 0: 99.19%, 20: 99.16%, 10: 99.27%
        return 1, wildcard_tresh
    elif char1 == wildcard_char and same_ctr == 0:
        # todo: maybe cover this case (the wildcard has no confidence, i.e. if the two
        # other chars have very low probability, take the wildcard)
        pass
    elif char1 == '' and same_ctr == 0:
        # todo: maybe cover this case (the space has no confidence ...)
        pass
    elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
        # lower the confidence of special characters which stand without any other chars
        return same_ctr, cconf_ctr * 0.9

    return same_ctr, cconf_ctr
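# Hedged sketch (illustration, not the project's API): stripped of the space/wildcard special
# cases, the core of the confidence count above is a 2-of-3 agreement vote with accumulated
# confidences. Minimal stand-alone version:
def _vote_two_of_three(c1, c2, c3, conf1, conf2, conf3):
    same = 0
    conf = float(conf1)
    if c1 == c2:
        same += 1
        conf += float(conf2)
    if c1 == c3:
        same += 1
        conf += float(conf3)
    return same, conf

# _vote_two_of_three('e', 'e', 'c', 96.0, 91.0, 40.0) -> (1, 187.0):
# 'e' is backed by one other engine and their confidences accumulate.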
def add_linebreaks(self, previous_line, current_line, previous_line_index, sd_line_index, line_heigth_info):
    MODE = 'TAKE_CURRENT_LINE_DIST'

    if previous_line is None:
        return None

    if MODE == 'TAKE_CURRENT_LINE_DIST':
        MARGIN = 0  # tolerance margin
        current_lh_info = line_heigth_info[sd_line_index]
        (xp_start, yp_start, xp_stop, yp_stop) = previous_line.coordinates
        (xc_start, yc_start, xc_stop, yc_stop) = current_line.coordinates
        y_dist = yc_start - yp_stop
        if y_dist <= 0:
            return None

        line_distance = current_lh_info.get_line_distance()
        y_times = (y_dist + MARGIN) / line_distance
        y_times_absolute = TypeCasts.round_to_int(y_times)
        if y_times_absolute > 0:
            generated_text = Random.append_pad_values("", y_times_absolute, "\n")
            return generated_text
        else:
            return None

    self.cpr.print("Undefined case reached, shouldn't happen")
    return None
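# Hedged sketch: the number of inserted "\n" characters above is the vertical gap between the
# previous line's bottom and the current line's top, divided by the measured line distance and
# rounded. Stand-alone version of that arithmetic (values in the example are made up):
def _count_linebreaks(yp_stop, yc_start, line_distance, margin=0):
    y_dist = yc_start - yp_stop
    if y_dist <= 0 or line_distance <= 0:
        return 0
    return round((y_dist + margin) / line_distance)

# _count_linebreaks(yp_stop=400, yc_start=520, line_distance=40) -> 3,
# i.e. three newlines would be generated for that gap.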
def compare_ocr_strings_hamming(ocr_string1, ocr_string2, pad_difference=True):
    if pad_difference is True:
        len_str1 = len(ocr_string1)
        len_str2 = len(ocr_string2)
        if len_str1 > len_str2:
            ocr_string2 = Random.append_pad_values(ocr_string2, len_str1 - len_str2)
        elif len_str2 > len_str1:
            ocr_string1 = Random.append_pad_values(ocr_string1, len_str2 - len_str1)

    # print("Do Hamming dist", ocr_string1, "to", ocr_string2)
    result = distpkg.hamming(ocr_string1, ocr_string2)
    return result
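# Hedged sketch: the Hamming distance is only defined for equal-length strings, which is why the
# shorter OCR string is padded above. Stand-alone equivalent without the distance package; the
# space pad character is an assumption for illustration.
def _hamming_padded(s1, s2, pad=' '):
    length = max(len(s1), len(s2))
    s1 = s1.ljust(length, pad)
    s2 = s2.ljust(length, pad)
    return sum(1 for a, b in zip(s1, s2) if a != b)

# _hamming_padded("Bilanz", "Bi1anz") -> 1; _hamming_padded("Bilanz", "Bilan") -> 1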
def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char,
                                          wildcard_character, voted_acc_conf, character_1, character_2, character_3):
    if aufsichtsrat_prediction_toggled:
        if Random.is_special_character(predicted_char):
            one_char_sc = Random.is_special_character(character_1) \
                or Random.is_special_character(character_2) \
                or Random.is_special_character(character_3)
            voted_char_sc = Random.is_special_character(voted_char)
            if predicted_char != voted_char and (one_char_sc or voted_char_sc) \
                    and voted_char != wildcard_character:
                self.cpr_sc_predict.print("pc:", predicted_char, "vc:", voted_char, "vc_acc", voted_acc_conf)
                if voted_acc_conf <= 90.0:
                    if voted_char != '\f':  # don't swap form feeds, they don't get predicted at all
                        self.cpr_sc_predict.print("swap")
                        voted_char = predicted_char

    return voted_char
def __init__(self, y_size, x_size, wildcard_character, substitution_character):
    self._y_size = y_size
    self._x_size = x_size
    self._middle_index = Random.find_middle(self._x_size, True)
    self._pre_middle_index = self.get_middle_index() - 1
    self._nex_middle_index = self.get_middle_index() + 1
    self._wildcard_character = wildcard_character
    self._substitution_character = substitution_character

    self.similar_chars = []
    self.similar_chars.append(['o', 'ö'])
    self.similar_chars.append(['<', 'o'])  # untested, is this really better?
    self.similar_chars.append(['O', 'Ö'])
    self.similar_chars.append(['0', 'O', '9'])
    self.similar_chars.append(['d', 'ö'])
    # self.similar_chars.append(['1', 'l'])
    self.similar_chars.append(['l', 'j', '1'])
    self.similar_chars.append(['I', 'l'])
    self.similar_chars.append(['u', 'ü'])
    self.similar_chars.append(['U', 'Ü', 'O'])
    self.similar_chars.append(['a', 'ä'])
    self.similar_chars.append(['A', 'Ä'])
    self.similar_chars.append([':', ';'])
    self.similar_chars.append(['-', '¬'])
    self.similar_chars.append(['"', "'"])
    self.similar_chars.append(['C', 'G', 'c'])  # just for testing ...
    self.similar_chars.append(['.', ','])
    self.similar_chars.append([',', ';'])
    self.similar_chars.append(['v', 'V'])
    self.similar_chars.append(['w', 'W'])
    self.similar_chars.append(['i', 'l', 't', '1', '.'])  # 1, l, i also possible
    self.similar_chars.append(['r', 'n'])
    self.similar_chars.append(['%', 'm'])
    self.similar_chars.append(['&', 'é'])
    self.similar_chars.append(['e', 'é'])

    config_handler = ConfigurationHandler(first_init=False)
    self._config = config_handler.get_config()
    self._cpr = ConditionalPrint(self._config.PRINT_SEARCH_SPACE_PROCESSOR,
                                 self._config.PRINT_EXCEPTION_LEVEL,
                                 self._config.PRINT_WARNING_LEVEL)
def __init__(self, size_limit, search_range, fill_with_none, fill_range_only=False):
    self.original_range = search_range
    self.range = search_range
    self.size_limit = size_limit
    self.middle_index = Random.find_middle(size_limit, True)
    self.low_end_for_setting = self.middle_index - self.range
    super().__init__(size_limit)

    if fill_with_none:
        if fill_range_only is False:
            for index in range(0, self.size_limit):
                self.items.append(None)
        else:
            for index in range(0, search_range):
                self.items.append(None)
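# Hedged sketch (not the real Ranged_Filo): a fixed-size shift buffer with a middle index, which
# is the behavior the constructor above appears to set up (size_limit slots, optionally
# pre-filled with None so the middle item exists before the buffer is full). Class name and
# details are assumptions for illustration.
from collections import deque

class _MiniRangedWindow:
    def __init__(self, size_limit, fill_with_none=True):
        self.size_limit = size_limit
        self.middle_index = size_limit // 2
        self.items = deque([None] * size_limit if fill_with_none else [], maxlen=size_limit)

    def push(self, item):
        # appending to a full deque drops the oldest entry automatically
        self.items.append(item)

    def middle(self):
        if len(self.items) < self.size_limit:
            return None
        return list(self.items)[self.middle_index]

# w = _MiniRangedWindow(5)
# for c in "abcde": w.push(c)
# w.middle() -> 'c'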
def fill_filo_last_chars(self, voted_char):
    """
    Fill the filo for predictor usage with voted_char and some additional chars around it.
    :param voted_char: the character selected by the voter
    :return: None
    """
    if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
        # create semi-tokenized input strings in the filos from the voted characters for prediction
        if voted_char == ' ':
            # the models usually use the 'ƿ' char as a substitution for spaces
            self.filo_last_chars.push(' ', filterchar='¦')
            self.filo_last_chars.push('ƿ', filterchar='¦')
            self.filo_last_chars.push(' ', filterchar='¦')
        elif Random.is_special_character(voted_char):
            self.filo_last_chars.push(' ', filterchar='¦')
            self.filo_last_chars.push(voted_char, filterchar='¦')
            self.filo_last_chars.push(' ', filterchar='¦')
        else:
            self.filo_last_chars.push(voted_char, filterchar='¦')
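# Hedged sketch: how the voted character stream is assumed to be shaped for the predictor above
# ('ƿ' standing in for a space, special characters padded by spaces). The `specials` set is a
# placeholder for Random.is_special_character; illustration only.
def _to_predictor_tokens(voted_chars, specials=".,;:()-"):
    out = []
    for ch in voted_chars:
        if ch == ' ':
            out.extend([' ', 'ƿ', ' '])
        elif ch in specials:
            out.extend([' ', ch, ' '])
        else:
            out.append(ch)
    return "".join(out)

# _to_predictor_tokens("Dr. Meyer") -> "Dr .  ƿ Meyer"
# (the dot and the space marker each get padded with spaces)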
def recognize_a_line(self, line):
    if line is None or line is False or line is True or line.textstr is None:
        return False

    whole_text = line.textstr
    self.cpr.print("recognizing line:", whole_text)

    # counters
    counter_special_chars = 0
    counter_alphanumerical_chars = 0
    counter_numbers = 0
    counter_chars = len(whole_text)
    counter_alphabetical = 0
    counter_words = 0
    counters_alphabetical_ratios = []
    counters_wordlengths = []
    counters_numbers = []
    character_index = 0

    # special conditions
    ultimo_is_first_word = False
    first_word_no_table_indicator = False
    starts_with_parenthesis = False
    ends_with_parenthesis = False

    last_xstop = 0
    x_box_sizes = []
    x_gaps = []

    for key_index, key in enumerate(line.word['text']):
        word = line.word['text'][key]
        uid_info = line.word['UID'][key]
        word_xstart = line.data['word_x0'][character_index]
        word_xstop = line.data['word_x1'][character_index]
        word_box_size = word_xstop - word_xstart
        x_box_sizes.append(word_box_size)

        if key_index >= 1:
            x_gap = word_xstop - last_xstop
            x_gaps.append(x_gap)

        if word is None or word == "":
            continue

        if key_index == 0:
            if word in self.filter_start_words:
                first_word_no_table_indicator = True
            if word.lower() == "ultimo":
                ultimo_is_first_word = True
            if word[0] == "(":
                starts_with_parenthesis = True

        if key_index == len(line.word['text']) - 1:
            if word[-1] == ")":
                ends_with_parenthesis = True

        counter_alphabetical_chars_word = 0
        counter_alphanumerical_chars_word = 0
        counter_numbers_word = 0

        counter_words += 1
        word_list = list(word)
        for char in word_list:
            if Random.is_special_character(char):
                counter_special_chars += 1
            elif Random.is_alphanumerical_character(char):
                counter_alphanumerical_chars += 1
                counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

        counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
        ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word), 2)
        counters_alphabetical_ratios.append(ratio_alphabetical_word)
        counters_wordlengths.append(len(word))
        counters_numbers.append(counter_numbers_word)
        character_index += len(uid_info)
        last_xstop = word_xstop

    # get the number of spaces
    len_whole_unspace = len(whole_text.replace(" ", ""))
    counter_spaces = counter_chars - len_whole_unspace
    # set the alphabetical counter
    counter_alphabetical = counter_alphanumerical_chars - counter_numbers

    if counter_chars == 0:
        self.cpr.printw("no chars, shouldn't happen, no recognition")
        return False

    special_chars_ratio = counter_special_chars / counter_chars
    alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
    alphabetical_ratio = counter_alphabetical / counter_chars
    spaces_ratio = counter_spaces / counter_chars
    numbers_ratio = counter_numbers / counter_chars

    maximum_x_gap = None
    mean_x_gap = None
    median_x_gap = None
    if len(x_gaps) >= 1:
        maximum_x_gap = max(x_gaps)
        mean_x_gap = np.mean(x_gaps)
        median_x_gap = np.median(x_gaps)

    many_numbers_in_first_word = False
    many_alphabetical_in_middle_words = False
    many_alphabetical_in_last_word = False

    # check some first, middle and last word conditions
    for counter_index, counter in enumerate(counters_wordlengths):
        if counter_index == 0:
            ctr_numbers = counters_numbers[counter_index]
            numbers_ratio_word = np.round(ctr_numbers / counter, 2)
            if numbers_ratio_word > 0.8:
                many_numbers_in_first_word = True
        elif counter_index == len(counters_wordlengths) - 1:
            if counter >= 4:
                alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                if alphabetical_ratio_word >= 0.75:
                    many_alphabetical_in_last_word = True
        else:
            if counter >= 4:
                alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                if alphabetical_ratio_word >= 0.75:
                    many_alphabetical_in_middle_words = True

    self.cpr.print("alle cntr:", counter_chars)
    self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
    self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
    self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
    self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
    self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
    self.cpr.print("x_box_sizes", x_box_sizes)
    self.cpr.print("x_gaps", x_gaps)
    self.cpr.print("x_gap_max_size", maximum_x_gap)
    self.cpr.print("x_gaps_mean", mean_x_gap)
    self.cpr.print("x_gaps_median", median_x_gap)

    if ((alphabetical_ratio < 0.75 and
         numbers_ratio > 0.2 and
         counter_chars > 5 and
         counter_words >= 2) and
        not (starts_with_parenthesis and ends_with_parenthesis)) or ultimo_is_first_word:

        if first_word_no_table_indicator:
            return False
        if mean_x_gap is not None and mean_x_gap <= 115:
            return False
        if many_alphabetical_in_last_word:
            return False
        if many_alphabetical_in_middle_words and many_numbers_in_first_word:
            return False

        self.cpr.print("possible entry:", whole_text)

        if self.PRINT_TO_CHECKFILE:
            with open("checkfile_tables.txt", "a") as myfile:
                myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap)
                             + "||| mean x_gap: " + str(mean_x_gap)
                             + "||| median x_gap: " + str(median_x_gap) + "\n")
        return True

    return False
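# Hedged, made-up illustration of the threshold test above: a line such as
# "Umsatz 1931 . . . 12 500 000" has 6 alphabetical characters out of 28 (alphabetical_ratio
# roughly 0.21 < 0.75), 12 digits (numbers_ratio roughly 0.43 > 0.2), more than 5 characters
# and at least 2 words, so it would be flagged as a possible table entry unless one of the
# exclusion conditions (filter start word, small mean x-gap, alphabetical last/middle words)
# applies.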
def vocabulary_related_corrections(self, accumulated_chars, wildcard_character, accumulated_confs):
    if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
        accumulated_chars_final = ""
        acc_split = accumulated_chars.split()
        len_split = len(acc_split)

        for word_index, word in enumerate(acc_split):
            if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                # skip words that take part in a hyphenated line break (separated writing)
                if word_index == len_split - 1 and word.replace(wildcard_character, "").endswith('-'):
                    self.previous_word_with_seperator = True
                    accumulated_chars_final += word + " "
                    continue
                if word_index == 0:
                    if self.previous_word_with_seperator is True:
                        self.previous_word_with_seperator = False
                        accumulated_chars_final += word + " "
                        continue

            acc_confs_word = accumulated_confs.pop_multi(len(word))
            acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                self.vocab_checker.get_accumulated_confidence_rate(word, acc_confs_word, wildcard_character)
            self.cpr_vocab_check.print("w:", word, "wr:", word_reduced, "accr:", acc_conf, "rate", rate)

            # don't correct words below the minimum vocab length
            # (mind that special chars in the dict are toggled)
            check_len = len(word)
            if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                check_len = len(word_reduced)
            if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                accumulated_chars_final += word + " "
                continue

            if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                swappable_char_indices = []
                acc_confs_used = None
                word_used = None

                if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                    # use the full-length confidences array including trailing and leading special characters
                    acc_confs_used = acc_confs_word
                    word_used = word
                else:
                    # don't use trailing and starting special characters if no special chars are needed
                    acc_confs_used = acc_confs_word[len(word_starting_borders):
                                                    (len(acc_confs_word) - len(word_trailing_borders))]
                    word_used = word_reduced

                for conf_index, conf in enumerate(acc_confs_used):
                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        if conf <= 250:
                            character_related = word_used[conf_index]
                            is_special_char = Random.is_special_character(character_related)
                            if is_special_char and character_related != wildcard_character:
                                # only swap special character indices
                                swappable_char_indices.append(conf_index)
                    else:
                        if conf <= 215:
                            swappable_char_indices.append(conf_index)

                if len(swappable_char_indices) >= 1:
                    word_reduced_correct = self.vocab_checker.correct_text_at_certain_indices_only(
                        word_used, swappable_char_indices)
                    if word_reduced_correct is not None:
                        word_correct_withtrails = None
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            if Random.has_special_character(word_reduced_correct):
                                # a special character was replaced with a special character
                                word_correct_withtrails = word_reduced_correct
                            else:
                                # a special character was replaced by an alphanumerical character
                                word_correct_withtrails = word
                        else:
                            word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                        # only print the changed results
                        if word != word_correct_withtrails:
                            self.cpr_vocab_check.print("w:", word, "wc:", word_correct_withtrails,
                                                       "accr:", acc_conf, "rate", rate)
                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "
                continue

            if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                    and len(word_reduced) > 2:
                # if the rate drops below the threshold, try to fetch a vocab entry
                word_reduced_correct, suggestions, flh = self.vocab_checker.correct_text(word_reduced)
                if word_reduced_correct is not None and word_reduced_correct != word_reduced:
                    word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders
                    self.cpr_vocab_check.print("w:", word, "wc:", word_correct_withtrails,
                                               "accr:", acc_conf, "rate", rate)
                    accumulated_chars_final += word_correct_withtrails + " "
                else:
                    accumulated_chars_final += word + " "
            else:
                accumulated_chars_final += word + " "

        accumulated_chars = accumulated_chars_final

    return accumulated_chars
def calculate_msa_best_charconf(self, take_n_dist_best_index=False, take_longest_as_pivot=True):
    # do a preselection of the best element; if the parameter is set, take the best n_dist index as pivot
    best_index = 1
    if take_n_dist_best_index is True:
        ldist_best_index = self.get_shortest_n_distance_index()  # this doesn't work in all cases atm
        best_index = ldist_best_index
    if take_longest_as_pivot is True:
        best_index = self.get_longest_index()

    indices = [0, 1, 2]
    indices.remove(best_index)
    index1 = indices[0]
    index2 = indices[1]
    self._cpr.print("msa selection taking best:", best_index, "others:(", index1, "and", index2, ")")

    try:
        line_1 = self._set_lines[index1]
        line_2 = self._set_lines[best_index]
        line_3 = self._set_lines[index2]
        text_1 = self.get_line_content(line_1)
        text_2 = self.get_line_content(line_2)  # should be best
        text_3 = self.get_line_content(line_3)
        self._cpr.print("ocr_set:")
        self._cpr.print("text_A", text_1)
        self._cpr.print("text_B", text_2)
        self._cpr.print("text_C", text_3)

        line_1_ok = not Random.is_false_true_or_none(text_1)
        line_2_ok = not Random.is_false_true_or_none(text_2)
        line_3_ok = not Random.is_false_true_or_none(text_3)
        ok_lines = [line_1_ok, line_2_ok, line_3_ok]
        ok_indices = []
        for ok_index, ok in enumerate(ok_lines):
            if ok is True:
                ok_indices.append(ok_index)
        ok_len = len(ok_indices)

        if ok_len == 0:
            result = None
        else:
            result = self._msa_handler.get_best_of_three(text_1, text_2, text_3, use_charconfs=True,
                                                         line_1=line_1, line_2=line_2, line_3=line_3)
        self._best_msa_text = result
    except Exception as e:
        self._cpr.printex("ocr_set.py Exception in MSA, just taking line prio exception:", e)
        tr = inspect.trace()
        self._cpr.printex("trace is:", tr)
        if take_n_dist_best_index is True:
            self._best_msa_text = self.get_line_content(self._set_lines[ldist_best_index])
        else:
            self._best_msa_text = self.get_line_content(self._set_lines[best_index])
def extract_line_features(self, line):
    whole_text = line['text']
    self.cpr.print("recognizing text:", whole_text)

    # counters
    counter_special_chars = 0
    counter_alphanumerical_chars = 0
    counter_numbers = 0
    counter_chars = len(whole_text)
    counter_alphabetical = 0
    counter_words = 0
    counters_alphabetical_ratios = []
    counters_wordlengths = []
    counters_numbers = []
    character_index = 0

    # special conditions
    ultimo_is_first_word = False
    first_word_no_table_indicator = False
    starts_with_parenthesis = False
    ends_with_parenthesis = False

    last_xstop = 0
    x_box_sizes = []
    x_gaps = []

    for word_obj in line['words']:
        word_index = word_obj['word_index']
        word_text = word_obj['text']
        hocr_coordinates = word_obj['hocr_coordinates']
        word_xstart = hocr_coordinates[0]
        word_xstop = hocr_coordinates[2]
        word_box_size = word_xstop - word_xstart
        x_box_sizes.append(word_box_size)

        if word_index >= 1:
            x_gap = word_xstop - last_xstop
            x_gaps.append(x_gap)

        if word_text is None or word_text == "":
            continue

        if word_index == 0:
            if word_text in self.filter_start_words:
                first_word_no_table_indicator = True
            if word_text.lower() == "ultimo":
                ultimo_is_first_word = True
            if word_text[0] == "(":
                starts_with_parenthesis = True

        if word_index == len(line['words']) - 1:
            if word_text[-1] == ")":
                ends_with_parenthesis = True

        counter_alphabetical_chars_word = 0
        counter_alphanumerical_chars_word = 0
        counter_numbers_word = 0

        counter_words += 1
        word_list = list(word_text)
        for char in word_list:
            if Random.is_special_character(char):
                counter_special_chars += 1
            elif Random.is_alphanumerical_character(char):
                counter_alphanumerical_chars += 1
                counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

        counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
        ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word_text), 2)
        counters_alphabetical_ratios.append(ratio_alphabetical_word)
        counters_wordlengths.append(len(word_text))
        counters_numbers.append(counter_numbers_word)
        character_index += len(word_text)
        last_xstop = word_xstop

    # get the number of spaces
    len_whole_unspace = len(whole_text.replace(" ", ""))
    counter_spaces = counter_chars - len_whole_unspace
    # set the alphabetical counter
    counter_alphabetical = counter_alphanumerical_chars - counter_numbers

    if counter_chars == 0:
        self.cpr.printw("no chars in line:", str(line['line_index']), "no features here")
        return False

    special_chars_ratio = counter_special_chars / counter_chars
    alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
    alphabetical_ratio = counter_alphabetical / counter_chars
    spaces_ratio = counter_spaces / counter_chars
    numbers_ratio = counter_numbers / counter_chars

    maximum_x_gap = None
    mean_x_gap = None
    median_x_gap = None
    if len(x_gaps) >= 1:
        maximum_x_gap = max(x_gaps)
        mean_x_gap = np.mean(x_gaps)
        median_x_gap = np.median(x_gaps)

    many_numbers_in_first_word = False
    many_alphabetical_in_middle_words = False
    many_alphabetical_in_last_word = False

    # check some first, middle and last word conditions
    for counter_index, counter in enumerate(counters_wordlengths):
        if counter_index == 0:
            ctr_numbers = counters_numbers[counter_index]
            numbers_ratio_word = np.round(ctr_numbers / counter, 2)
            if numbers_ratio_word > 0.8:
                many_numbers_in_first_word = True
        elif counter_index == len(counters_wordlengths) - 1:
            if counter >= 4:
                alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                if alphabetical_ratio_word >= 0.75:
                    many_alphabetical_in_last_word = True
        else:
            if counter >= 4:
                alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                if alphabetical_ratio_word >= 0.75:
                    many_alphabetical_in_middle_words = True

    final_line_features = LineFeatures(cpr=self.cpr)
    final_line_features.counter_special_chars = counter_special_chars
    final_line_features.counter_chars = counter_chars
    final_line_features.counter_spaces = counter_spaces
    final_line_features.counter_numbers = counter_numbers
    final_line_features.counter_alphabetical = counter_alphabetical
    final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
    final_line_features.counter_words = counter_words
    final_line_features.counters_numbers = counters_numbers
    final_line_features.counters_wordlengths = counters_wordlengths
    final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

    final_line_features.numbers_ratio = numbers_ratio
    final_line_features.alphabetical_ratio = alphabetical_ratio
    final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
    final_line_features.special_chars_ratio = special_chars_ratio
    final_line_features.spaces_ratio = spaces_ratio

    final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
    final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
    final_line_features.many_numbers_in_first_word = many_numbers_in_first_word

    final_line_features.x_box_sizes = x_box_sizes
    final_line_features.x_gaps = x_gaps
    final_line_features.maximum_x_gap = maximum_x_gap
    final_line_features.mean_x_gap = mean_x_gap
    final_line_features.median_x_gap = median_x_gap

    return final_line_features
def validate_column_features(self, search_space, x_index, reference_char=None, count_up_similar_references=False):
    counter_whitespaces = 0
    counter_wildcards = 0
    counter_nones = 0
    counter_characters = 0
    counter_reference_char = 0
    counter_same_characters = 0
    counter_dict = {}
    counter_special_characters = 0
    most_occuring_char = None
    otherchar = None
    otherchar_y_index = None
    simchars = None

    if reference_char is not None and count_up_similar_references is True:
        simchars = self.get_simchars_for_char(reference_char)
        if len(simchars) != 1:
            self._cpr.print("evaluate")

    # gather data
    for y_index in range(0, self.get_y_size()):
        row = search_space[y_index]
        column_item = row[x_index]
        if column_item == self.get_wildcard_char():
            counter_wildcards += 1
        elif column_item == ' ':
            counter_whitespaces += 1
        elif column_item is None or column_item is False or column_item is True:
            counter_nones += 1
        else:
            if reference_char is not None:
                if count_up_similar_references is False and column_item == reference_char:
                    counter_reference_char += 1
                if count_up_similar_references is True:
                    matching = [s for s in simchars if column_item in s]
                    boolmatch = len(matching) >= 1
                    if boolmatch is True:
                        counter_reference_char += 1
            counter_characters += 1
            otherchar = column_item
            otherchar_y_index = y_index

        if column_item is not None:
            if column_item != self._wildcard_character and column_item != " ":
                if column_item not in counter_dict.keys():
                    counter_dict.update({column_item: 1})
                else:
                    counter_dict[column_item] += 1
                if Random.is_special_character(column_item):
                    counter_special_characters += 1

    # the highest amount of same characters in this column
    if len(counter_dict.items()) >= 1:
        most_occuring_char, counter_same_characters = max(counter_dict.items(), key=operator.itemgetter(1))

    # extract features
    features = []
    counter_whitespace_and_wildcards = counter_whitespaces + counter_wildcards

    if counter_nones == self.get_y_size():
        features.append(ColumnFeatures.ONLY_NONE.value)

    if counter_wildcards == self.get_y_size() - 1 and counter_characters == 1:
        features.append(ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value)
        # additional feature: the only char is a special character
        if Random.is_special_character(otherchar):
            features.append(ColumnFeatures.ONE_SPECIALCHAR_REST_WILDCARDS.value)

    if counter_whitespaces == self.get_y_size() - 1 and counter_characters == 1:
        features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value)

    if counter_whitespace_and_wildcards == self.get_y_size() - 1 and counter_characters == 1:
        features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value)
        # additional feature: the only char is a special character
        if otherchar != self._wildcard_character and otherchar != " " \
                and Random.is_special_character(otherchar):
            features.append(ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value)

    if counter_reference_char == self.get_y_size() - 1 and (counter_whitespaces == 1 or counter_wildcards == 1):
        features.append(ColumnFeatures.MOSTLY_REFERENCE_CHAR.value)

    if counter_whitespaces == self.get_y_size():
        features.append(ColumnFeatures.ONLY_WHITESPACE.value)

    if counter_wildcards == self.get_y_size():
        features.append(ColumnFeatures.ONLY_WILDCARD.value)

    if counter_whitespace_and_wildcards == self.get_y_size():
        features.append(ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value)

    if counter_reference_char >= 1:
        features.append(ColumnFeatures.CONTAINS_REFERENCE_CHAR.value)

    if counter_same_characters == self.get_y_size():
        if counter_special_characters == self.get_y_size():
            features.append(ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value)

    if Random.is_special_character(most_occuring_char) \
            and counter_same_characters == self.get_y_size() - 1 \
            and most_occuring_char != self._wildcard_character \
            and counter_whitespace_and_wildcards == 1:
        features.append(ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value)

    return features, otherchar, otherchar_y_index
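# Hedged sketch: the column validation above walks one x-position of the MSA search space (one
# row per OCR engine) and counts wildcards, whitespace, Nones and real characters before deriving
# the feature flags. Minimal stand-alone version of that counting step, with '¦' assumed as the
# wildcard character:
def _count_column(search_space, x_index, wildcard='¦'):
    counts = {'wildcard': 0, 'whitespace': 0, 'none': 0, 'char': 0}
    for row in search_space:
        item = row[x_index]
        if item == wildcard:
            counts['wildcard'] += 1
        elif item == ' ':
            counts['whitespace'] += 1
        elif item is None:
            counts['none'] += 1
        else:
            counts['char'] += 1
    return counts

# space = [list("Ba¦k"), list("Bank"), list("Bank")]
# _count_column(space, 2) -> {'wildcard': 1, 'whitespace': 0, 'none': 0, 'char': 2}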