def get_confidence_count(self, char1, char2, char3, cconf1, cconf2, cconf3, wildcard_char='¦'):
    """
    Count how often char1 re-occurs in char2/char3 and accumulate the matching confidences.

    char1 is the reference character: its confidence is always part of the sum, the
    confidences of char2 and char3 are only added when the character equals char1.

    Special cases, evaluated in this order:
      - char1 is a space and exactly one other character equals it: fixed result (1, 95.0)
      - char1 is the wildcard and exactly one other character equals it: (1, threshold) with
        threshold 98.5, lowered by 10 if config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE is set
      - char1 is the wildcard or '' and nothing matches: the plain counters are returned
      - config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC is set, char1 is a special character and both
        other characters are wildcards: the accumulated confidence is scaled by 0.9

    :param char1: reference character
    :param char2: second character, compared against char1
    :param char3: third character, compared against char1
    :param cconf1: confidence of char1 (converted with float())
    :param cconf2: confidence of char2 (converted with float(), only read on a match)
    :param cconf3: confidence of char3 (converted with float(), only read on a match)
    :param wildcard_char: character used as wildcard marker
    :return: tuple (number of characters in char2/char3 equal to char1, confidence value)
    """
    same_ctr = 0
    cconf_ctr = float(cconf1)

    for other_char, other_conf in ((char2, cconf2), (char3, cconf3)):
        if char1 == other_char:
            same_ctr += 1
            cconf_ctr += float(other_conf)

    # special case space: ' ', ' ', 'x'
    if char1 == ' ' and same_ctr == 1:
        # todo j4t: fixed confidence for two spaces against one character.
        # A threshold based variant (confidence of the differing character below 50.0
        # -> 99.0) used to follow this return but could never be reached; it was removed.
        return 1, 95.0

    # special case wildcard: '¦', '¦', 'x'
    if char1 == wildcard_char and same_ctr == 1:
        # todo: differentiate type of character ??
        # with two wildcards and one character, the character's confidence has to be
        # higher than this threshold to be taken
        wildcard_tresh = 98.5
        if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
            wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%
        return 1, wildcard_tresh

    if same_ctr == 0 and (char1 == wildcard_char or char1 == ''):
        # todo maybe cover these cases (wildcard / empty char carry no confidence, i.e. if
        # the two other chars have very low probability, take the wildcard)
        return same_ctr, cconf_ctr

    if self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
        # lower the confidence of special characters which stand without any other chars
        return same_ctr, cconf_ctr * 0.9

    return same_ctr, cconf_ctr
def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char,
                                          wildcard_character, voted_acc_conf, character_1, character_2, character_3):
    """
    Swap the voted character for the predicted special character when the vote is weak.

    The predicted character is only taken if all of the following hold:
      - the prediction is toggled on and the predicted character is a special character
      - predicted and voted character differ and the voted character is not the wildcard
      - at least one of the three input characters or the voted character is a special character
      - the accumulated confidence of the vote is at most 90.0
      - the voted character is not a form feed

    :param voted_char: character chosen by the voting
    :param aufsichtsrat_prediction_toggled: truthy if the prediction may overrule the vote
    :param predicted_char: character proposed by the predictor
    :param wildcard_character: wildcard marker, never replaced
    :param voted_acc_conf: accumulated confidence of the voted character
    :param character_1: first input character of the vote
    :param character_2: second input character of the vote
    :param character_3: third input character of the vote
    :return: predicted_char if the swap conditions are met, otherwise voted_char
    """
    if not aufsichtsrat_prediction_toggled:
        return voted_char

    if not Random.is_special_character(predicted_char):
        return voted_char

    any_input_is_special = any(
        Random.is_special_character(candidate)
        for candidate in (character_1, character_2, character_3)
    )
    voted_is_special = Random.is_special_character(voted_char)

    if predicted_char == voted_char \
            or not (any_input_is_special or voted_is_special) \
            or voted_char == wildcard_character:
        return voted_char

    self.cpr_sc_predict.print("pc:", predicted_char, "vc:", voted_char, "vc_acc", voted_acc_conf)

    # don't swap formfeeds, they don't get predicted at all
    if voted_acc_conf <= 90.0 and voted_char != '\f':
        self.cpr_sc_predict.print("swap")
        return predicted_char

    return voted_char
def fill_filo_last_chars(self, voted_char):
    """
    Push the voted character into the filo the predictor reads from.

    Nothing happens unless config.PREDICTOR_AUFSICHTSRAT_ENABLED is set. Spaces and special
    characters are pushed surrounded by spaces, which yields pre semi-tokenized input
    strings; a space itself is substituted by 'ƿ'. Every push uses '¦' as filterchar.

    :param voted_char: character chosen by the voting
    :return: None
    """
    if not self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
        return

    filter_char = '¦'

    if voted_char == ' ':
        # the models usually use the 'ƿ' char in substitution for spaces
        chars_to_push = (' ', 'ƿ', ' ')
    elif Random.is_special_character(voted_char):
        chars_to_push = (' ', voted_char, ' ')
    else:
        chars_to_push = (voted_char,)

    for item in chars_to_push:
        self.filo_last_chars.push(item, filterchar=filter_char)
def recognize_a_line(self, line):
    """
    Decide heuristically whether a line looks like a (table) entry.

    The line text is analysed word by word: counters for special, alphanumerical and
    numeric characters are collected together with the horizontal extent of every word.
    A line is accepted when it is number-heavy (or starts with "ultimo"), is not enclosed
    in parentheses, and passes a set of exclusion checks (filter start word, small mean
    x gap, mostly alphabetical last/middle words).

    :param line: object providing ``textstr``, ``word`` (with 'text' and 'UID' mappings) and
                 ``data`` (with 'word_x0' and 'word_x1' sequences indexed by character
                 position); None, booleans and objects without a text are rejected
    :return: True if the line is considered a possible entry, otherwise False
    """
    # None, booleans and anything else without a text can't be recognized
    whole_text = getattr(line, 'textstr', None)
    if whole_text is None:
        return False

    self.cpr.print("recognizing line:", whole_text)

    # counters
    counter_special_chars = 0
    counter_alphanumerical_chars = 0
    counter_numbers = 0
    counter_chars = len(whole_text)
    counter_words = 0
    counters_alphabetical_ratios = []
    counters_wordlengths = []
    counters_numbers = []

    # index into the per-character coordinate arrays in line.data
    character_index = 0

    # special conditions
    ultimo_is_first_word = False
    first_word_no_table_indicator = False
    starts_with_parenthesis = False
    ends_with_parenthesis = False

    last_xstop = 0
    x_box_sizes = []
    x_gaps = []

    words = line.word['text']
    last_key_index = len(words) - 1

    for key_index, key in enumerate(words):
        word = words[key]
        uid_info = line.word['UID'][key]
        word_xstart = line.data['word_x0'][character_index]
        word_xstop = line.data['word_x1'][character_index]
        x_box_sizes.append(word_xstop - word_xstart)

        if key_index >= 1:
            # NOTE(review): this measures stop-to-stop distance, not the whitespace between
            # two words; the 115 threshold below is applied to this measure - confirm intent
            x_gaps.append(word_xstop - last_xstop)

        # empty words contribute their box and gap only; character_index and
        # last_xstop are deliberately left untouched for them
        if word is None or word == "":
            continue

        if key_index == 0:
            if word in self.filter_start_words:
                first_word_no_table_indicator = True
            if word.lower() == "ultimo":
                ultimo_is_first_word = True
            if word[0] == "(":
                starts_with_parenthesis = True

        if key_index == last_key_index and word[-1] == ")":
            ends_with_parenthesis = True

        counter_alphanumerical_chars_word = 0
        counter_numbers_word = 0
        counter_words += 1

        for char in word:
            if Random.is_special_character(char):
                counter_special_chars += 1
            elif Random.is_alphanumerical_character(char):
                counter_alphanumerical_chars += 1
                counter_alphanumerical_chars_word += 1
            # digits are counted independently of the classification above
            if char.isdigit():
                counter_numbers += 1
                counter_numbers_word += 1

        counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
        counters_alphabetical_ratios.append(np.round(counter_alphabetical_word / len(word), 2))
        counters_wordlengths.append(len(word))
        counters_numbers.append(counter_numbers_word)
        character_index += len(uid_info)
        last_xstop = word_xstop

    # get number of spaces
    counter_spaces = counter_chars - len(whole_text.replace(" ", ""))

    # set alphabetical counter
    counter_alphabetical = counter_alphanumerical_chars - counter_numbers

    if counter_chars == 0:
        self.cpr.printw("no chars shouldn't happen, no recognizion")
        return False

    special_chars_ratio = counter_special_chars / counter_chars
    alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
    alphabetical_ratio = counter_alphabetical / counter_chars
    spaces_ratio = counter_spaces / counter_chars
    numbers_ratio = counter_numbers / counter_chars

    # gap statistics stay None for lines with less than two words
    maximum_x_gap = None
    mean_x_gap = None
    median_x_gap = None
    if len(x_gaps) >= 1:
        maximum_x_gap = max(x_gaps)
        mean_x_gap = np.mean(x_gaps)
        median_x_gap = np.median(x_gaps)

    many_numbers_in_first_word = False
    many_alphabetical_in_middle_words = False
    many_alphabetical_in_last_word = False

    # check some first, middle and last word conditions (only non-empty words are listed)
    last_counter_index = len(counters_wordlengths) - 1
    for counter_index, word_length in enumerate(counters_wordlengths):
        if counter_index == 0:
            numbers_ratio_word = np.round(counters_numbers[counter_index] / word_length, 2)
            if numbers_ratio_word > 0.8:
                many_numbers_in_first_word = True
        elif word_length >= 4 and counters_alphabetical_ratios[counter_index] >= 0.75:
            if counter_index == last_counter_index:
                many_alphabetical_in_last_word = True
            else:
                many_alphabetical_in_middle_words = True

    self.cpr.print("alle cntr:", counter_chars)
    self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
    self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
    self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
    self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
    self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
    self.cpr.print("x_box_sizes", x_box_sizes)
    self.cpr.print("x_gaps", x_gaps)
    self.cpr.print("x_gap_max_size", maximum_x_gap)
    self.cpr.print("x_gaps_mean", mean_x_gap)
    self.cpr.print("x_gaps_median", median_x_gap)

    # NOTE(review): looks like a leftover debugging anchor; output kept unchanged
    if "Gewinn nach Vortrag" in whole_text:
        print("")

    number_heavy = (alphabetical_ratio < 0.75
                    and numbers_ratio > 0.2
                    and counter_chars > 5
                    and counter_words >= 2)
    in_parentheses = starts_with_parenthesis and ends_with_parenthesis

    if not ((number_heavy and not in_parentheses) or ultimo_is_first_word):
        return False

    if first_word_no_table_indicator:
        return False

    # a single-word line (e.g. only "ultimo") has no gaps at all: mean_x_gap is None and
    # can't be compared with a number, such a line is no entry
    if mean_x_gap is None or mean_x_gap <= 115:
        return False
    if many_alphabetical_in_last_word:
        return False
    if many_alphabetical_in_middle_words and many_numbers_in_first_word:
        return False

    self.cpr.print("possible entry:", whole_text)

    if self.PRINT_TO_CHECKFILE:
        with open("checkfile_tables.txt", "a") as myfile:
            myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap) + "||| mean x_gap: " + str(mean_x_gap)
                         + "||| median x_gap: " + str(median_x_gap) + "\n")

    # NOTE(review): looks like leftover debug output; kept unchanged
    print("jab")
    return True
def vocabulary_related_corrections(self, accumulated_chars, wildcard_character, accumulated_confs):
    """
    Run the vocabulary based correction over the words of a voted text.

    Without config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE the text is returned untouched.
    Otherwise the text is split at whitespace and rebuilt word by word, every word followed
    by one space. Depending on the configuration a word is either
      - passed through (hyphenated line end / continuation of it, word too short),
      - corrected only at low-confidence character positions, or
      - replaced by a vocabulary entry when its confidence rate is below the threshold.

    :param accumulated_chars: voted text
    :param wildcard_character: wildcard marker used inside the text
    :param accumulated_confs: container whose pop_multi(n) delivers the confidences
                              of the next n characters
    :return: corrected text (words joined by single spaces, trailing space), or the
             unchanged input if the correction is switched off
    """
    cfg = self.config
    if not cfg.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
        return accumulated_chars

    pieces = []
    words = accumulated_chars.split()
    last_word_index = len(words) - 1

    for word_index, word in enumerate(words):

        if cfg.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
            # word is split by a separator at the line end: leave it alone and remember it
            if word_index == last_word_index and word.replace(wildcard_character, "").endswith('-'):
                self.previous_word_with_seperator = True
                pieces.append(word + " ")
                continue
            # first word continues the separated word of the previous call: leave it alone
            if word_index == 0 and self.previous_word_with_seperator is True:
                self.previous_word_with_seperator = False
                pieces.append(word + " ")
                continue

        word_confs = accumulated_confs.pop_multi(len(word))
        acc_conf, rate, change, leading_borders, trailing_borders, core_word = \
            self.vocab_checker.get_accumulated_confidence_rate(word, word_confs, wildcard_character)
        self.cpr_vocab_check.print("w:", word, "wr:", core_word, "accr:", acc_conf, "rate", rate)

        # don't correct words below min vocab length ( mind that special chars in dict are toggled)
        if cfg.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
            relevant_length = len(core_word)
        else:
            relevant_length = len(word)
        if relevant_length < cfg.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
            pieces.append(word + " ")
            continue

        # the word itself is kept unless one of the strategies below finds a replacement
        replacement = word

        if cfg.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
            special_chars_mode = cfg.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS

            if special_chars_mode:
                # use the full length confidences array including trailing and leading special characters
                confs_used = word_confs
                word_used = word
                # only special character positions (no wildcards) are swappable
                swappable_indices = [
                    position for position, conf in enumerate(confs_used)
                    if conf <= 250
                    and Random.is_special_character(word_used[position])
                    and word_used[position] != wildcard_character
                ]
            else:
                # don't use trailing and starting special characters if no special chars needed
                confs_used = word_confs[len(leading_borders):(len(word_confs) - len(trailing_borders))]
                word_used = core_word
                swappable_indices = [
                    position for position, conf in enumerate(confs_used) if conf <= 215
                ]

            if swappable_indices:
                corrected = self.vocab_checker.correct_text_at_certain_indices_only(
                    word_used, swappable_indices)
                if corrected is not None:
                    if not special_chars_mode:
                        replacement = leading_borders + corrected + trailing_borders
                    elif Random.has_special_character(corrected):
                        # special character was replaced with special character
                        replacement = corrected
                    # else: special character was replaced by an alphanumerical character,
                    # the original word stays

                    # only print the changed results
                    if word != replacement:
                        self.cpr_vocab_check.print("w:", word, "wc:", replacement,
                                                   "accr:", acc_conf, "rate", rate)

        elif rate < cfg.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH and len(core_word) > 2:
            # if the rate drops below tresh, try to fetch vocab entry
            corrected, suggestions, flh = self.vocab_checker.correct_text(core_word)
            if corrected is not None and corrected != core_word:
                replacement = leading_borders + corrected + trailing_borders
                self.cpr_vocab_check.print("w:", word, "wc:", replacement,
                                           "accr:", acc_conf, "rate", rate)

        pieces.append(replacement + " ")

    return "".join(pieces)
def extract_line_features(self, line):
    """
    Collect character, word and layout statistics of a line in a LineFeatures object.

    The line text is analysed word by word: counters for special, alphanumerical and
    numeric characters are collected together with the horizontal extent of every word
    and some conditions on the first, middle and last words.

    :param line: mapping with the keys 'text' (line text), 'line_index' and 'words'; every
                 word is a mapping with 'word_index', 'text' and 'hocr_coordinates'
                 (index 0 is used as x start, index 2 as x stop)
    :return: LineFeatures object filled with the statistics, or False if the line has no
             characters
    """
    whole_text = line['text']
    self.cpr.print("recognizing text:", whole_text)

    # counters
    counter_special_chars = 0
    counter_alphanumerical_chars = 0
    counter_numbers = 0
    counter_chars = len(whole_text)
    counter_words = 0
    counters_alphabetical_ratios = []
    counters_wordlengths = []
    counters_numbers = []

    # special conditions; they are evaluated here but are currently not transferred
    # to the returned LineFeatures object
    ultimo_is_first_word = False
    first_word_no_table_indicator = False
    starts_with_parenthesis = False
    ends_with_parenthesis = False

    last_xstop = 0
    x_box_sizes = []
    x_gaps = []

    words = line['words']
    last_word_index = len(words) - 1

    for word_obj in words:
        word_index = word_obj['word_index']
        word_text = word_obj['text']
        hocr_coordinates = word_obj['hocr_coordinates']

        word_xstart = hocr_coordinates[0]
        word_xstop = hocr_coordinates[2]
        x_box_sizes.append(word_xstop - word_xstart)

        if word_index >= 1:
            # NOTE(review): this measures stop-to-stop distance, not the whitespace
            # between two words - confirm intent
            x_gaps.append(word_xstop - last_xstop)

        # empty words contribute their box and gap only; last_xstop stays untouched
        if word_text is None or word_text == "":
            continue

        if word_index == 0:
            if word_text in self.filter_start_words:
                first_word_no_table_indicator = True
            if word_text.lower() == "ultimo":
                ultimo_is_first_word = True
            if word_text[0] == "(":
                starts_with_parenthesis = True

        # the last word is identified by the number of words (the former comparison
        # against the number of characters in the text only matched by accident)
        if word_index == last_word_index and word_text[-1] == ")":
            ends_with_parenthesis = True

        counter_alphanumerical_chars_word = 0
        counter_numbers_word = 0
        counter_words += 1

        for char in word_text:
            if Random.is_special_character(char):
                counter_special_chars += 1
            elif Random.is_alphanumerical_character(char):
                counter_alphanumerical_chars += 1
                counter_alphanumerical_chars_word += 1
            # digits are counted independently of the classification above
            if char.isdigit():
                counter_numbers += 1
                counter_numbers_word += 1

        counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
        counters_alphabetical_ratios.append(
            np.round(counter_alphabetical_word / len(word_text), 2))
        counters_wordlengths.append(len(word_text))
        counters_numbers.append(counter_numbers_word)
        last_xstop = word_xstop

    # get number of spaces
    counter_spaces = counter_chars - len(whole_text.replace(" ", ""))

    # set alphabetical counter
    counter_alphabetical = counter_alphanumerical_chars - counter_numbers

    if counter_chars == 0:
        self.cpr.printw("no chars in line:", str(line['line_index']), "no features here")
        return False

    special_chars_ratio = counter_special_chars / counter_chars
    alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
    alphabetical_ratio = counter_alphabetical / counter_chars
    spaces_ratio = counter_spaces / counter_chars
    numbers_ratio = counter_numbers / counter_chars

    # gap statistics stay None when no gap was collected
    maximum_x_gap = None
    mean_x_gap = None
    median_x_gap = None
    if len(x_gaps) >= 1:
        maximum_x_gap = max(x_gaps)
        mean_x_gap = np.mean(x_gaps)
        median_x_gap = np.median(x_gaps)

    many_numbers_in_first_word = False
    many_alphabetical_in_middle_words = False
    many_alphabetical_in_last_word = False

    # check some first, middle and last word conditions (only non-empty words are listed)
    last_counter_index = len(counters_wordlengths) - 1
    for counter_index, word_length in enumerate(counters_wordlengths):
        if counter_index == 0:
            numbers_ratio_word = np.round(counters_numbers[counter_index] / word_length, 2)
            if numbers_ratio_word > 0.8:
                many_numbers_in_first_word = True
        elif word_length >= 4 and counters_alphabetical_ratios[counter_index] >= 0.75:
            if counter_index == last_counter_index:
                many_alphabetical_in_last_word = True
            else:
                many_alphabetical_in_middle_words = True

    final_line_features = LineFeatures(cpr=self.cpr)

    final_line_features.counter_special_chars = counter_special_chars
    final_line_features.counter_chars = counter_chars
    final_line_features.counter_spaces = counter_spaces
    final_line_features.counter_numbers = counter_numbers
    final_line_features.counter_alphabetical = counter_alphabetical
    final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
    final_line_features.counter_words = counter_words

    final_line_features.counters_numbers = counters_numbers
    final_line_features.counters_wordlengths = counters_wordlengths
    final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

    final_line_features.numbers_ratio = numbers_ratio
    final_line_features.alphabetical_ratio = alphabetical_ratio
    final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
    final_line_features.special_chars_ratio = special_chars_ratio
    final_line_features.spaces_ratio = spaces_ratio

    final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
    final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
    final_line_features.many_numbers_in_first_word = many_numbers_in_first_word

    final_line_features.x_box_sizes = x_box_sizes
    final_line_features.x_gaps = x_gaps
    final_line_features.maximum_x_gap = maximum_x_gap
    final_line_features.mean_x_gap = mean_x_gap
    final_line_features.median_x_gap = median_x_gap

    return final_line_features
def validate_column_features(self, search_space, x_index, reference_char=None, count_up_similar_references=False):
    """
    Classify one column of the search space and describe it by a list of feature values.

    Every row contributes the item at x_index. The items are counted by category
    (wildcard, whitespace, None/boolean, other character) and the resulting counters are
    translated into ColumnFeatures values.

    :param search_space: rows of the search space; row[x_index] must be valid for the
                         first get_y_size() rows
    :param x_index: index of the column to validate
    :param reference_char: optional character whose occurrences in the column are counted
    :param count_up_similar_references: if True, characters listed as similar to reference_char
                                        (get_simchars_for_char) count as reference char as well
    :return: tuple (list of ColumnFeatures values, last regular character found in the column
             or None, y index of that character or None)
    """
    y_size = self.get_y_size()
    wildcard_char = self.get_wildcard_char()

    counter_whitespaces = 0
    counter_wildcards = 0
    counter_nones = 0
    counter_characters = 0
    counter_reference_char = 0
    counter_same_characters = 0
    counter_special_characters = 0
    counter_dict = {}
    most_occuring_char = None

    otherchar = None
    otherchar_y_index = None

    simchars = None
    if reference_char is not None and count_up_similar_references is True:
        simchars = self.get_simchars_for_char(reference_char)
        if len(simchars) != 1:
            self._cpr.print("evaluate")

    # gather data
    for y_index in range(y_size):
        column_item = search_space[y_index][x_index]

        if column_item == wildcard_char:
            counter_wildcards += 1
        elif column_item == ' ':
            counter_whitespaces += 1
        elif column_item is None or column_item is False or column_item is True:
            counter_nones += 1
        else:
            if reference_char is not None:
                if count_up_similar_references is False and column_item == reference_char:
                    counter_reference_char += 1
                if count_up_similar_references is True \
                        and any(column_item in simchar_entry for simchar_entry in simchars):
                    counter_reference_char += 1

            counter_characters += 1
            otherchar = column_item
            otherchar_y_index = y_index

        if column_item is not None:
            # occurrence statistics ignore wildcards and whitespace
            if column_item != self._wildcard_character and column_item != " ":
                counter_dict[column_item] = counter_dict.get(column_item, 0) + 1
            if Random.is_special_character(column_item):
                counter_special_characters += 1

    # the highest amount of same characters in this column
    if counter_dict:
        most_occuring_char, counter_same_characters = max(
            counter_dict.items(), key=operator.itemgetter(1))

    # extract features
    features = []
    counter_whitespace_and_wildcards = counter_whitespaces + counter_wildcards
    single_character = counter_characters == 1

    if counter_nones == y_size:
        features.append(ColumnFeatures.ONLY_NONE.value)

    if counter_wildcards == y_size - 1 and single_character:
        features.append(ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value)
        # additional feature, the only char is a special character
        if Random.is_special_character(otherchar):
            features.append(ColumnFeatures.ONE_SPECIALCHAR_REST_WILDCARDS.value)

    if counter_whitespaces == y_size - 1 and single_character:
        features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value)

    if counter_whitespace_and_wildcards == y_size - 1 and single_character:
        features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value)
        # additional feature, the only char is a special character
        if otherchar != self._wildcard_character and otherchar != " " \
                and Random.is_special_character(otherchar):
            features.append(ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value)

    if counter_reference_char == y_size - 1 and (
            counter_whitespaces == 1 or counter_wildcards == 1):
        features.append(ColumnFeatures.MOSTLY_REFERENCE_CHAR.value)

    if counter_whitespaces == y_size:
        features.append(ColumnFeatures.ONLY_WHITESPACE.value)

    # a wildcard-only column is determined by the wildcard counter; this used to test the
    # reference char counter, which wildcards never increase
    if counter_wildcards == y_size:
        features.append(ColumnFeatures.ONLY_WILDCARD.value)

    if counter_whitespace_and_wildcards == y_size:
        features.append(ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value)

    if counter_reference_char >= 1:
        features.append(ColumnFeatures.CONTAINS_REFERENCE_CHAR.value)

    if counter_same_characters == y_size and counter_special_characters == y_size:
        features.append(ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value)

    if Random.is_special_character(most_occuring_char) \
            and counter_same_characters == y_size - 1 \
            and most_occuring_char != self._wildcard_character \
            and counter_whitespace_and_wildcards == 1:
        features.append(ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value)

    return features, otherchar, otherchar_y_index