Пример #1
0
    def add_adjacent_tuple_information(self, tuples,
                                       tuples_with_adjacent_info):
        """Fill blank end positions of tuples from their adjacent-info twins.

        For every entry of ``tuples`` whose first and/or last element is a
        single space, the element at the same end of the same-index entry in
        ``tuples_with_adjacent_info`` is written into that position via
        ``Random.replace_value_in_tuple`` - but only when that adjacent
        element is neither ``None`` nor a space itself.

        :param tuples: sequence of sliceable, non-empty sequences
        :param tuples_with_adjacent_info: sequence parallel to ``tuples``
        :return: ``(final_tuple_list, change)``; ``change`` is True when at
                 least one replacement was made
        """
        final_tuple_list = []
        change = False

        for tuple_index, current_tuple in enumerate(tuples):
            tuplec = current_tuple[:]
            tuplec_with_adjacent_info = tuples_with_adjacent_info[
                tuple_index][:]

            # The end values are captured before any replacement happens.
            tuplec_low_end = tuplec[0]
            tuplec_high_end = tuplec[-1]
            tupleca_low_end = tuplec_with_adjacent_info[0]
            tupleca_high_end = tuplec_with_adjacent_info[-1]

            if tuplec_low_end == ' ':
                if tupleca_low_end is not None and tupleca_low_end != ' ':
                    tuplec = Random.replace_value_in_tuple(
                        tuplec, tupleca_low_end, 0)
                    change = True

            if tuplec_high_end == ' ':
                # Fixed: this guard used to test the *low* end of the adjacent
                # tuple, so a blank high end could be "replaced" by a blank
                # (or skipped although a usable high-end value existed).
                if tupleca_high_end is not None and tupleca_high_end != ' ':
                    tuplec = Random.replace_value_in_tuple(
                        tuplec, tupleca_high_end,
                        len(tuplec) - 1)
                    change = True

            final_tuple_list.append(tuplec)

        return final_tuple_list, change
Пример #2
0
    def obtain_line_info(self, best_index, other_indices):
        """Collect lines, their texts and usability flags for a three-line set.

        The line at ``best_index`` is always placed in the middle slot, the
        lines at ``other_indices[0]`` and ``other_indices[1]`` around it.

        :param best_index: index into ``self._set_lines`` of the pivot line
        :param other_indices: indexable holding the two remaining indices
        :return: ``(texts, lines, ok_flags, ok_count)`` where ``ok_flags[i]``
                 is the negated result of ``Random.is_false_true_or_none`` for
                 line ``i`` and ``ok_count`` is the number of True flags
        """
        lines_return = [
            self._set_lines[other_indices[0]],
            self._set_lines[best_index],  # should be best
            self._set_lines[other_indices[1]],
        ]
        texts_return = [
            self.get_line_content(single_line) for single_line in lines_return
        ]

        self._cpr.print("ocr_set:")
        for label, content in zip(("text_A", "text_B", "text_C"),
                                  texts_return):
            self._cpr.print(label, content)

        lines_return_ok = [
            not Random.is_false_true_or_none(single_line)
            for single_line in lines_return
        ]
        # every flag is a real bool (result of `not`), so counting True works
        ok_len = lines_return_ok.count(True)

        return texts_return, lines_return, lines_return_ok, ok_len
Пример #3
0
    def calculate_ld_information_tesseract(self, tesseract_page):
        """Aggregate line-distance statistics over all paragraphs of a page.

        Walks ``tesseract_page.areas`` -> ``paragraphs`` and, for each
        paragraph with at least two lines, merges the paragraph values from
        ``self.calculate_line_distance_information`` into running values via
        ``Random.add_to_mean``.

        :param tesseract_page: object exposing ``areas``, each exposing
                               ``paragraphs``, each exposing ``lines``
        :return: ``(line_distance, gap_height, line_height, y_gaps_length,
                 line_height_length)``, each passed through
                 ``TypeCasts.round_to_int``
        """
        # running values
        mean_ldist = 0
        mean_gap_height = 0
        mean_line_height = 0

        # running lengths used as weights for the values above
        y_gaps_total = 0
        line_heights_total = 0

        all_paragraphs = (paragraph
                          for area in tesseract_page.areas
                          for paragraph in area.paragraphs)

        for paragraph in all_paragraphs:
            # single-line paragraphs are not counted
            if len(paragraph.lines) < 2:
                continue

            (par_ldist, par_gap_height, par_line_height,
             par_y_gaps_len, par_lh_len) = \
                self.calculate_line_distance_information(
                    paragraph.lines, False)

            # The line distance is weighted with the line-height lengths. It
            # has to be merged BEFORE the line-height merge below, because it
            # reads the not-yet-updated length; the length it returns is
            # discarded.
            mean_ldist, _ = Random.add_to_mean(
                mean_ldist, par_ldist, line_heights_total, par_lh_len)

            mean_gap_height, y_gaps_total = Random.add_to_mean(
                mean_gap_height, par_gap_height, y_gaps_total,
                par_y_gaps_len)

            mean_line_height, line_heights_total = Random.add_to_mean(
                mean_line_height, par_line_height, line_heights_total,
                par_lh_len)

        return (TypeCasts.round_to_int(mean_ldist),
                TypeCasts.round_to_int(mean_gap_height),
                TypeCasts.round_to_int(mean_line_height),
                TypeCasts.round_to_int(y_gaps_total),
                TypeCasts.round_to_int(line_heights_total))
Пример #4
0
    def create_spaced_string(self, text, diff_tuples, size_filo,
                             search_range_filo):
        """Insert a space into ``text`` for every given diff tuple.

        Each tuple is turned into a search string (tuple characters without
        its middle element, outer whitespace and padding characters) and a
        replacement (the same string with a space at the middle position);
        every occurrence of the search string in the text is replaced.

        :param text: text to modify
        :param diff_tuples: iterable of character tuples
        :param size_filo: not used by this method
        :param search_range_filo: not used by this method
        :return: the text after all replacements
        """
        padding_marker = '¦'
        middle_marker = '¯'

        result = text
        for diff_tuple in diff_tuples:
            chars = list(diff_tuple)
            # mark the middle position so it can be dropped / spaced later
            chars[Random.find_middle(len(chars), True)] = middle_marker

            # trim outbound spaces, then drop padding characters
            window = TypeCasts.list_to_string(chars).strip()
            window = window.replace(padding_marker, '')

            needle = window.replace(middle_marker, '')
            spaced = window.replace(middle_marker, ' ')
            result = result.replace(needle, spaced)

        return result
Пример #5
0
    def create_non_spaced_string(self, text, diff_tuples, size_filo,
                                 search_range_filo):
        """Rebuild ``text`` while dropping the middle char of matching windows.

        The text is padded (``Random.append_pad_values``) and pushed char by
        char through a ``Ranged_Filo``. After each push the current window
        (``get_middle_items``) is compared against ``diff_tuples``; when it
        equals one of them, the window's middle character is left out of the
        result.

        NOTE: ``diff_tuples`` is modified in place - every matched entry is
        overwritten with the string ``"done"`` so it is only applied once.

        :param text: text to process
        :param diff_tuples: mutable list of character tuples to match
        :param size_filo: size of the filo, also used for padding
        :param search_range_filo: search range handed to the filo
        :return: the rebuilt text
        """
        PADDING_CHAR = '¦'
        # pad values because of filos
        text_padded = Random.append_pad_values(text, size_filo, PADDING_CHAR)

        current_chars_filo = Ranged_Filo(size_filo, search_range_filo, True)
        filo_mid_index = current_chars_filo.get_middle_index()
        kept_chars = []

        for char in text_padded:
            current_chars_filo.push(char)
            current_tuple = current_chars_filo.get_middle_items(True, True)
            current_middle_char = current_tuple[filo_mid_index]

            its_a_diff_tuple = False
            for diff_tuple_index, diff_tuple in enumerate(diff_tuples):
                if current_tuple == diff_tuple:
                    # mark this tuple as corrected
                    diff_tuples[diff_tuple_index] = "done"
                    its_a_diff_tuple = True
                    break

            # Padded and empty middle chars never reach the result. The
            # padding check compares by value; the former identity check
            # (`is not`) only worked by interpreter string caching.
            if current_middle_char is None \
                    or current_middle_char == PADDING_CHAR:
                continue
            if not its_a_diff_tuple:
                kept_chars.append(current_middle_char)

        return "".join(kept_chars)
Пример #6
0
    def get_confidence_count(self,
                             char1,
                             char2,
                             char3,
                             cconf1,
                             cconf2,
                             cconf3,
                             wildcard_char='¦'):
        """Count agreement for ``char1`` and accumulate matching confidences.

        ``same_ctr`` is the number of the other two characters equal to
        ``char1`` (0..2); the accumulated confidence is ``cconf1`` plus the
        confidences of the agreeing characters (all converted with
        ``float``). Some constellations return fixed values instead:

        * ``char1`` is a space and exactly one other char agrees:
          ``(1, 95.0)``
        * ``char1`` is the wildcard and exactly one other char agrees:
          ``(1, 98.5)``, lowered by 10 when
          ``config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE`` is set
        * ``char1`` is a special character standing alone against two
          wildcards and ``config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC`` is set:
          the confidence is scaled by 0.9

        :return: ``(same_ctr, accumulated_confidence)``
        """
        same_ctr = 0
        cconf_ctr = float(cconf1)

        if char1 == char2:
            same_ctr += 1
            cconf_ctr += float(cconf2)
        if char1 == char3:
            same_ctr += 1
            cconf_ctr += float(cconf3)

        # special cases space: ' ', ' ', 'x'
        # wildcard character : '¦', '¦', '¦'

        if char1 == ' ' and same_ctr == 1:
            # todo j4t: fixed value. A rule depending on the confidence of the
            # differing character (return 99.0 when that confidence is below
            # 50.0) used to follow this return, but it was unreachable and
            # has been removed.
            return 1, 95.0

        elif char1 == wildcard_char and same_ctr == 1:  # todo: differentiate type of character ??
            # if there are two wildcards and one character, the character's
            # confidence has to be higher than this threshold to be taken
            wildcard_tresh = 98.5
            if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%

            return 1, wildcard_tresh

        elif char1 == wildcard_char and same_ctr == 0:
            pass  # todo maybe cover this case (cause wildcard has no confidence i.e if the two otherchars are very low prob, take wildcard)
        elif char1 == '' and same_ctr == 0:
            pass  # todo maybe cover this case (cause space has no confidence ...
        elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
            # lower the confidence of special characters which stand without any other chars
            return same_ctr, cconf_ctr * 0.9

        return same_ctr, cconf_ctr
Пример #7
0
    def add_linebreaks(self, previous_line, current_line, previous_line_index,
                       sd_line_index, line_heigth_info):
        """Generate the newlines that fit into the gap between two lines.

        The vertical gap between the bottom of ``previous_line`` and the top
        of ``current_line`` is divided by the line distance reported by
        ``line_heigth_info[sd_line_index].get_line_distance()``; the rounded
        quotient is the number of ``"\\n"`` characters generated.

        :param previous_line: line above (needs ``coordinates``), or None
        :param current_line: current line (needs ``coordinates``)
        :param previous_line_index: not used by this method
        :param sd_line_index: index into ``line_heigth_info``
        :param line_heigth_info: indexable of objects offering
                                 ``get_line_distance()``
        :return: string of newlines, or None when there is no previous line,
                 the lines touch/overlap vertically, the line distance is
                 zero/empty or the rounded count is not positive
        """
        MODE = 'TAKE_CURRENT_LINE_DIST'

        if previous_line is None:
            return None
        # compare by value; `is` against a string literal is an identity test
        if MODE == 'TAKE_CURRENT_LINE_DIST':
            MARGIN = 0  # tolerance margin
            current_lh_info = line_heigth_info[sd_line_index]
            (xp_start, yp_start, xp_stop, yp_stop) = previous_line.coordinates
            (xc_start, yc_start, xc_stop, yc_stop) = current_line.coordinates

            y_dist = yc_start - yp_stop

            if y_dist <= 0:
                return None

            line_distance = current_lh_info.get_line_distance()
            if not line_distance:
                # no usable line distance: nothing sensible can be generated
                # (and the division below would fail)
                return None

            y_times = (y_dist + MARGIN) / line_distance
            y_times_absolute = TypeCasts.round_to_int(y_times)
            if y_times_absolute > 0:
                generated_text = Random.append_pad_values(
                    "", y_times_absolute, "\n")
                return generated_text
            else:
                return None

        self.cpr.print("Undefined case reached shouldn't happen")
        return None
Пример #8
0
    def compare_ocr_strings_hamming(ocr_string1,
                                    ocr_string2,
                                    pad_difference=True):
        """Return the hamming distance of two OCR strings.

        :param ocr_string1: first string
        :param ocr_string2: second string
        :param pad_difference: when exactly ``True``, the shorter string is
                               extended by the length difference with
                               ``Random.append_pad_values`` before comparing
        :return: result of ``distpkg.hamming`` for the (padded) strings
        """
        if pad_difference is True:
            length_gap = len(ocr_string1) - len(ocr_string2)
            if length_gap > 0:
                # first string is longer -> pad the second one
                ocr_string2 = Random.append_pad_values(ocr_string2,
                                                       length_gap)
            elif length_gap < 0:
                # second string is longer -> pad the first one
                ocr_string1 = Random.append_pad_values(ocr_string1,
                                                       -length_gap)

        return distpkg.hamming(ocr_string1, ocr_string2)
Пример #9
0
    def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char,
                                              wildcard_character, voted_acc_conf, character_1, character_2, character_3):
        """Optionally swap the voted character for the predicted one.

        The predicted character is returned only if all of these hold:
        prediction is toggled on, the predicted char is a special character,
        it differs from the voted char, at least one of the three input
        characters or the voted char is a special character, the voted char
        is neither the wildcard nor a formfeed, and ``voted_acc_conf`` is at
        most 90.0. In every other case ``voted_char`` is returned unchanged.

        :return: the character to use
        """
        if not aufsichtsrat_prediction_toggled:
            return voted_char
        if not Random.is_special_character(predicted_char):
            return voted_char

        any_input_special = (Random.is_special_character(character_1)
                             or Random.is_special_character(character_2)
                             or Random.is_special_character(character_3))
        voted_is_special = Random.is_special_character(voted_char)

        is_candidate = (predicted_char != voted_char
                        and (any_input_special or voted_is_special)
                        and voted_char != wildcard_character)
        if not is_candidate:
            return voted_char

        self.cpr_sc_predict.print("pc:", predicted_char, "vc:", voted_char,
                                  "vc_acc", voted_acc_conf)

        # don't swap formfeeds, they don't get predicted at all
        if voted_acc_conf <= 90.0 and voted_char != '\f':
            self.cpr_sc_predict.print("swap")
            return predicted_char

        return voted_char
    def __init__(self, y_size, x_size, wildcard_character,
                 substitution_character):
        """Store the search-space geometry, the character table and config.

        :param y_size: stored as ``_y_size``
        :param x_size: stored as ``_x_size``; its middle (via
                       ``Random.find_middle``) becomes ``_middle_index``
        :param wildcard_character: stored as ``_wildcard_character``
        :param substitution_character: stored as ``_substitution_character``
        """
        self._y_size, self._x_size = y_size, x_size
        self._middle_index = Random.find_middle(self._x_size, True)
        # direct neighbours of the middle column
        self._pre_middle_index = self.get_middle_index() - 1
        self._nex_middle_index = self.get_middle_index() + 1

        self._wildcard_character = wildcard_character
        self._substitution_character = substitution_character

        # groups of characters regarded as similar to each other
        self.similar_chars = [
            ['o', 'ö'],
            ['<', 'o'],  # untested is this really better?
            ['O', 'Ö'],
            ['0', 'O', '9'],
            ['d', 'ö'],
            # ['1', 'l'] is deliberately not part of the table
            ['l', 'j', '1'],
            ['I', 'l'],
            ['u', 'ü'],
            ['U', 'Ü', 'O'],
            ['a', 'ä'],
            ['A', 'Ä'],
            [':', ';'],
            ['-', '¬'],
            ['"', "'"],
            ['C', "G", "c"],
            # just for testing ...
            ['.', ','],
            [',', ';'],
            ['v', 'V'],
            ['w', 'W'],
            ['i', 'l', 't', '1', '.'],  # 1 l i also possible
            ['r', 'n'],
            ['%', 'm'],
            ['&', 'é'],
            ['e', 'é'],
        ]

        self._config = ConfigurationHandler(first_init=False).get_config()
        self._cpr = ConditionalPrint(
            self._config.PRINT_SEARCH_SPACE_PROCESSOR,
            self._config.PRINT_EXCEPTION_LEVEL,
            self._config.PRINT_WARNING_LEVEL)
Пример #11
0
    def __init__(self,
                 size_limit,
                 search_range,
                 fill_with_none,
                 fill_range_only=False):
        """Set up the ranged filo and optionally pre-fill it with ``None``.

        :param size_limit: size limit, also handed to the parent initializer
        :param search_range: range around the middle index
        :param fill_with_none: when truthy, ``None`` placeholders are
                               appended to ``self.items``
        :param fill_range_only: when exactly ``False``, ``size_limit``
                                placeholders are appended, otherwise only
                                ``search_range`` of them
        """
        self.original_range = search_range
        self.range = search_range
        self.size_limit = size_limit
        self.middle_index = Random.find_middle(size_limit, True)
        self.low_end_for_setting = self.middle_index - self.range

        super().__init__(size_limit)

        if not fill_with_none:
            return

        # size_limit is read after the parent initializer has run
        placeholder_count = (self.size_limit
                             if fill_range_only is False else search_range)
        for _ in range(placeholder_count):
            self.items.append(None)
Пример #12
0
    def fill_filo_last_chars(self, voted_char):
        """
        Push the voted character into the filo used by the predictor.

        Nothing happens unless ``config.PREDICTOR_AUFSICHTSRAT_ENABLED`` is
        set. Spaces are pushed as ``' ', 'ƿ', ' '`` (the models usually use
        the 'ƿ' char in substitution for spaces), special characters are
        pushed surrounded by spaces, every other character is pushed as is.
        All pushes use ``filterchar='¦'``.

        :param voted_char: character to push
        :return: None
        """
        if not self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            return

        # create pre semi-tokenized input for the prediction
        if voted_char == ' ':
            to_push = (' ', 'ƿ', ' ')
        elif Random.is_special_character(voted_char):
            to_push = (' ', voted_char, ' ')
        else:
            to_push = (voted_char,)

        for item in to_push:
            self.filo_last_chars.push(item, filterchar='¦')
Пример #13
0
    def recognize_a_line(self, line):
        """Heuristically decide whether an OCR line is a possible table entry.

        Character statistics (special, alphanumerical, digit and space
        ratios), per-word statistics and the horizontal gaps between word
        boxes are collected. A line is a candidate when it either starts with
        the word "ultimo", or contains enough digits, has more than five
        characters and at least two words, and is not completely wrapped in
        parentheses. Candidates are rejected again when the first word is in
        ``self.filter_start_words``, when the mean gap is missing or at most
        115, or when certain alphabetical words appear (see the checks at
        the end of the method).

        When ``self.PRINT_TO_CHECKFILE`` is set, accepted lines are appended
        to ``checkfile_tables.txt``.

        :param line: line object offering ``textstr``, ``word`` (with the
                     ``'text'`` and ``'UID'`` mappings) and ``data`` (with
                     ``'word_x0'`` / ``'word_x1'``); ``None``, ``True`` and
                     ``False`` are tolerated
        :return: True if the line is recognized as a table entry, else False
        """
        if line is None or isinstance(line, bool) or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        words = line.word['text']
        last_key_index = len(words) - 1

        for key_index, key in enumerate(words):
            word = words[key]
            uid_info = line.word['UID'][key]
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            x_box_sizes.append(word_xstop - word_xstart)

            if key_index >= 1:
                # NOTE(review): the "gap" is measured from the previous stop
                # to the current *stop*, not to the current start - confirm
                # this is intended; the 115 threshold below relies on it.
                x_gaps.append(word_xstop - last_xstop)

            # empty words neither advance character_index nor last_xstop
            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True

            if key_index == last_key_index:
                if word[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0
            counter_words += 1

            for char in word:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word / len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars shouldn't happen, no recognizion")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        # gap statistics stay None when the line has fewer than two word boxes
        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == last_counter_index:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True
            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        looks_numeric = (alphabetical_ratio < 0.75
                         and numbers_ratio > 0.2
                         and counter_chars > 5
                         and counter_words >= 2)
        fully_parenthesized = starts_with_parenthesis and ends_with_parenthesis

        if not ((looks_numeric and not fully_parenthesized)
                or ultimo_is_first_word):
            return False

        if first_word_no_table_indicator:
            return False

        # Without any gap (e.g. a single-word "ultimo" line) mean_x_gap is
        # None; comparing None with a number raises TypeError, so such lines
        # are rejected like lines with too small gaps.
        if mean_x_gap is None or mean_x_gap <= 115:
            return False
        if many_alphabetical_in_last_word:
            return False
        if many_alphabetical_in_middle_words and many_numbers_in_first_word:
            return False

        self.cpr.print("possible entry:", whole_text)

        if self.PRINT_TO_CHECKFILE:
            with open("checkfile_tables.txt", "a") as myfile:
                myfile.write(whole_text + "||| max x_gap: " + str(maximum_x_gap) + "||| mean x_gap: " + str(mean_x_gap)
                             + "||| median x_gap: " + str(median_x_gap) + "\n")

        return True
Пример #14
0
    def vocabulary_related_corrections(self, accumulated_chars,
                                       wildcard_character, accumulated_confs):
        """Correct the words of a voted text with the vocabulary checker.

        Does nothing unless ``config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE``
        is set. Otherwise the text is split at whitespace and every word is
        either kept or replaced by a correction from ``self.vocab_checker``;
        the result is each word followed by one space, concatenated.

        :param accumulated_chars: voted text
        :param wildcard_character: wildcard used in the voted text
        :param accumulated_confs: object whose ``pop_multi(n)`` delivers the
                                  confidences of the next ``n`` characters
        :return: the corrected text, or ``accumulated_chars`` unchanged when
                 the correction is disabled
        """
        cfg = self.config
        if not cfg.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
            return accumulated_chars

        words = accumulated_chars.split()
        last_position = len(words) - 1
        emitted = []  # every entry already carries its trailing blank

        for position, word in enumerate(words):

            if cfg.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                # a last word ending with '-' and the first word following
                # such a word are passed through untouched
                if position == last_position and word.replace(
                        wildcard_character, "").endswith('-'):
                    self.previous_word_with_seperator = True
                    emitted.append(word + " ")
                    continue
                if position == 0 and self.previous_word_with_seperator is True:
                    self.previous_word_with_seperator = False
                    emitted.append(word + " ")
                    continue

            word_confs = accumulated_confs.pop_multi(len(word))
            (acc_conf, rate, _change, leading_borders, trailing_borders,
             core_word) = self.vocab_checker.get_accumulated_confidence_rate(
                 word, word_confs, wildcard_character)
            self.cpr_vocab_check.print("w:", word, "wr:", core_word, "accr:",
                                       acc_conf, "rate", rate)

            # don't correct words below min vocab length ( mind that special chars in dict are toggled)
            if cfg.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                measured_length = len(core_word)
            else:
                measured_length = len(word)
            if measured_length < cfg.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                emitted.append(word + " ")
                continue

            replacement = word

            if cfg.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                special_only = cfg.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS

                if special_only:
                    # full word incl. leading/trailing special characters;
                    # only low-confidence special characters may be swapped
                    target = word
                    swap_at = [
                        index for index, conf in enumerate(word_confs)
                        if conf <= 250
                        and Random.is_special_character(target[index])
                        and target[index] != wildcard_character
                    ]
                else:
                    # border characters and their confidences are left out
                    target = core_word
                    inner_confs = word_confs[
                        len(leading_borders):(len(word_confs) -
                                              len(trailing_borders))]
                    swap_at = [
                        index for index, conf in enumerate(inner_confs)
                        if conf <= 215
                    ]

                if swap_at:
                    fixed = self.vocab_checker.correct_text_at_certain_indices_only(
                        target, swap_at)
                    if fixed is not None:
                        if not special_only:
                            replacement = leading_borders + fixed + trailing_borders
                        elif Random.has_special_character(fixed):
                            # special character was replaced with special character
                            replacement = fixed
                        # otherwise a special character was replaced by an
                        # alphanumerical one: the original word is kept

                        # only print the changed results
                        if word != replacement:
                            self.cpr_vocab_check.print(
                                "w:", word, "wc:", replacement, "accr:",
                                acc_conf, "rate", rate)

            elif rate < cfg.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                    and len(core_word) > 2:
                # if the rate drops below tresh, try to fetch vocab entry
                fixed, _suggestions, _flh = self.vocab_checker.correct_text(
                    core_word)
                if fixed is not None and fixed != core_word:
                    replacement = leading_borders + fixed + trailing_borders
                    self.cpr_vocab_check.print("w:", word, "wc:", replacement,
                                               "accr:", acc_conf, "rate",
                                               rate)

            emitted.append(replacement + " ")

        return "".join(emitted)
Пример #15
0
    def calculate_msa_best_charconf(self,
                                    take_n_dist_best_index=False,
                                    take_longest_as_pivot=True):
        """Compute the best text of the three-line set and store it.

        A pivot line is selected (index 1 by default, the shortest-n-distance
        index and/or the longest line's index depending on the flags; the
        longest-line choice wins when both flags are set). The texts of the
        two other lines and the pivot are handed to
        ``self._msa_handler.get_best_of_three`` with character confidences
        enabled. The result is stored in ``self._best_msa_text`` (``None``
        when none of the three texts is usable). If anything raises, the
        content of the pivot line is stored instead.

        :param take_n_dist_best_index: use ``get_shortest_n_distance_index``
                                       for the pivot (exactly ``True``)
        :param take_longest_as_pivot: use ``get_longest_index`` for the pivot
                                      (exactly ``True``)
        :return: None
        """
        # do a preselection of best element
        pivot_index = 1

        if take_n_dist_best_index is True:
            # this doesn't work in all cases atm
            ldist_best_index = self.get_shortest_n_distance_index()
            pivot_index = ldist_best_index
        if take_longest_as_pivot is True:
            pivot_index = self.get_longest_index()

        remaining = [0, 1, 2]
        remaining.remove(pivot_index)
        first_other, second_other = remaining

        self._cpr.print("msa selection taking best:", pivot_index, "others:(",
                        first_other, "and", second_other, ")")

        try:
            line_1 = self._set_lines[first_other]
            line_2 = self._set_lines[pivot_index]
            line_3 = self._set_lines[second_other]

            text_1 = self.get_line_content(line_1)
            text_2 = self.get_line_content(line_2)  # should be best
            text_3 = self.get_line_content(line_3)

            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", text_1)
            self._cpr.print("text_B", text_2)
            self._cpr.print("text_C", text_3)

            usable_flags = [
                not Random.is_false_true_or_none(text)
                for text in (text_1, text_2, text_3)
            ]

            if any(usable_flags):
                result = self._msa_handler.get_best_of_three(
                    text_1, text_2, text_3, use_charconfs=True,
                    line_1=line_1, line_2=line_2, line_3=line_3)
            else:
                result = None

            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex(
                "ocr_set.py Exception in MSA, just taking line prio exception:",
                e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)

            # fall back to the plain content of the preselected line
            fallback_index = (ldist_best_index
                              if take_n_dist_best_index is True
                              else pivot_index)
            self._best_msa_text = self.get_line_content(
                self._set_lines[fallback_index])
Пример #16
0
    def extract_line_features(self, line):
        """Collect character, word and spacing statistics for one OCR line.

        Args:
            line: Mapping that provides ``'text'`` (the whole line text),
                ``'words'`` (sized sequence of word mappings with the keys
                ``'word_index'``, ``'text'`` and ``'hocr_coordinates'``;
                entries 0 and 2 of the coordinates are read as the x start
                and x stop of the word) and ``'line_index'`` (only read for
                the warning that is logged when the text is empty).

        Returns:
            A ``LineFeatures`` object filled with the counters and ratios
            computed here, or ``False`` when the line text is empty.
        """
        whole_text = line['text']

        self.cpr.print("recognizing text:", whole_text)

        # counters over the whole line
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_words = 0
        # per-word statistics, one entry for every non-empty word
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        # special conditions
        # NOTE(review): these four flags are computed but not copied onto the
        # LineFeatures object returned by this method.
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []

        words = line['words']
        # The last word is identified by the number of words; the previous
        # code compared against the number of characters in the line text.
        last_word_index = len(words) - 1

        for word_obj in words:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            x_box_sizes.append(word_xstop - word_xstart)

            if word_index >= 1:
                # NOTE(review): measured from the previous right edge to this
                # word's right edge, so the value includes the word's own
                # width - confirm this is intended before changing it.
                x_gaps.append(word_xstop - last_xstop)

            # words without text contribute a box size / gap only; they also
            # leave last_xstop untouched
            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            if word_index == last_word_index:
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0

            counter_words += 1

            for char in word_text:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            # alphabetical = alphanumerical minus digits
            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(
                counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']),
                            "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions; the first entry is only
        # ever checked for digits, even when it is also the last entry
        last_counter_index = len(counters_wordlengths) - 1
        for counter_index, word_length in enumerate(counters_wordlengths):
            if counter_index == 0:
                numbers_ratio_word = np.round(
                    counters_numbers[counter_index] / word_length, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
                continue

            mostly_alphabetical = (
                word_length >= 4
                and counters_alphabetical_ratios[counter_index] >= 0.75)
            if not mostly_alphabetical:
                continue

            if counter_index == last_counter_index:
                many_alphabetical_in_last_word = True
            else:
                many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)

        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words

        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio

        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps

        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap

        return final_line_features
    def validate_column_features(self,
                                 search_space,
                                 x_index,
                                 reference_char=None,
                                 count_up_similar_references=False):
        """Classify one column of the search space into ``ColumnFeatures``.

        Every row ``search_space[y][x_index]`` for ``y`` in
        ``range(self.get_y_size())`` is counted as wildcard, whitespace,
        none-like (``None``/``True``/``False``) or regular character, and the
        counters are then translated into feature values.

        Args:
            search_space: Indexable rows, each indexable by ``x_index``.
            x_index: Column to inspect.
            reference_char: Optional character whose occurrences among the
                regular characters are counted.
            count_up_similar_references: When True, a regular character
                counts as reference hit if it is contained in any entry
                returned by ``get_simchars_for_char(reference_char)``;
                when False only exact matches count.

        Returns:
            Tuple ``(features, otherchar, otherchar_y_index)``: the list of
            ``ColumnFeatures`` values that apply, the last regular character
            seen in the column (or None) and its row index (or None).
        """
        counter_whitespaces = 0
        counter_wildcards = 0
        counter_nones = 0
        counter_characters = 0
        counter_reference_char = 0
        counter_same_characters = 0
        counter_dict = {}
        counter_special_characters = 0
        most_occuring_char = None

        otherchar = None
        otherchar_y_index = None
        simchars = None
        if reference_char is not None and count_up_similar_references is True:
            simchars = self.get_simchars_for_char(reference_char)
            if len(simchars) != 1:
                self._cpr.print("evaluate")

        y_size = self.get_y_size()
        wildcard_char = self.get_wildcard_char()

        # gather data
        for y_index in range(y_size):
            column_item = search_space[y_index][x_index]

            if column_item == wildcard_char:
                counter_wildcards += 1
            elif column_item == ' ':
                counter_whitespaces += 1
            elif column_item is None or column_item in (False, True):
                # placeholders that carry no character
                counter_nones += 1
            else:
                if reference_char is not None:
                    if count_up_similar_references is False \
                            and column_item == reference_char:
                        counter_reference_char += 1
                    if count_up_similar_references is True \
                            and any(column_item in s for s in simchars):
                        counter_reference_char += 1

                counter_characters += 1
                otherchar = column_item
                otherchar_y_index = y_index

            # occurrence count per item, ignoring None, wildcards and spaces
            if column_item is not None \
                    and column_item != self._wildcard_character \
                    and column_item != " ":
                counter_dict[column_item] = counter_dict.get(column_item, 0) + 1

            if Random.is_special_character(column_item):
                counter_special_characters += 1

        # the highest amount of same characters in this column
        if counter_dict:
            most_occuring_char, counter_same_characters = max(
                counter_dict.items(), key=operator.itemgetter(1))

        # extract features
        features = []
        counter_whitespace_and_wildcards = counter_whitespaces + counter_wildcards
        single_character = counter_characters == 1

        if counter_nones == y_size:
            features.append(ColumnFeatures.ONLY_NONE.value)

        if counter_wildcards == y_size - 1 and single_character:
            features.append(ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value)
            # additional feature, the only char is a special character
            if Random.is_special_character(otherchar):
                features.append(
                    ColumnFeatures.ONE_SPECIALCHAR_REST_WILDCARDS.value)

        if counter_whitespaces == y_size - 1 and single_character:
            features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value)

        if counter_whitespace_and_wildcards == y_size - 1 and single_character:
            features.append(
                ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value)
            # additional feature, the only char is a special character
            if otherchar != self._wildcard_character and otherchar != " " \
                    and Random.is_special_character(otherchar):
                features.append(
                    ColumnFeatures.
                    ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value)

        if counter_reference_char == y_size - 1 and (
                counter_whitespaces == 1 or counter_wildcards == 1):
            features.append(ColumnFeatures.MOSTLY_REFERENCE_CHAR.value)
        if counter_whitespaces == y_size:
            features.append(ColumnFeatures.ONLY_WHITESPACE.value)
        # Uses the wildcard counter: the reference counter never sees
        # wildcard cells, so the previous check on it could not detect an
        # all-wildcard column and flagged all-reference columns instead.
        if counter_wildcards == y_size:
            features.append(ColumnFeatures.ONLY_WILDCARD.value)
        if counter_whitespace_and_wildcards == y_size:
            features.append(ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value)
        if counter_reference_char >= 1:
            features.append(ColumnFeatures.CONTAINS_REFERENCE_CHAR.value)
        if counter_same_characters == y_size \
                and counter_special_characters == y_size:
            features.append(ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value)
        if Random.is_special_character(most_occuring_char) \
                and counter_same_characters == y_size - 1 \
                and most_occuring_char != self._wildcard_character \
                and counter_whitespace_and_wildcards == 1:
            features.append(ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value)

        return features, otherchar, otherchar_y_index