import inspect

import numpy as np
import regex  # third-party 'regex' package (supports the (?<name>...) groups used below)

# Project-internal dependencies referenced by these example classes (exact
# module paths are project-specific and assumed to be importable here):
# ConfigurationHandler, ConditionalPrint, Filo, Random, SpecialChars,
# SearchSpace, SearchSpaceProcessor, and the helper modules cf and dh.


class AkfParsingFunctionsTablesOne(object):
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_TABLES_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions tables one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_dividenden(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

    def parse_dividenden_auf_xyaktien(self, real_start_tag, content_texts,
                                      content_lines, feature_lines,
                                      segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)
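
# Note: the three table parsers above only run the standard segment check and
# log the segment; no table values are extracted here yet.
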
class AkfParsingFunctionsOne(object):
    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_ONE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions one")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_firmenname(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # get basic data
        element_counter = 0

        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info: the company name is the joined remaining text
        accumulated_text = " ".join(content_texts).strip()

        only_add_if_value = False
        self.ef.add_to_my_obj("Firmenname",
                              accumulated_text,
                              object_number=element_counter,
                              only_filled=only_add_if_value)

    def parse_sitz(self, real_start_tag, content_texts, content_lines,
                   feature_lines, segmentation_class):
        """
         "Sitz": [
                {
                  "origpost": "Mergenthalerallee 79-81, 65760 Eschborn Telefon:(069) 7 50 06-0 Telefax:(069) 7 50 06-111 e-mail:[email protected] Internetseite:http://www.3u.net ",
                  "type": "Sitz",
                  "street": "Mergenthalerallee",
                  "street_number": "79-81",
                  "zip": "65760",
                  "city": "Eschborn",
                  "phone": "(069) 7 50 06-0",
                  "fax": "(069) 7 50 06-111",
                  "email": [
                    "*****@*****.**"
                  ],
                  "www": [
                    "http://www.3u.net"
                  ]
                }
              ],
        """
        # get basic data
        element_counter = 0

        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # get relevant info
        num_id, city, street, street_number, additional_info = cf.parse_id_location(
            origpost_red)
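        # cf.parse_id_location splits the reduced origpost into
        # (num_id, city, street, street_number, additional_info); see the
        # docstring above for the kind of address data this targets.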

        # add stuff to ef
        only_add_if_value = True
        self.ef.add_to_my_obj("numID",
                              num_id,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("city",
                              city,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street",
                              street,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("street_number",
                              street_number,
                              object_number=element_counter,
                              only_filled=only_add_if_value)
        self.ef.add_to_my_obj("additional_info",
                              additional_info,
                              object_number=element_counter,
                              only_filled=only_add_if_value)

        return True

    def parse_verwaltung(self, real_start_tag, content_texts, content_lines,
                         feature_lines, segmentation_class):
        # kmy_obj_2 = self.ef.print_me_and_return()
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        # self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        if "srat" in real_start_tag:
            # Verwaltungsrat ..
            persons_final = cf.parse_persons(
                origpost_red, self.dictionary_handler,
                self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
            only_add_if_filled = True
            for entry in persons_final:
                name, first_name, last_name, city, title, funct, rest_info = entry
                # write each parsed person field in one pass instead of seven
                # near-identical calls
                for key, value in (("name", name),
                                   ("first_name", first_name),
                                   ("last_name", last_name),
                                   ("city", city),
                                   ("title", title),
                                   ("rest", rest_info),
                                   ("funct", funct)):
                    self.ef.add_to_my_obj(key,
                                          value,
                                          object_number=element_counter,
                                          only_filled=only_add_if_filled)

                element_counter += 1
            return True
        elif "Verw." in real_start_tag:
            # Verw.
            num_id, city, street, street_number, additional_info = cf.parse_id_location(
                origpost_red)

            # add stuff to ef
            only_add_if_value = True
            self.ef.add_to_my_obj("numID",
                                  num_id,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city",
                                  city,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street",
                                  street,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("street_number",
                                  street_number,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)
            self.ef.add_to_my_obj("additional_info",
                                  additional_info,
                                  object_number=element_counter,
                                  only_filled=only_add_if_value)

            return True
        else:
            # Verwaltung
            final_items = cf.parse_general_and_keys(
                content_texts,
                join_separated_lines=False,
                current_key_initial_value="General_Info")
            for key in final_items.keys():
                value = final_items[key]
                if value is None or value == "":
                    continue
                self.ef.add_to_my_obj(key,
                                      value,
                                      object_number=element_counter,
                                      only_filled=True)
                element_counter += 1
            return True

    def parse_telefon_fernruf(self, real_start_tag, content_texts,
                              content_lines, feature_lines,
                              segmentation_class):
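        # Worked sketch (hypothetical OCR inputs, illustrative only):
        #   "Sa.-Nr. (0211) 2 63 62"  -> vorwahl "(0211)", number_Sa.-Nr. "2 63 62"
        #   "Düsseldorf 1 36 62."     -> location "Düsseldorf", number_Sa.-Nr. "1 36 62"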

        # get basic data
        origpost, origpost_red, element_counter, content_texts = cf.add_check_element(
            self, content_texts, real_start_tag, segmentation_class, 0)
        # do special match: Verwaltung und Betriebshof
        split_post = []

        match_special = regex.match(
            r"(?<Verw>Verwaltung.*)"
            r"(?<Betr>Betriebshof.*)", origpost_red)
        if match_special:
            betriebshof = match_special.group("Betr")
            verwaltung = match_special.group("Verw")
            origpost_red = origpost_red.replace(betriebshof, "")
            origpost_red = origpost_red.replace(verwaltung, "")
            split_post.append(betriebshof)
            split_post.append(verwaltung)
        # do special match: Ortsgespräche and Ferngespräche

        match_special2 = regex.match(
            r"(?<og>Ortsgespräche.*)"
            r"(?<fg>Ferngespräche.*)", origpost_red)
        if match_special2:
            ortsgespr = match_special2.group("og")
            ferngespr = match_special2.group("fg")
            origpost_red = origpost_red.replace(ortsgespr, "")
            origpost_red = origpost_red.replace(ferngespr, "")
            split_post.append(ortsgespr)
            split_post.append(ferngespr)

        # do special match: Ortsverkehr and Fernverkehr

        match_special3 = regex.match(
            r"(?<ov>Ortsverkehr.*)"
            r"(?<fv>Fernverkehr.*)", origpost_red)
        if match_special3:
            ortsverkehr = match_special3.group("ov")
            fernverkehr = match_special3.group("fv")
            origpost_red = origpost_red.replace(ortsverkehr, "")
            origpost_red = origpost_red.replace(fernverkehr, "")
            split_post.append(ortsverkehr)
            split_post.append(fernverkehr)

        # do special match: check if only numbers
        origpost_red_new = origpost_red
        #only_num_check = origpost_red.replace("und", "").replace(",", "").replace(" ", "")
        test_split = regex.split(r"\su\.|\sund\s|,|;", origpost_red)
        for number in test_split:
            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", number)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                number = number.replace(parenthesis, "")  # remove number
                self.ef.add_to_my_obj("vorwahl",
                                      parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word_num = regex.search(r"(?<word>[^\d]*)(?<num>[\d\s\-/]*)",
                                          number)
            if match_word_num is None:
                continue

            word = match_word_num.group("word")
            num = match_word_num.group("num")
            if "Sa." in word and "Nr" in word:
                continue
            number_stripped = num.strip(" ./").replace("/", "").replace(
                "-", "").replace(" ", "")
            if number_stripped.isdigit():
                origpost_red_new = origpost_red_new.replace(
                    number, "")  # remove number
                origpost_red_new = origpost_red_new.replace(
                    word, "")  # remove word found

                change1 = self.ef.add_to_my_obj("number_Sa.-Nr.",
                                                num.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                change2 = self.ef.add_to_my_obj("location",
                                                word.strip(),
                                                object_number=element_counter,
                                                only_filled=True)
                if change1 or change2:
                    element_counter += 1

        #if "32 20 47" in origpost_red:
        #    print("asd")

        origpost_red = origpost_red_new
        # substitute in a separator char to integrate delimiters in next step
        origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
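        # e.g. "Sa.-Nr. 2 63 62. Ortsverkehr ..." becomes
        # "Sa.-Nr. 2 63 62.~~~~ Ortsverkehr ...", so the split below also
        # cuts after 'digit dot' boundaries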

        # do further matches (split at ';', '~~~~' and ' u.')
        split_post.extend(regex.split(r';|~~~~|\su\.', origpost_red))

        for index, entry in enumerate(split_post):
            if entry is None:
                continue
            entry_stripped = entry.strip()
            if entry_stripped == "":
                continue

            # additional parenthesis block
            match_parenthesis = regex.search(r"\(.*\)", entry_stripped)
            parenthesis = None
            if match_parenthesis:
                parenthesis = match_parenthesis.group()
                entry_stripped = entry_stripped.replace(parenthesis,
                                                        "")  # remove entry
                self.ef.add_to_my_obj("vorwahl",
                                      parenthesis,
                                      object_number=element_counter,
                                      only_filled=True)

            match_word = regex.match(r"(?<Tag>\D*)"
                                     r"(?<Numbers>[\d\s\W]*)", entry_stripped)
            if match_word is not None:
                # fetch match results
                tag_match = match_word.group("Tag")
                numbers_match = match_word.group("Numbers")
                rest_from_entry_str = entry_stripped.replace(tag_match, "", 1)
                rest_from_entry_str = rest_from_entry_str.replace(
                    numbers_match, "", 1)

                tag = dh.strip_if_not_none(tag_match, "")
                match_tag = regex.match(
                    r"(?<rest_bef>.*)(?<sanr>Sa\.?\-Nr\.?)(?<rest_end>.*)",
                    tag)
                location = ""
                if match_tag is not None:
                    rest_tag = match_tag.group('rest_bef')
                    rest_tag_2 = match_tag.group('rest_end')
                    # sanr = match_tag.group('sanr') # this is the filtered group
                    location = dh.strip_if_not_none(
                        rest_tag + " " + rest_tag_2, ":., ")
                else:
                    # if there are no real descriptors in tag then tag is usually location  (like Düsseldorf 1 36 62.)
                    location = tag

                if "und" in location:
                    location = regex.sub(r"[^\w]und[^\w]", "", location)

                number = dh.strip_if_not_none(numbers_match, "., ")
                self.ef.add_to_my_obj("number_Sa.-Nr.",
                                      number.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                self.ef.add_to_my_obj("location",
                                      location.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                additional_info_entry_level = dh.strip_if_not_none(
                    rest_from_entry_str, ",. ")
                self.ef.add_to_my_obj("additional_info",
                                      additional_info_entry_level.strip(),
                                      object_number=element_counter,
                                      only_filled=True)
                element_counter += 1

                origpost_red = origpost_red.replace(number, "", 1)
                origpost_red = origpost_red.replace(location, "", 1)

        origpost_red = origpost_red.replace("Sa.-Nr", "").replace("~~~~", "")
        origpost_red_end = dh.remove_multiple_outbound_chars(origpost_red)

        if len(origpost_red_end) > 3:
            self.ef.add_to_my_obj("additional_info_unparsed",
                                  origpost_red_end.strip(),
                                  object_number=element_counter)

    def parse_vorstand(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):

        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)

        only_add_if_filled = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            # write each parsed person field in one pass
            for key, value in (("name", name),
                               ("first_name", first_name),
                               ("last_name", last_name),
                               ("city", city),
                               ("title", title),
                               ("rest", rest_info),
                               ("funct", funct)):
                self.ef.add_to_my_obj(key,
                                      value,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filled)
            element_counter += 1
        """
        # do  matches (;-separated)
        split_post = origpost_red.split(';')

        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()

            if index == len(split_post)-1:
                matchend = regex.match("^[Aa]lle", entry_stripped)
                if matchend:
                    self.ef.add_to_my_obj("additional_info", entry_stripped, object_number=element_counter)
                    element_counter += 1
                    continue

            match = regex.match(r"(?<Name>.*)[,]"             # find location string
                                r"(?<Rest>.*+)",              # just get the rest which is usually streetname and number, but has other possibilities
                                entry_stripped)
            if match is None:
                name = dh.strip_if_not_none(entry_stripped, ", ")
                self.ef.add_to_my_obj("name", name, object_number=element_counter)
                element_counter += 1
                continue

            name = dh.strip_if_not_none(match.group("Name"), ", ")
            rest = dh.strip_if_not_none(match.group("Rest"), ",. ")
            name_split = name.split(',')
            if len(name_split) > 1:
                position = rest
                name = name_split[0]
                city = name_split[1]
            else:
                city = rest
                position = ""

            self.ef.add_to_my_obj("name", name, object_number=element_counter)
            self.ef.add_to_my_obj("city", city, object_number=element_counter)
            self.ef.add_to_my_obj("position", position, object_number=element_counter)
            element_counter += 1
            """

        return True

    def parse_aufsichtsrat(self, real_start_tag, content_texts, content_lines,
                           feature_lines, segmentation_class):

        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # try to fix "+)" problems before person parsing
        origpost_red = origpost_red.replace("; +)", "+);").replace(
            ";+)", "+);").replace("')", "").replace("*)", "")

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)

        only_add_if_filled = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            # write each parsed person field in one pass
            for key, value in (("name", name),
                               ("first_name", first_name),
                               ("last_name", last_name),
                               ("city", city),
                               ("title", title),
                               ("rest", rest_info),
                               ("funct", funct)):
                self.ef.add_to_my_obj(key,
                                      value,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filled)
            element_counter += 1

        return True

    def parse_arbeitnehmervertreter(self, real_start_tag, content_texts,
                                    content_lines, feature_lines,
                                    segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        persons_final = cf.parse_persons(
            origpost_red, self.dictionary_handler,
            self.config.USE_DICTIONARIES_FOR_PERSON_PARSING)
        only_add_if_filled = True
        for entry in persons_final:
            name, first_name, last_name, city, title, funct, rest_info = entry
            # write each parsed person field in one pass
            for key, value in (("name", name),
                               ("first_name", first_name),
                               ("last_name", last_name),
                               ("city", city),
                               ("title", title),
                               ("rest", rest_info),
                               ("funct", funct)):
                self.ef.add_to_my_obj(key,
                                      value,
                                      object_number=element_counter,
                                      only_filled=only_add_if_filled)

            element_counter += 1

        return True

    # Gruendung
    def parse_gruendung(self, real_start_tag, content_texts, content_lines,
                        feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
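        # e.g. (hypothetical) "1923 (umgewandelt 1956)" -> year "1923",
        # rest_info "umgewandelt 1956" (the strip removes surrounding ".,() ")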
        # \d+ rather than \d* so the else-branch is reachable when no year leads
        match_year = regex.search(r"^\d+", origpost_red.strip())
        if match_year:
            result = match_year.group()
            origpost_red_new = origpost_red.replace(result, "", 1)
            year = dh.strip_if_not_none(result, ".,() ")
            rest_info = dh.strip_if_not_none(origpost_red_new, ".,() ")
            self.ef.add_to_my_obj("rest_info",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=True)
            self.ef.add_to_my_obj("year",
                                  year,
                                  object_number=element_counter,
                                  only_filled=True)
        else:
            rest_info = dh.strip_if_not_none(origpost_red, ".,() ")
            self.ef.add_to_my_obj("rest_info",
                                  rest_info,
                                  object_number=element_counter,
                                  only_filled=True)

    # Tätigkeitsgebiet
    def parse_taetigkeitsgebiet(self, real_start_tag, content_texts,
                                content_lines, feature_lines,
                                segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        final_items = cf.parse_general_and_keys(
            content_texts,
            join_separated_lines=False,
            current_key_initial_value="General_Info")
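        # parse_general_and_keys buckets the text lines under detected key
        # headings; lines before the first detected key land in "General_Info"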

        for key in final_items.keys():
            value = final_items[key]
            if value is None or len(value) == 0:
                continue
            self.ef.add_to_my_obj(key,
                                  value,
                                  object_number=element_counter,
                                  only_filled=True)
            element_counter += 1
class TableHandler(object):

    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_TABLE_HANDLER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.PRINT_TO_CHECKFILE = False
        # a line starting with these words can't be in a table
        self.filter_start_words = ["Fernruf:", "Vorstand:", "Fernschreiber:",
                                   "von","Gründung:", "Ordnungsnr.", "Ordnungsnr",
                                   "Grundkapital:","Umstellung"]

        # with open("checkfile_tables.txt", "w") as myfile:
        #     myfile.write("----" + "\n")

    def recognize_a_line(self, line):

        if line is None or isinstance(line, bool) or line.textstr is None:
            return False

        whole_text = line.textstr
        self.cpr.print("recognizing line:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        for key_index, key in enumerate(line.word['text']):
            word = line.word['text'][key]
            uid_info = line.word['UID'][key]
            word_xstart = line.data['word_x0'][character_index]
            word_xstop = line.data['word_x1'][character_index]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if key_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            #line.data['word_x0']
            if word is None or word == "":
                continue

            if key_index == 0:
                if word in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word[0] == "(":
                    starts_with_parenthesis = True


            if key_index == len(line.word['text'])-1:
                if word[-1] == ")":
                    ends_with_parenthesis = True



            counter_alphabetical_chars_word = 0
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0


            counter_words += 1

            word_list = list(word)
            for char in word_list:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(counter_alphabetical_word/len(word), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word))
            counters_numbers.append(counter_numbers_word)
            character_index += len(uid_info)
            last_xstop = word_xstop


        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers


        if counter_chars == 0:
            self.cpr.printw("line without chars shouldn't happen, no recognition")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars


        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers/counter,2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths)-1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True

            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True



        self.cpr.print("alle cntr:", counter_chars)
        self.cpr.print("spec cntr:", counter_special_chars, "ratio", special_chars_ratio)
        self.cpr.print("alnr cntr:", counter_alphanumerical_chars, "ratio", alphanumerical_chars_ratio)
        self.cpr.print("albt cntr:", counter_alphabetical, "ratio", alphabetical_ratio)
        self.cpr.print("spce cntr:", counter_spaces, "ratio", spaces_ratio)
        self.cpr.print("nmbr cntr:", counter_numbers, "ratio", numbers_ratio)
        self.cpr.print("x_box_sizes", x_box_sizes)
        self.cpr.print("x_gaps", x_gaps)
        self.cpr.print("x_gap_max_size", maximum_x_gap)
        self.cpr.print("x_gaps_mean", mean_x_gap)
        self.cpr.print("x_gaps_median", median_x_gap)

        if "Gewinn nach Vortrag" in whole_text:
            print("")


        if ((alphabetical_ratio < 0.75 and \
            numbers_ratio > 0.2 and \
            counter_chars > 5 and \
            counter_words >= 2) and not \
            (starts_with_parenthesis and ends_with_parenthesis)) or ultimo_is_first_word:

            if first_word_no_table_indicator:
                return False

            # mean_x_gap is None when the line has fewer than two words
            if mean_x_gap is None or mean_x_gap <= 115:
                return False
            if many_alphabetical_in_last_word:
                return False
            if many_alphabetical_in_middle_words and many_numbers_in_first_word:
                return False


            self.cpr.print("possible entry:", whole_text)

            if self.PRINT_TO_CHECKFILE:
                with open("checkfile_tables.txt", "a") as myfile:
                    myfile.write(whole_text+ "||| max x_gap: " + str(maximum_x_gap)+"||| mean x_gap: " + str(mean_x_gap) \
                             + "||| median x_gap: " + str(median_x_gap)+"\n")

            print("jab")
            return True

        return False
class OCRVoter(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)
        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL)
        self.cpr_vocab_check = ConditionalPrint(
            self.config.PRINT_VOCABULARY_CHECKER,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)
        self.cpr_sc_predict = ConditionalPrint(
            self.config.PRINT_SPECIALCHAR_PREDICTOR,
            self.config.PRINT_EXCEPTION_LEVEL, self.config.PRINT_WARNING_LEVEL)

        self.filo_last_chars = Filo(250)
        self.predictor = None
        self.use_aufsichtsrat_prediction = False
        self.vocab_checker = None
        self.previous_word_with_seperator = False

    def add_predictor(self, predictor):
        self.predictor = predictor

    def add_vocab_checker(self, vocab_checker):
        self.vocab_checker = vocab_checker

    def get_same_count(self, c1, c2, c3):
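        # counts how many of c2, c3 equal c1, e.g. ('a', 'a', 'b') -> 1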
        same_ctr = 0
        if c1 == c2:
            same_ctr += 1

        if c1 == c3:
            same_ctr += 1

        return same_ctr

    def get_confidence_count(self,
                             char1,
                             char2,
                             char3,
                             cconf1,
                             cconf2,
                             cconf3,
                             wildcard_char='¦'):
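        # accumulates the confidences of all characters equal to char1 and
        # returns (match_count, accumulated_confidence); illustrative call
        # with hypothetical numbers: ('e', 'e', 'c', 90, 80, 40) -> (1, 170.0)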
        def get_other_char(char_first, char_sec, char_thrd, co1, co2, co3):
            if char_first != char_sec:
                return char_sec, float(co2)
            elif char_first != char_thrd:
                return char_thrd, float(co3)

        same_ctr = 0
        cconf_ctr = float(cconf1)

        if char1 == char2:
            same_ctr += 1
            cconf_ctr += float(cconf2)
        if char1 == char3:
            same_ctr += 1
            cconf_ctr += float(cconf3)

        # special cases space: ' ', ' ', 'x'
        # wildcard character : '¦', '¦', '¦'

        if char1 == ' ' and same_ctr == 1:
            return 1, 95.0  # todo j4t: early return, the threshold logic below is unreachable

            # if the confidence of the other character is below SPACE_TRESH,
            # the space gets the high put-in confidence value
            SPACE_TRESH = 50.0
            SPACE_PUT_IN_VALUE = 99.0
            otherchar, otherconf = get_other_char(char1, char2, char3, cconf1,
                                                  cconf2, cconf3)
            # print("otherchar", otherchar, "otherconf", otherconf)
            if otherconf < SPACE_TRESH:
                return 1, SPACE_PUT_IN_VALUE

        elif char1 == wildcard_char and same_ctr == 1:  #todo: differentiate type of character ??
            # if there is two wildcards and one characters, characters confidence has to be higher than
            # WILDCARD_TRESH to be taken

            wildcard_tresh = 98.5
            if self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                wildcard_tresh -= 10  # 0:99,19%, 20:99.16%, 10:99.27%

            return 1, wildcard_tresh

        elif char1 == wildcard_char and same_ctr == 0:
            pass  # todo maybe cover this case (cause wildcard has no confidence i.e if the two otherchars are very low prob, take wildcard)
        elif char1 == '' and same_ctr == 0:
            pass  # todo maybe cover this case (cause space has no confidence ...
        elif self.config.MSA_BEST_VOTING_DOWNSCALE_ONLY_SC \
            and Random.is_special_character(char1) and same_ctr == 0 \
            and char2 == wildcard_char and char3 == wildcard_char:
            # lower the confidence of special characters which stand without any other chars
            return same_ctr, cconf_ctr * 0.9

        return same_ctr, cconf_ctr

    def vote_best_of_three_simple(self,
                                  text_1,
                                  text_2,
                                  text_3,
                                  index_best,
                                  wildcard_character='¦'):
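        # character-wise majority vote over three aligned lines, e.g.
        # ("Haus", "H¦us", "Haus") -> "Haus"; on full disagreement np.argmax
        # takes the first index, so text_2's character wins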
        list_line_1 = list(text_1)
        list_line_2 = list(text_2)
        list_line_3 = list(text_3)

        accumulated_chars = ""
        accumulated_confs = Filo
        for character_index, character_1 in enumerate(list_line_1):
            character_2 = list_line_2[character_index]
            character_3 = list_line_3[character_index]

            clist = [character_1, character_2, character_3]
            # get the character which occurs the most
            sc1 = self.get_same_count(character_1, character_2, character_3)
            sc2 = self.get_same_count(character_2, character_1, character_3)
            sc3 = self.get_same_count(character_3, character_2, character_1)
            maxindices = np.argmax([sc2, sc1, sc3])
            if maxindices == 0:
                accumulated_chars += character_2
            elif maxindices == 1:
                accumulated_chars += character_1
            else:
                accumulated_chars += character_3

        accumulated_chars_stripped = accumulated_chars.replace(
            wildcard_character, '')

        return accumulated_chars, accumulated_chars_stripped

    def vote_best_of_three_charconfs(self,
                                     line_1,
                                     line_2,
                                     line_3,
                                     index_best,
                                     wildcard_character='¦'):
        try:

            def try_obtain_charconf(value, undef_value=0):
                if value is None or value is False or value is True:
                    return undef_value
                return value

            def try_obtain_char(charlist, index):
                if index >= len(charlist):
                    return False  #j4t means not defined
                else:
                    return charlist[index]

            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)
            #if "¦¦lt.H" in line_1.textstr:
            #    self.cpr.print("asd")

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""

            # check: is list 1 always best reference?
            for character_index in range(0, maximum_char_number):

                character_1 = line_1.value(key_char, character_index)
                character_2 = line_2.value(key_char, character_index)
                character_3 = line_3.value(key_char, character_index)

                charconf_1 = try_obtain_charconf(
                    line_1.value(key_confs, character_index, wsval=50.0))
                charconf_2 = try_obtain_charconf(
                    line_2.value(key_confs, character_index, wsval=50.0))
                charconf_3 = try_obtain_charconf(
                    line_3.value(key_confs, character_index, wsval=50.0))

                clist = [character_1, character_2, character_3]
                # get the character which occurs the most
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                # argmax resolves ties to the first entry, which prioritises
                # line 2's character when the chars are the same
                maxindices = np.argmax([acc_conf_2, acc_conf_1, acc_conf_3])
                # todo: move this special case to config
                if character_index == maximum_char_number - 1 \
                        and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                if maxindices == 0:
                    accumulated_chars += character_2
                elif maxindices == 1:
                    accumulated_chars += character_1
                else:
                    accumulated_chars += character_3

            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')

            return accumulated_chars, accumulated_chars_stripped
        except Exception as ex:
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote:",
                             ex)
            self.cpr.printex("trace is:", tr)

    def increase_umlaut_confidence(self, chars, charconfs):
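        # push the confidence of umlauts and special chars so that engines
        # which actually recognized them win the vote, e.g. an 'ä' with conf
        # 80 becomes 80 + SpecialChars.umlaut_increment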

        charconfs_adapted = []

        for char_index, char in enumerate(chars):
            if char in SpecialChars.umlauts_caps or char in SpecialChars.umlauts:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.umlaut_increment
            elif char in SpecialChars.special_chars:
                cconf_to_add = charconfs[
                    char_index] + SpecialChars.special_char_increment
            else:
                cconf_to_add = charconfs[char_index]

            charconfs_adapted.append(cconf_to_add)

        return charconfs_adapted

    def vote_best_of_three_charconfs_searchspaces(self,
                                                  line_1,
                                                  line_2,
                                                  line_3,
                                                  index_best,
                                                  wildcard_character='¦'):
        try:

            key_confs_mapping = 'UID'
            key_confs = 'x_confs'
            key_char = 'calc_char'
            self.cpr.print("vote_text1", line_1.textstr)
            self.cpr.print("vote_text2", line_2.textstr)
            self.cpr.print("vote_text3", line_3.textstr)
            #if "Beteiligung:" in line_1.textstr:
            #     self.cpr.print("asd")

            maximum_char_number = max(len(line_1.textstr), len(line_2.textstr),
                                      len(line_3.textstr))

            accumulated_chars = ""
            accumulated_confs = Filo(300)

            # search space settings
            SEARCH_SPACE_Y_SIZE = 3
            SEARCH_SPACE_X_SIZE_OUTER = 7
            SEARCH_SPACE_X_SIZE_INNER = 3
            SEARCH_SPACE_X_SEARCH_RANGE = 1
            SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR = '¦'
            SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS = True
            SEARCH_RANGE = 1
            PRINT_MATRICES = self.config.PRINT_SEARCH_SPACE_MATRICES

            # initialize search space processor and search spaces
            search_space_processor = SearchSpaceProcessor(SEARCH_SPACE_Y_SIZE, SEARCH_SPACE_X_SIZE_INNER, \
                                                          wildcard_character, SEARCH_SPACE_PROCESSING_SUBSTITUTION_CHAR)

            ssp_chars = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)
            ssp_confs = SearchSpace(SEARCH_SPACE_Y_SIZE,
                                    SEARCH_SPACE_X_SIZE_OUTER,
                                    SEARCH_SPACE_X_SEARCH_RANGE, True)
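            # the two search spaces act as sliding windows (3 lines x 7 cols)
            # over chars and confs; each loop iteration below pushes one new
            # column and re-examines the middle 3x3 region for swap fixes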

            # check if one of the lines is empty for certain settings
            one_line_empty = False
            if self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS or \
                self.config.MSA_BEST_CHANGE_VOTING_TRESHS_ON_EMPTY_LINE:
                one_line_empty = self.check_if_one_line_empty(
                    [line_1, line_2, line_3], wildcard_character)

            # loop through the maximum character range of the lines
            range_extension = SEARCH_SPACE_X_SIZE_INNER
            # check: is list 1 always best reference?
            for character_index in range(0, maximum_char_number + range_extension + 2):

                if character_index < maximum_char_number:
                    # if there is a character within range (no padding char from extension)
                    # get character values and obtain corresponding confidences (from searchspace because they might
                    # be different to normal values because of swapping
                    line_vals = [line_1.value(key_char, character_index), line_2.value(key_char, character_index), \
                                 line_3.value(key_char, character_index)]

                    line_1_conf = line_1.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_2_conf = line_2.value(key_confs,
                                               character_index,
                                               wsval=50.0)
                    line_3_conf = line_3.value(key_confs,
                                               character_index,
                                               wsval=50.0)

                    charconf_1 = self.try_obtain_charconf_searchspace(
                        line_1_conf,
                        line_vals[0],
                        engine_key=line_1.name[0],
                        one_line_empty=one_line_empty)
                    charconf_2 = self.try_obtain_charconf_searchspace(
                        line_2_conf,
                        line_vals[1],
                        engine_key=line_2.name[0],
                        one_line_empty=one_line_empty)
                    charconf_3 = self.try_obtain_charconf_searchspace(
                        line_3_conf,
                        line_vals[2],
                        engine_key=line_3.name[0],
                        one_line_empty=one_line_empty)
                    charconf_vals = [charconf_1, charconf_2, charconf_3]
                else:
                    # if the character is within padding range just give none values for characters and confidences
                    line_vals = [None, None, None]
                    charconf_vals = [None, None, None]

                # fill searchspace with the chars and confidences
                ssp_chars.push_column(line_vals)
                ssp_confs.push_column(charconf_vals)

                # update the mid-window of the search space (this is the actual search space processing step)
                mid_chars = ssp_chars.get_middle_matrix(PRINT_MATRICES)
                mid_confs = ssp_confs.get_middle_matrix(PRINT_MATRICES)
                mid_chars_processed, mid_confs_processed, change_done = \
                    search_space_processor.process_search_space(mid_chars, mid_confs,SEARCH_SPACE_PROCESSING_USE_SIMILAR_CHARS)
                if change_done is True:
                    ssp_chars.update_middle_matrix(mid_chars_processed)
                    ssp_confs.update_middle_matrix(mid_confs_processed)

                # extract changed values from search space
                character_offset = -(SEARCH_SPACE_X_SEARCH_RANGE + 1)
                character_1 = ssp_chars.get_value_around_middle(
                    0, character_offset)
                character_2 = ssp_chars.get_value_around_middle(
                    1, character_offset)
                character_3 = ssp_chars.get_value_around_middle(
                    2, character_offset)
                charconf_1 = ssp_confs.get_value_around_middle(
                    0, character_offset)
                charconf_2 = ssp_confs.get_value_around_middle(
                    1, character_offset)
                charconf_3 = ssp_confs.get_value_around_middle(
                    2, character_offset)
                if character_1 is None or character_2 is None or character_3 is None:
                    # self.cpr.print("test")
                    continue

                # in case umlaut confidence increment is active change charconfs otherwise same charconfs
                charconf_1, charconf_2, charconf_3 = self.increase_umlaut_confidence_searchspace(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)

                # get the previous characters from other lines as string (mainly for predictor)
                filo_content = self.filo_last_chars.get_content_as_string()

                # trigger predicted section for aufsichtsrat predictor
                self.toggle_predictor(filo_content)

                # predict_char if predictor is enabled
                predicted_char = self.predict_char(filo_content)

                # get the character which occurs the most by accumulating confidence scores
                sc1, acc_conf_1 = self.get_confidence_count(
                    character_1, character_2, character_3, charconf_1,
                    charconf_2, charconf_3)
                sc2, acc_conf_2 = self.get_confidence_count(
                    character_2, character_1, character_3, charconf_2,
                    charconf_1, charconf_3)
                sc3, acc_conf_3 = self.get_confidence_count(
                    character_3, character_2, character_1, charconf_3,
                    charconf_2, charconf_1)
                # argmax resolves ties to the first entry, which prioritises
                # line 2's character when the chars are the same
                maxindices = np.argmax([acc_conf_2, acc_conf_1, acc_conf_3])

                if character_index == maximum_char_number + range_extension + 1 \
                        and character_2 == "¦" and character_3 == "¦" and character_1 == "I":
                    continue

                # drop chars completely if they fall below a certain dropping treshhold and the setting is active
                if self.config.MSA_BEST_VOTER_DROP_CHARS_BELOW_TRESH:
                    tresh = self.config.MSA_BEST_VOTER_DROPPING_TRESH
                    maximum_conf = max(acc_conf_1, acc_conf_2, acc_conf_3)
                    if maximum_conf < tresh:
                        if [character_2, character_1, character_3
                            ][maxindices] != '¦':
                            continue

                # determine character with the best accumulated confidence
                voted_char = None
                voted_acc_conf = None
                if maxindices == 0:
                    voted_char = character_2
                    voted_acc_conf = acc_conf_2
                elif maxindices == 1:
                    voted_char = character_1
                    voted_acc_conf = acc_conf_1
                else:
                    voted_char = character_3
                    voted_acc_conf = acc_conf_3

                # if the predictor is active, check if there is a better predicted char which can replace the voted character
                voted_char = self.maybe_replace_voted_by_predicted_char(
                    voted_char, self.use_aufsichtsrat_prediction,
                    predicted_char, wildcard_character, voted_acc_conf,
                    character_1, character_2, character_3)
                # push the voted char and the accumulated confidence of this char to results
                accumulated_confs.push(voted_acc_conf)
                accumulated_chars += voted_char

                # if the predictor is enabled fill the filo with the voted_char
                self.fill_filo_last_chars(voted_char)

            # do vocabulary related steps, if activated
            accumulated_chars = self.vocabulary_related_corrections(
                accumulated_chars, wildcard_character, accumulated_confs)

            # remove the wildcard characters and return result
            accumulated_chars_stripped = accumulated_chars.replace(
                wildcard_character, '')
            return accumulated_chars, accumulated_chars_stripped

        except Exception as ex:
            tr = inspect.trace()

            self.cpr.printex("ocr_voter.py Exception during confidence vote",
                             ex)
            self.cpr.printex("trace", tr)

    def vocabulary_related_corrections(self, accumulated_chars,
                                       wildcard_character, accumulated_confs):

        if self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE:
            accumulated_chars_final = ""
            acc_split = accumulated_chars.split()
            len_split = len(acc_split)

            for word_index, word in enumerate(acc_split):

                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    if word_index == len_split - 1 and word.replace(
                            wildcard_character, "").endswith('-'):
                        self.previous_word_with_seperator = True
                        accumulated_chars_final += word + " "
                        continue
                    if word_index == 0:
                        if self.previous_word_with_seperator is True:
                            self.previous_word_with_seperator = False
                            accumulated_chars_final += word + " "
                            continue

                acc_confs_word = accumulated_confs.pop_multi(len(word))
                acc_conf, rate, change, word_starting_borders, word_trailing_borders, word_reduced = \
                    self.vocab_checker.get_accumulated_confidence_rate(word, acc_confs_word, wildcard_character)
                self.cpr_vocab_check.print("w:", word, "wr:", word_reduced,
                                           "accr:", acc_conf, "rate", rate)

                # don't correct words below the minimum vocab length (mind that special chars in the dict are toggled)
                check_len = len(word)
                if self.config.KEYING_RESULT_VC_DICT_REMOVE_SPECIAL_BORDER_CHARS:
                    check_len = len(word_reduced)
                if check_len < self.config.KEYING_RESULT_VC_MIN_VOCAB_WORD_LENGTH:
                    accumulated_chars_final += word + " "
                    continue

                if self.config.KEYING_RESULT_VC_CORRECT_ONLY_ERRONOUS_CHARS:
                    swappable_char_indices = []

                    acc_confs_used = None
                    word_used = None

                    if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                        # use the full length confidences array including trailing and leading special characters
                        acc_confs_used = acc_confs_word
                        word_used = word
                    else:
                        # don't use trailing and starting special characters if no special chars needed
                        acc_confs_used = acc_confs_word[
                            len(word_starting_borders):len(acc_confs_word) - len(word_trailing_borders)]
                        word_used = word_reduced

                    for conf_index, conf in enumerate(acc_confs_used):
                        if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                            if conf <= 250:  # swap candidates: low-confidence special characters
                                character_related = word_used[conf_index]
                                is_special_char = Random.is_special_character(
                                    character_related)
                                if is_special_char and character_related != wildcard_character:
                                    # only swap special character indices
                                    swappable_char_indices.append(conf_index)
                        else:
                            if conf <= 215:  # swap candidates: low-confidence characters
                                swappable_char_indices.append(conf_index)

                    if len(swappable_char_indices) >= 1:
                        word_reduced_correct = self.vocab_checker.correct_text_at_certain_indices_only(
                            word_used, swappable_char_indices)
                        if word_reduced_correct is not None:
                            word_correct_withtrails = None

                            if self.config.KEYING_RESULT_VC_CORRECT_ERRONOUS_SPECIAL_CHARS:
                                if Random.has_special_character(
                                        word_reduced_correct):
                                    # if special character was replaced with special character
                                    word_correct_withtrails = word_reduced_correct
                                else:
                                    # if special character was replaced by alphanumerical character
                                    word_correct_withtrails = word
                            else:
                                word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                            # only print the changed results
                            if word != word_correct_withtrails:
                                self.cpr_vocab_check.print(
                                    "w:", word, "wc:", word_correct_withtrails,
                                    "accr:", acc_conf, "rate", rate)

                            accumulated_chars_final += word_correct_withtrails + " "
                        else:
                            accumulated_chars_final += word + " "
                    else:
                        accumulated_chars_final += word + " "

                    continue

                if rate < self.config.KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH \
                        and len(word_reduced) > 2:
                    # if the rate drops below the threshold, try to fetch a vocab entry
                    word_reduced_correct, suggestions, flh = self.vocab_checker.correct_text(
                        word_reduced)
                    if word_reduced_correct is not None and word_reduced_correct != word_reduced:

                        word_correct_withtrails = word_starting_borders + word_reduced_correct + word_trailing_borders

                        self.cpr_vocab_check.print("w:", word, "wc:",
                                                   word_correct_withtrails,
                                                   "accr:", acc_conf, "rate",
                                                   rate)

                        accumulated_chars_final += word_correct_withtrails + " "
                    else:
                        accumulated_chars_final += word + " "
                else:
                    accumulated_chars_final += word + " "

            accumulated_chars = accumulated_chars_final

        return accumulated_chars
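
    # Illustration (hypothetical example, not from the sources): with the
    # vote-threshold correction active, a word whose accumulated confidence rate
    # falls below KEYING_RESULT_VOCABULARY_CORRECTION_VOTE_TRESH is looked up in
    # the vocabulary, e.g. 'Vorstnad' may come back from correct_text as
    # 'Vorstand' and is then re-wrapped in its original starting and trailing
    # border characters.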

    def try_obtain_charconf_searchspace(
        self,
        value_confidence,
        value,
        undef_value=0,
        engine_key=None,
        one_line_empty=False,
    ):
        if value_confidence is None or value_confidence is False or value_confidence is True:
            return undef_value

        returnvalue = value_confidence

        if self.config.MSA_BEST_VOTER_SCALE_ENGINE_CONFIDENCES and engine_key is not None:
            if engine_key == 'Abbyy':
                if self.config.MSA_BEST_INCREASE_CONFIDENCE_OF_SOME_ABBYY_CHARS:
                    if value == "%":  # improve ocropus in confidence of % because it was trained
                        value_confidence = value_confidence + 80

                returnvalue = ConfidenceModifications.abby_factor * value_confidence
            elif engine_key == 'Tess':
                returnvalue = ConfidenceModifications.tesseract_factor * value_confidence

            elif engine_key == 'Ocro':

                returnvalue = ConfidenceModifications.ocropus_factor * value_confidence

        if (self.config.MSA_BEST_VOTER_PUSH_LESS_LINES_WHITESPACE_CONFS
                or self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD) \
                and one_line_empty and value == " ":
            returnvalue += ConfidenceModifications.whitespace_push

        return returnvalue
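
    # Illustration (factor values assumed, not from the sources): with engine
    # factors of e.g. Abbyy 0.9 / Tess 1.0 / Ocro 1.1, an Abbyy '%' with raw
    # confidence 60 and the special boost active would yield (60 + 80) * 0.9 = 126
    # here, while the same raw confidence from Tess would stay at 60.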

    def check_if_one_line_empty(self, lines, wildcard_character):
        for line in lines:
            text_wo_wildcards = line.textstr.replace(wildcard_character, '')
            if text_wo_wildcards == "":
                return True
            if self.config.MSA_BEST_VOTER_PUSH_WHITESPACE_IF_MOSTLY_WILDCARD:
                # also count high wildcard ratios as an empty line
                wildcard_ratio = 1 - (len(text_wo_wildcards) /
                                      len(line.textstr))
                if wildcard_ratio > 0.70:
                    return True
        return False

    def toggle_predictor(self, filo_content):
        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            if "Aufsichtsrat" in filo_content:
                self.use_aufsichtsrat_prediction = True
            if "Gründung:" in filo_content:
                self.use_aufsichtsrat_prediction = False

    def predict_char(self, filo_content):
        predicted_char = None
        if self.use_aufsichtsrat_prediction:
            len_aufsichtsrat = 19  # length of one prediction chunk
            if len(filo_content) >= len_aufsichtsrat:  # only predict once the filo spans a full chunk
                predicted_char = self.predictor.predict_next_aufsichtsrat_chars(
                    len_aufsichtsrat, filo_content)
                # print("filo", filo_content, "predict:", predicted_char)
        return predicted_char

    def fill_filo_last_chars(self, voted_char):
        """
        fill filo for predictor usage with voted_char some additional chars around this char
        :param voted_char:
        :return:
        """

        if self.config.PREDICTOR_AUFSICHTSRAT_ENABLED:
            # create pre semi-tokenized input strings in the filos from the voted characters for prediction
            if voted_char == ' ':
                # the models usually use the 'ƿ' char as a substitute for spaces
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push('ƿ', filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            elif Random.is_special_character(voted_char):
                self.filo_last_chars.push(' ', filterchar='¦')
                self.filo_last_chars.push(voted_char, filterchar='¦')
                self.filo_last_chars.push(' ', filterchar='¦')
            else:
                self.filo_last_chars.push(voted_char, filterchar='¦')

    def increase_umlaut_confidence_searchspace(self, character_1, character_2,
                                               character_3, charconf_1,
                                               charconf_2, charconf_3):

        if self.config.MSA_BEST_SEARCHSPACE_INCREASE_UMLAUT_CONFIDENCE:
            clist = [character_1, character_2, character_3]
            conflist = [charconf_1, charconf_2, charconf_3]
            charconf_1, charconf_2, charconf_3 = self.increase_umlaut_confidence(clist, conflist)
        return charconf_1, charconf_2, charconf_3

    def maybe_replace_voted_by_predicted_char(self, voted_char, aufsichtsrat_prediction_toggled, predicted_char, \
                                              wildcard_character, voted_acc_conf, character_1, character_2, character_3):
        if aufsichtsrat_prediction_toggled:
            if Random.is_special_character(predicted_char):
                one_char_sc = (Random.is_special_character(character_1)
                               or Random.is_special_character(character_2)
                               or Random.is_special_character(character_3))
                voted_char_sc = Random.is_special_character(voted_char)

                if predicted_char != voted_char and (
                        one_char_sc
                        or voted_char_sc) and voted_char != wildcard_character:
                    # print("FiloContent:", filo_content)
                    self.cpr_sc_predict.print("pc:", predicted_char, "vc:",
                                              voted_char, "vc_acc",
                                              voted_acc_conf)
                    if voted_acc_conf <= 90.0:
                        if voted_char != '\f':  # don't swap formfeeds, they don't get predicted at all
                            self.cpr_sc_predict.print("swap")
                            voted_char = predicted_char

        return voted_char
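
# A minimal, self-contained sketch (not from the original sources) of the
# confidence vote performed above: each engine proposes a character with a
# confidence, confidences of identical characters are accumulated, and
# np.argmax over the reordered list [conf_2, conf_1, conf_3] makes the middle
# line win ties, mirroring the prioritization comment in the voter loop.
import numpy as np

def vote_char_sketch(chars, confs):
    # accumulate the confidences of identical proposed characters
    accumulated = {}
    for char, conf in zip(chars, confs):
        accumulated[char] = accumulated.get(char, 0) + conf
    acc_per_engine = [accumulated[char] for char in chars]
    # the ordering encodes the tie-break priority: engine 2, then 1, then 3
    winner = int(np.argmax([acc_per_engine[1], acc_per_engine[0], acc_per_engine[2]]))
    voted_index = [1, 0, 2][winner]
    return chars[voted_index], acc_per_engine[voted_index]

# two weak engines agreeing on 'a' outvote one confident engine saying 'o'
assert vote_char_sketch(['a', 'a', 'o'], [40, 45, 80]) == ('a', 85)
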
class AdditionalInfoHandler(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_ADDITIONAL_INFO_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)
        self.cpr.print("init additional info handler")

    def write_excel_to_json(self,
                            fileinfo,
                            filepath,
                            filetype,
                            idxcol=None,
                            parse_cols=None,
                            page=0):
        """"
        At the moment a little helper script for the Aktienführer-Project.
        Be free to modify as you wish.
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols],
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)
        if len(file) != 1: return None
        if filetype in ["xlsx", "xls"]:
            df = pd.read_excel(file[0]).set_index("ProfileID")
            jsondata = {fileinfo.dbname: {"Year": fileinfo.dbname}}
            jsondf = df.to_dict(orient="index")
            jsondata.update(jsondf)
            with open(file[0].replace("xlsx", "json"), "w") as output:
                json.dump(jsondata, output, indent=4)
        return None

    def fetch_additional_information_simple(self, file):
        """
        Same as fetch_additional_information, but the config-related info is already filled into the
        parameters
        :return: additional info
        """
        if self.config.ADDITIONAL_INFORMATION:
            additional_info = self.fetch_additional_information(
                file,
                self.config.INPUT_ADDINFOPATH,
                idxcol=self.config.IDXCOL,
                parse_cols=self.config.PARSE_COLS,
                filetype=self.config.INPUT_ADDINFOFILETPYE)
            return additional_info

        return None

    def fetch_additional_information(self,
                                     fileinfo,
                                     filepath,
                                     filetype,
                                     idxcol=None,
                                     parse_cols=None,
                                     page=0):
        """
        Reads an additional file with information.
        It searches the given path for a file whose name matches the fileinfo dbname
        :param fileinfo: file info object providing dbname and tablename
        :param filepath: base path which is searched recursively
        :param filetype: file extension ("xlsx", "xls" or "json")
        :param idxcol: name of the index column
        :param parse_cols: columns to parse
        :param page: page number
        :return: additional info
        """
        #if isinstance(parse_cols, list): parse_cols = [parse_cols]
        additional_filepath = path.normpath(
            f"{filepath}/**/*{fileinfo.dbname}.{filetype}")
        file = glob.glob(additional_filepath, recursive=True)

        len_files = len(file)
        if len_files > 1:
            self.cpr.printex(
                "More than one additional information file was found!")
            return None
        if len_files == 0:
            self.cpr.printex("No additional information file was found!")
            return None

        file = file[0]
        current_db_and_table = {
            "db": fileinfo.dbname,
            "table": fileinfo.tablename
        }
        if filetype in ["xlsx", "xls"]:
            infos = {}
            info_df = pd.read_excel(file)  #.set_index("ProfileID")
            parse_cols.remove(idxcol)
            for db_and_table_id, current_db_and_tablename in current_db_and_table.items():
                infos[db_and_table_id] = {}
                selected_rows = info_df.loc[info_df[idxcol] == current_db_and_tablename][parse_cols]
                for line, rubric_content in selected_rows.to_dict(orient="index").items():
                    for rubric, content in rubric_content.items():
                        if rubric != idxcol:
                            if infos[db_and_table_id].get(rubric,
                                                          None) is None:
                                infos[db_and_table_id][rubric] = content
                            elif infos[db_and_table_id].get(rubric,
                                                            None) != content:
                                if not isinstance(
                                        infos[db_and_table_id][rubric], list):
                                    infos[db_and_table_id][rubric] = [
                                        infos[db_and_table_id][rubric]
                                    ]
                                infos[db_and_table_id][rubric].append(content)
        elif filetype == "json":
            with open(file, "r") as add_info_file:
                infos = json.load(add_info_file)

            for possible_db_or_tablenames in reversed(list(infos.keys())):
                possible_db_or_tablenames_orig = possible_db_or_tablenames  # unchanged name

                if self.config.ADD_INFO_SIMPLIFIED_NAME_COMPARISON:
                    psplit = possible_db_or_tablenames.split("-")
                    possible_db_or_tablenames = psplit[0]

                if possible_db_or_tablenames not in current_db_and_table[
                        'table']:
                    del infos[possible_db_or_tablenames_orig]
                else:
                    for db_and_table_id, current_db_and_tablename in current_db_and_table.items(
                    ):
                        if possible_db_or_tablenames == current_db_and_tablename:
                            infos[db_and_table_id] = infos[
                                possible_db_or_tablenames_orig]
                            del infos[possible_db_or_tablenames_orig]
        else:
            return None
        return infos
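
# A small sketch (not from the original sources) of how the additional-info
# lookup above resolves its file: a recursive glob for "*<dbname>.<filetype>"
# below the base path, accepted only if exactly one file matches. The path and
# dbname in the usage comment are made up.
import glob
from os import path

def find_additional_info_file(filepath, dbname, filetype):
    pattern = path.normpath(f"{filepath}/**/*{dbname}.{filetype}")
    matches = glob.glob(pattern, recursive=True)
    if len(matches) != 1:  # none found, or ambiguous: treated as "no info"
        return None
    return matches[0]

# e.g. find_additional_info_file("./additional_info", "akf1956", "json")
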
class EndobjectFactory(object):
    """
    Creates an object with the following structure and provides exporting methods:

    segment_tag_1: [                ---> this level is created by set_current_main_list
        {
            type: "Sitz"            ---> add entries on this level with add_to_my_obj object_number=0
            city: "Neustadt"
        },
        {
            type: "Sitz"            ---> add entries on this level with add_to_my_obj object_number=1
            city: "Neustadt"
        }

    ],
    segment_tag_2: [
        {
            ...
        }
        ...
    ]
    """
    def __init__(self):
        self.my_object = {}
        self.current_main_list = None
        self.pp = pprint.PrettyPrinter(indent=5)

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_OUTPUT_ANALYSIS, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            self.known_uc = KnownUncategories()

    def set_current_main_list(self, segment_tag):
        if segment_tag not in self.my_object.keys():
            self.my_object[segment_tag] = []              # create the main list (all subsequent entries are stored here)

        self.current_main_list = self.my_object[segment_tag]  # create a short link on the main list

    def add_to_my_obj(self, key, value, object_number=0, only_filled=False):

        if only_filled is True and (value is None or value == "" or value == [] or value == {}):
            return False

        # pad the main list with empty objects so that object_number is a valid index
        len_list = len(self.current_main_list)
        if len_list < object_number + 1:
            for index in range(len_list, object_number + 1):
                self.current_main_list.append({})

        self.cpr.print("Adding value to list - ObjectNr.:", object_number, "Key:", key, "Value:", value)
        # add or insert to the main_list
        self.current_main_list[object_number][key] = value
        return True

    def print_me_and_return(self):
        print("my_object is:")
        self.pp.pprint(self.my_object)
        return self.my_object

    def print_current_main(self):
        print("current_main:")
        self.pp.pprint(self.current_main_list)

    def export_as_json(self):
        my_obj_json = json.dumps(self.my_object, indent=5, ensure_ascii=False)
        return my_obj_json

    def export_as_json_at_key(self, key, remove_first_object=False):

        if key not in self.my_object.keys():
            return None

        my_obj = self.my_object[key]
        if remove_first_object:
            if len(my_obj) >= 1:
                my_obj = my_obj[1:]  # remove the first object, which usually contains generic info

        my_obj_json = json.dumps(my_obj, indent=5, ensure_ascii=False)
        return my_obj_json

    @staticmethod
    def fetch_subentries_recursive_check(entry):
        """
        Fetches all subentries (values) from an entry and writes them to a list of texts
        This gets called recursively within the function until all subentries
        are found
        :param entry: entry to fetch the subentries from
        :return: list of subentries
        """
        final_texts = []

        for item in entry:
            if isinstance(entry, list):
                value = item
            else:
                # item is a key
                value = entry[item]
            if isinstance(value, str):
                final_texts.append(value)
            elif isinstance(value, int):
                final_texts.append(str(value))
            elif isinstance(value, object):
                obj_size = len(value)  # containers (lists, dicts) are traversed recursively
                if obj_size > 0:
                    recursive_texts = EndobjectFactory.fetch_subentries_recursive_check(value)
                    final_texts.extend(recursive_texts)

        return final_texts

    @staticmethod
    def fetch_keys_recusive_check(entry, final_keys, create_multiple=True):
        """
        Fetches all keys in an object and its sub-objects;
        calls itself recursively until all keys are found and
        writes the final keys to the final_keys array, which is returned
        :param entry: object to fetch the sub-keys from
        :param final_keys: list of final keys (initial state)
        :param create_multiple: if the same key occurs multiple times it still gets added
        :return: final_keys with added keys from object
        """

        if isinstance(entry, list):
            for item in entry:
                final_keys = EndobjectFactory.fetch_keys_recusive_check(item, final_keys, create_multiple)
            return final_keys
        elif not isinstance(entry, dict):
            # just return if there are no keys (cause no dictionary)
            return final_keys

        for key in entry:
            value = entry[key]
            if create_multiple or key not in final_keys:
                if isinstance(key, int):
                    continue
                final_keys.append(key)
            final_keys = EndobjectFactory.fetch_keys_recusive_check(value, final_keys)
        return final_keys

    def diff_seg_to_orig_at_key(self, key):
        """
        def fetch_subentries_recursive(entry):
            final_texts = []

            for item in entry:
                if isinstance(entry, list):
                    value = item
                else:
                    # item is a key
                    value = entry[item]
                if isinstance(value, str):
                    final_texts.append(value)
                elif isinstance(value, int):
                    final_texts.append(str(value))
                elif isinstance(value, object):
                    obj_size = len(value)
                    if obj_size > 0:
                        recursive_texts = fetch_subentries_recursive(value)
                        final_texts.extend(recursive_texts)

            return final_texts
        """
        if key not in self.my_object.keys():
            return None

        my_data = self.my_object[key]

        # check if the origpost property can exist; warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, but original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to diff, returning")
            return

        return # todo this seems to be wrong
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        all_final_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = fetch_subentries_recursive(entry)
            all_final_entries.extend(final_entries)

        # order diff data by length, longest first
        all_final_entries.sort(key=lambda x: len(x))
        all_final_entries.reverse()

        # subtract
        for text in all_final_entries:
            rest_text = rest_text.replace(text, "")

            rest_text = rest_text.strip()

        return rest_text, original_text

    def diff_parsed_to_orig_at_key(self, key):
        """
        def fetch_subentries_recursive(entry):
            final_texts = []

            for item in entry:
                if isinstance(entry, list):
                    value = item
                else:
                    # item is a key
                    value = entry[item]
                if isinstance(value, str):
                    final_texts.append(value)
                elif isinstance(value, int):
                    final_texts.append(str(value))
                elif isinstance(value, object):
                    obj_size = len(value)
                    if obj_size > 0:
                        recursive_texts = fetch_subentries_recursive(value)
                        final_texts.extend(recursive_texts)

            return final_texts

        def fetch_keys_recusive(entry, final_keys, create_multiple=True):
            # just return if there are no keys (cause no dictionary)
            if not isinstance(entry, dict):
                return final_keys

            for key in entry:
                value = entry[key]
                if create_multiple or key not in final_keys:
                    if isinstance(key, int):
                        continue
                    final_keys.append(key)
                final_keys = fetch_keys_recusive(value, final_keys)
            return final_keys
        """
        if key not in self.my_object.keys():
            return None

        #if key == "KursVonZuteilungsrechten":
        #   print("todo remove debug")

        my_data = self.my_object[key]

        # check if the origpost property can exist; warn if not
        if not self.config.ADD_INFO_ENTRY_TO_OUTPUT:
            self.cpr.printw("trying to fetch original data, but original data is not added to results")
            self.cpr.printw("toggle ADD_INFO_ENTRY_TO_OUTPUT in config to True")
        if len(my_data) <= 0:
            self.cpr.printw("no data to diff, returning")
            return
        # copy orig string
        original_text = my_data[0]['origpost']
        rest_text = original_text

        # fetch parsed entries for diff
        pool_entries = []  # array of final entries
        for index in range(1, len(my_data)):
            entry = my_data[index]
            final_entries = EndobjectFactory.fetch_subentries_recursive_check(entry)
            pool_entries.extend(final_entries)

        if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
            # removes all spaces from rest and comparison values because spaces are often
            # a problem in subtracting the rests
            rest_text = rest_text.replace(" ", "")
            for index in range(0,len(pool_entries)):
                pool_entries[index] = pool_entries[index].replace(" ", "")

        all_final_entries = []

        # add the entries to the complete subtraction and tag them with '1'
        for pentry in pool_entries:
            all_final_entries.append((pentry, 1))

        # if keys shall be subtracted as well, add them too
        if self.config.REMOVE_TAGS_IN_ORIG_DIFF:
            pool_keys = []  # may contain the same key multiple times for later one-by-one subtraction
            for index in range(1, len(my_data)):
                pool_keys = EndobjectFactory.fetch_keys_recusive_check(my_data[index], pool_keys, create_multiple=True)

            # also remove spaces in keys
            if self.config.REMOVE_SPACES_IN_ORIGIN_DIFF is True:
                for index in range(0, len(pool_keys)):
                    pool_keys[index] = pool_keys[index].replace(" ", "")

            final_keys = []
            for pkey in pool_keys:
                final_keys.append((pkey, 2))

            all_final_entries.extend(final_keys)

        # order diff data by length, longest first
        all_final_entries.sort(key=lambda x: len(x[0]))
        all_final_entries.reverse()

        # subtract
        for entry in all_final_entries:
            text = entry[0]
            text_or_key = entry[1]
            if text_or_key == 2:
                if text in self.known_uc.unkeys:
                    continue
            text_stripped = text.strip()  # strip spaces so the texts match better
            rest_text = rest_text.replace(text_stripped, "", 1)
            rest_text = rest_text.strip()

        return rest_text, original_text
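
# A standalone re-implementation (sketch, not the original) of the core of
# EndobjectFactory.add_to_my_obj above: the main list is padded with empty
# dicts so that object_number is always a valid index before assignment.
def add_to_list_sketch(main_list, key, value, object_number=0, only_filled=False):
    if only_filled and value in (None, "", [], {}):
        return False
    while len(main_list) < object_number + 1:  # pad up to the requested index
        main_list.append({})
    main_list[object_number][key] = value
    return True

segment = []
add_to_list_sketch(segment, "type", "Sitz", object_number=0)
add_to_list_sketch(segment, "city", "Neustadt", object_number=1)
assert segment == [{"type": "Sitz"}, {"city": "Neustadt"}]
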
class AkfParsingFunctionsJK(object):
    def __init__(self,
                 endobject_factory,
                 output_analyzer,
                 dictionary_handler,
                 ocromore_data=None):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(
            self.config.PRINT_SEGMENT_PARSER_AKF_FN_THREE,
            self.config.PRINT_EXCEPTION_LEVEL,
            self.config.PRINT_WARNING_LEVEL,
            leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions three")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.ocromore_data = ocromore_data
        self.dictionary_handler = dictionary_handler

    def parse_bilanzen(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            geschaeftslage = origpost_red.replace("- ", "")  # join hyphenated line breaks

            #parsing
            self.ef.add_to_my_obj("balances",
                                  geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True
        #parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines,
                                feature_lines,
                                template="datatable_balance")
        table.extract_content(content_lines,
                              feature_lines,
                              template="datatable_balance")

        # Write information for income table parsing
        segmentation_class.info_handler["income"] = {}
        segmentation_class.info_handler["income"]["amount"] = table.info.amount
        segmentation_class.info_handler["income"]["col"] = table.info.col
        segmentation_class.info_handler["income"][
            "separator"] = table.info.separator

        # Parsing the tables based on whitespace and the number of numbers in each group
        # This should be the last resort for parsing (error-prone)
        self.ef.add_to_my_obj("balances",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_gewinn_und_verlust(self, real_start_tag, content_texts,
                                 content_lines, feature_lines,
                                 segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        if self.config.LOG_SIMPLE:
            geschaeftslage = origpost_red.replace("- ", "")  # join hyphenated line breaks

            #parsing
            self.ef.add_to_my_obj("income",
                                  geschaeftslage,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Datatable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines,
                                feature_lines,
                                template="datatable_income")
        if segmentation_class.info_handler and "income" in set(
                segmentation_class.info_handler.keys()):
            table.info.col = segmentation_class.info_handler["income"]["col"]
            table.info.amount = segmentation_class.info_handler["income"][
                "amount"]
            table.info.separator = segmentation_class.info_handler["income"][
                "separator"]

        table.extract_content(content_lines,
                              feature_lines,
                              template="datatable_income")

        #parsing
        self.ef.add_to_my_obj("income",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_aktienkurse(self, real_start_tag, content_texts, content_lines,
                          feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        #self.config.LOG_SIMPLE = False
        if self.config.LOG_SIMPLE:
            #    self.config.LOG_SIMPLE = False
            skip = origpost_red.replace("- ", "")

            # parsing
            self.ef.add_to_my_obj("shares",
                                  skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Sharetable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        #from timeit import timeit
        #print(timeit(test))
        # parsing
        self.ef.add_to_my_obj("shares",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)

    def parse_dividend(self, real_start_tag, content_texts, content_lines,
                       feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(
            segmentation_class.segment_tag, content_texts, real_start_tag)

        # init
        only_add_if_string = True
        # self.config.LOG_SIMPLE = True
        if self.config.LOG_SIMPLE:
            #    self.config.LOG_SIMPLE = False
            skip = origpost_red.replace("- ", "")

            # parsing
            self.ef.add_to_my_obj("dividende",
                                  skip,
                                  object_number=element_counter,
                                  only_filled=only_add_if_string)
            return True

        # parsing
        table = Dividendtable(snippet=segmentation_class.snippet)
        table.analyse_structure(content_lines, feature_lines)
        table.extract_content(content_lines, feature_lines)
        # from timeit import timeit
        # print(timeit(test))
        # parsing
        self.ef.add_to_my_obj("dividende",
                              table.content,
                              object_number=element_counter,
                              only_filled=only_add_if_string)
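
# A plain-dict sketch (not from the original sources) of the layout hand-off
# above: parse_bilanzen stores the detected balance-table layout under
# info_handler["income"], and parse_gewinn_und_verlust later reuses it so the
# income table is parsed with the same columns. TableInfoSketch and all values
# are made up for illustration.
class TableInfoSketch:
    def __init__(self, amount=None, col=None, separator=None):
        self.amount, self.col, self.separator = amount, col, separator

info_handler = {}

# after analysing the balance table:
balance_info = TableInfoSketch(amount=2, col=[10, 24, 38], separator=" ")
info_handler["income"] = {"amount": balance_info.amount,
                          "col": balance_info.col,
                          "separator": balance_info.separator}

# later, before extracting the income table:
income_info = TableInfoSketch()
if info_handler and "income" in info_handler:
    income_info.col = info_handler["income"]["col"]
    income_info.amount = info_handler["income"]["amount"]
    income_info.separator = info_handler["income"]["separator"]

assert income_info.col == [10, 24, 38]
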
class AkfParsingFunctionsTwo(object):

    def __init__(self, endobject_factory, output_analyzer, dictionary_handler):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_PARSER_AKF_FN_TWO, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)

        self.cpr.print("init akf parsing functions two")

        self.ef = endobject_factory
        self.output_analyzer = output_analyzer
        self.dictionary_handler = dictionary_handler

    def parse_zahlstellen(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        split_post = origpost_red.split(';')
        DEFAULT_ENTRY = 1
        ADDITIONAL_INFO_BOTH = 2      # "beide" - applies to the two previous entries
        ADDITIONAL_INFO_ALL_PREV = 3  # "sämtl." - applies to all previous entries

        final_entries = []
        for index, entry in enumerate(split_post):
            entry_stripped = entry.strip()

            if "beide" in entry_stripped:
                entry_final = regex.sub(r"beide\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "" : continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_BOTH, entry_fs, "", "", ""))
                continue
            if regex.search("sämtl\s?\.?", entry_stripped):
                entry_final = regex.sub(r"sämtl\s?\.?", "##", entry_stripped).strip()
                entry_final_split = entry_final.split('##')
                for index_fs, entry_fs in enumerate(entry_final_split):
                    if entry_fs.strip() == "": continue
                    if index_fs < len(entry_final_split)-1:
                        final_entries.append((DEFAULT_ENTRY, entry_fs, "", "", ""))
                    else:
                        final_entries.append((ADDITIONAL_INFO_ALL_PREV, entry_fs, "", "", ""))
                continue

            entry_split = entry_stripped.split(',')
            bank = ""
            city = ""
            title = ""
            rest_info = []
            for fragment_index, fragment in enumerate(entry_split):
                if fragment_index == 0:
                    bank = fragment
                elif fragment_index == 1:
                    city = fragment
                elif fragment_index >= 2:
                    rest_info.append(fragment)
            if bank != "" or city != "" or title != "":
                final_entries.append((DEFAULT_ENTRY, bank, city, title, rest_info))

        # reverse list for better processing
        reverse_fe = reversed(final_entries)
        current_additional_info = ""
        current_info_index = None
        current_entry_type = None
        final_list = []
        for item_index, item in enumerate(reverse_fe):
            entry_type, entryorbank, city, title, rest_info = item
            # change current additional info
            if entry_type == ADDITIONAL_INFO_BOTH or entry_type == ADDITIONAL_INFO_ALL_PREV:
                current_info_index = item_index
                current_additional_info = entryorbank
                current_entry_type = entry_type
            elif entry_type == DEFAULT_ENTRY:
                templist = [(entryorbank, city, title, current_additional_info, rest_info)]
                templist.extend(final_list)
                final_list = templist

            # end a 'beide'-entry since it only covers the two previous entries
            if current_entry_type == ADDITIONAL_INFO_BOTH and item_index - current_info_index >= 1:
                current_info_index = None
                current_additional_info = ""
                current_entry_type = None

        # finally note the entries to output
        only_add_if_value = True
        for entry in final_list:
            bank, city, title, add_info, rest_info = entry
            if add_info.strip() != "":
                rest_info_new = [add_info]
                rest_info_new.extend(rest_info)
            else:
                rest_info_new = rest_info

            #if add_info != "" and add_info != None and city =="":
            #    city += add_info
            self.ef.add_to_my_obj("bank", bank, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("city", city, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("title", title, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("additional_info", add_info, object_number=element_counter, only_filled=only_add_if_value)
            #self.ef.add_to_my_obj("rest_info", rest_info, object_number=element_counter, only_filled=only_add_if_value)
            self.ef.add_to_my_obj("rest_info", rest_info_new, object_number=element_counter, only_filled=only_add_if_value)

            element_counter += 1

        return True
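
    # Illustration (made-up input): "Bank A, Berlin; Bank B, Hamburg; beide auch Frankfurt"
    # splits into two DEFAULT_ENTRY items and one ADDITIONAL_INFO_BOTH item
    # ("auch Frankfurt"); the reversed pass then attaches that additional info
    # to the two preceding bank entries via their rest_info.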

    def parse_grundkapital(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # todo validate other currencies than 'DM'
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        only_add_if_value = True

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)



        # try to normalize ';' to ':' after the prefix 'apital' (e.g. "Grundkapital;")
        content_texts = [content_text.replace("apital;", "apital:") for content_text in content_texts]

        gk = cf.parse_general_and_keys(content_texts,
                                       join_separated_lines=True,
                                       current_key_initial_value='start_value',
                                       abc_sections=True)
        #print(gk)
        # check the start value for 'normal' grundkapital content
        # if found, parse it
        start_value = gk.get('start_value', "")
        if len(gk.keys()) == 1:
            start_value = gk[list(gk.keys())[0]]
        if len(start_value) >= 1:
            #print("could be grundkapital")
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(start_value[0], False, element_counter, only_add_if_value, [])
            currency = my_return_object.get('currency',"").strip()
            amount = my_return_object.get('amount',"").strip()
            if amount != "" and currency != "":
                self.ef.add_to_my_obj('Grundkapital', my_return_object, object_number=element_counter, only_filled=only_add_if_value)
            else:
                gk['additional_info'] = []
                gk['additional_info'].append(start_value[0].replace("↑", ":"))


        if len(start_value) >= 2:  # start_value also holds additional values which do not belong to it
            if 'additional_info' not in gk.keys():
                gk['additional_info'] = []

            for index in range(1, len(start_value)):
                val = start_value[index]
                gk['additional_info'].append(val.replace("↑", ":"))

        """
        if 'additional_info' in gk.keys():
            gk_ai = cf.parse_general_and_keys(gk['additional_info'],
                                           join_separated_lines=True,
                                           current_key_initial_value='start_value_addinfo',
                                           abc_sections=True)

            print("lemme check")
        """


        for key in gk:
            if key is "start_value":
                continue
            entry = gk[key]
            # individual parsing here
            match_year = regex.search("\d\d\d\d", key) # key is year
            year = None
            key_rest = ""
            if match_year:
                year = match_year.group()
                key_rest = key.replace(year, "").strip()

            accumulated_text = []
            if key_rest != "":
                accumulated_text.append(key_rest)

            for inner_entry in entry:
                accumulated_text.append(inner_entry)

            final_entry = None
            if year is None:
                final_entry = accumulated_text
            else:
                final_entry = {
                    "year": year,
                    "text": accumulated_text
                }

            if final_entry is not None and final_entry != "":
                self.ef.add_to_my_obj(key, final_entry, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1

        # check all year lines and parse them
        return



        # old parsing style
        final_entries = []
        current_ref_index = -1
        found_main_amount = False
        additional_info = []
        only_add_if_value = True
        for text_index, text in enumerate(content_texts):
            text_stripped = text.strip()
            if text_stripped == "":
                continue

            # todo increment element ctr ?
            my_return_object, found_main_amount, element_counter, only_add_if_value, additional_info = \
                cf.parse_grundkapital_line(text_stripped, found_main_amount, element_counter, only_add_if_value, additional_info)

            for key in my_return_object:
                value = my_return_object[key]
                self.ef.add_to_my_obj(key, value, object_number=element_counter, only_filled=only_add_if_value)


        if len(additional_info) >= 1:
            add_lines_parsed = cf.parse_grundkapital_additional_lines(additional_info, element_counter, True, 0)
            self.ef.add_to_my_obj("additional_info", add_lines_parsed, object_number=element_counter,
                                  only_filled=only_add_if_value)

        return True

    def parse_ordnungsnrdaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        only_add_if_value = True
        # example values - each line of content_texts list
        # '589300 (St.-Akt.)'
        # '589300.'
        first_number_match = True
        for entry in content_texts:
            entry_stripped = entry.strip()
            rest = entry_stripped
            if entry_stripped == "":
                continue

            match_number = regex.search(r"^([\d\s]*)", entry_stripped)
            match_parenth = regex.search(r"\(.*\)", entry_stripped) # take content in parenthesis

            if match_number is not None and match_number.group(0).strip() != "":

                if not first_number_match:
                    element_counter += 1        # each subsequent number match starts the next element
                number = match_number.group(0).strip()

                self.ef.add_to_my_obj("ord_number", number, object_number=element_counter, only_filled=only_add_if_value)
                rest = rest.replace(number, "", 1)
                first_number_match = False
            if match_parenth is not None:
                parenth = match_parenth.group(0)
                self.ef.add_to_my_obj("category", parenth, object_number=element_counter, only_filled=only_add_if_value)
                rest = rest.replace(parenth, "", 1)

            rest_stripped = rest.strip()
            if rest_stripped != "":
                self.ef.add_to_my_obj("additional_info", rest_stripped, object_number=element_counter, only_filled=only_add_if_value)

    def parse_grossaktionaer(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        lines_split = origpost_red.split(';')
        only_add_if_value = True
        for line in lines_split:
            # testline
            # line = "Société Sidérurgique de Participations et d’ Approvisionnement en Charbons, par abréviation (Sidechar), Paris (ca.60,2 %)."
            findings = regex.finditer(r"\([a-zü0-9\s\,\.]*%\).?",line)
            lof = list(findings)
            #findings = regex.search(r"(?m)a", line)
            if lof:
                findings = []
                for finding in lof:
                    findings.append(finding.regs[0])
            else:
                findings = [(len(line),len(line))]
            start = 0
            for idx, finding in enumerate(findings):
                #shareholder,location, share
                item = line[start:finding[0]]
                if ":" in item:
                    self.ef.add_to_my_obj("additional_information", item[:item.index(":")],
                                          object_number=element_counter, only_filled=only_add_if_value)
                    if line.index(":")+2 > finding[0]:
                        continue
                    else:
                        item = item[item.index(":"):]
                item = item.rsplit(",",1)
                self.ef.add_to_my_obj("shareholder", item[0].strip(),
                                      object_number=element_counter, only_filled=only_add_if_value)
                if len(item) > 1 and item[1] != "":
                    if item[1][-1] == ".":
                        item[1] = item[1][:len(item[1])-1]
                    if "(" in item[1] and ")" in item[1]:
                        find = regex.search(r"(\([0-9\s\,]*|maßgeblich|Mehrheit|Majorität)\)", item[1])
                        if find:
                            self.ef.add_to_my_obj("share",
                                              item[1][find.regs[0][0]:find.regs[0][1]-1].strip(), object_number=element_counter,
                                              only_filled=only_add_if_value)
                            item[1] = item[1][:find.regs[0][0]-1]
                    self.ef.add_to_my_obj("location", item[1].strip(),
                                      object_number=element_counter, only_filled=only_add_if_value)
                if finding[0] != len(line):
                    self.ef.add_to_my_obj("share", line[finding[0]:finding[1]].replace(", ",",").replace("(","").replace(").","").replace(")","").strip(), object_number=element_counter,only_filled=only_add_if_value)

                start = finding[1]
                element_counter += 1
            #print(self.ef.my_object["Großaktionär"])
            """
            # find parenthesis with 2 or more characters inside
            #for item in line.split("%)"):
            match_parenth = regex.findall(r"(\(.{2,}\))", line)
            found_parenth = None
            parenth_is_used = False
            organization = None
            location = None
            # find additional info in  each line and subtract it
            if match_parenth:
                found_parenth = match_parenth[-1].strip("., ") # find the last parenthesis grounp
                # if the parenthesis are at the end of line
                if line.strip()[-1] == ")" and not(len(found_parenth.replace(" ", "")) <= 5 and "%" in found_parenth): # exclude percentages from parenthesis matches
                    line = line.replace(found_parenth, "", 1)
                    parenth_is_used = True

            split_line = line.split(',')
            len_split_line = len(split_line)
            if len_split_line == 1:
                organization = line.strip("., ")
            else:
                organization = line.replace(split_line[-1], "", 1).strip("., ")
                location = split_line[-1].strip("., ")  # town
            self.ef.add_to_my_obj("organization", organization, object_number=element_counter,only_filled=only_add_if_value)
            self.ef.add_to_my_obj("location", location, object_number=element_counter,only_filled=only_add_if_value)
            if parenth_is_used:
                self.ef.add_to_my_obj("additional_info", found_parenth, object_number=element_counter,only_filled=only_add_if_value)
            element_counter += 1
        """
        return True


    def parse_geschaeftsjahr(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        only_add_if_value = True
        final_jahr = []
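        # illustrative example (hypothetical OCR text): "1.10. bis 30.9."
        # yields gesch_jahr_start="1.10." and gesch_jahr_stop="30.9";
        # texts without "bis" are collected in the 'year' list below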

        for text in content_texts:
            text_stripped = text.strip("., ")
            if text_stripped != "":
                if "bis" in text_stripped:
                    split_text = text_stripped.split('bis ')
                    # alternative: regex.split(r'\.bis|\sbis\s', text_stripped)
                    if len(split_text) == 1:
                        final_jahr.append(split_text[0].strip())
                        continue
                    gesch_jahr_start = split_text[0].strip("( ")
                    gesch_jahr_stop = split_text[1].strip(" )")
                    self.ef.add_to_my_obj('gesch_jahr_start', gesch_jahr_start, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    self.ef.add_to_my_obj('gesch_jahr_stop', gesch_jahr_stop, object_number=element_counter,
                                          only_filled=only_add_if_value)

                    # any chunks beyond start and stop are treated as further years
                    if len(split_text) >= 3:
                        for rest in split_text[2:]:
                            if rest.strip() != "":
                                final_jahr.append(rest)
                else:
                    final_jahr.append(text_stripped)

        self.ef.add_to_my_obj('year', final_jahr, object_number=element_counter,only_filled=only_add_if_value)
        return True

    def parse_stimmrechtaktien(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in  each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # find the last parenthesis group
            origpost_used = origpost_red.replace(found_parenth, "")  # update the origpost used

        final_lines = []
        only_add_if_value = True
        skip = False
        final_text = ""
        for text_index, text in enumerate(content_texts):
            if text == "":
                continue
            text = text.replace("DM =", "DM 1 =").replace("DM=", "DM 1 =").replace("eine DM", "DM 1.-")
            if element_counter == 0 and "je nom" not in text.lower():
                self.ef.add_to_my_obj("additional_info", "".join(content_texts[text_index:]),
                                      object_number=element_counter,
                                      only_filled=only_add_if_value)
                break
            if skip:
                skip = False
                continue
            parse_aktie = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?[Aa]ktie[n]?)[^\d]*(?P<vote>[\d\s]*?)\s*?(?P<voteend>Stimme[n]*)")
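            # findall returns (nominal, nomvalue, vote, voteend) tuples; hypothetical
            # sample: "Je 100 Aktien 1 Stimme" -> nomvalue="100", vote="1"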
            finding = parse_aktie.findall(text.replace("Stamm",""))
            if finding != []:
                finding = list(finding[0])
                if finding[1] == "":
                    finding[1] = "1"
                stck = {"kind": "Aktie",
                        "amount": finding[1],
                        "vote": finding[2].replace(" ", "").strip(),
                        "value": "",
                        "currency": "",
                        "rank": element_counter}
                self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                      only_filled=only_add_if_value)
                element_counter += 1
                continue
            #text = 'Je nom. DM 50.- =1 Stimme.'
            parse_stimmrecht = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?(?P<waste>[^\dA-Za-z]*)\s{0,}(?P<kind>[A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}(?P<voteend>Stimme[n]*)?")
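            # findall returns tuples in group order:
            # (nominal, nomvalue, currency, value, waste, kind, vote, voteend)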
            finding = parse_stimmrecht.findall(text.replace("DM", " DM").replace("RM"," RM"))
            # Special case "bzw."
            if finding and "bzw." in text:
                if "Stimm" not in text:
                    skip = True
                    text += content_texts[text_index+1]
                parse_bzw = regex.compile(r"(?P<nominal>[Jj]e[de]*?\s?(?P<nomvalue>[\d\s]*?)\s?nom\.)\s*?(?P<currency>[^\d]*)\s?(?P<value>[\d\s]*)\s*?[^\d]*\s*?(?P<value2>[\d\s]*)[^\dA-Za-z]*(?P<kind>[A-Za-z][A-Za-z.,\-\s]*)?[^\d\s]*\s{0,}(?P<vote>[\d]*)?\s{0,}[^\d]*\s{0,}(?P<vote2>[\d]*)\s{0,}(?P<voteend>Stimme[n]*)?")
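                # group order here: (nominal, nomvalue, currency, value, value2,
                # kind, vote, vote2, voteend); one entry is emitted per value/vote pair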
                finding = parse_bzw.findall(text)
                finding = finding[0]
                if finding:
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[6].replace(" ", "").strip(),
                            "value": finding[3].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    stck = {"kind": finding[5].strip(),
                            "amount": "1",
                            "vote": finding[7].replace(" ", "").strip(),
                            "value": finding[4].strip(),
                            "currency": finding[2].strip(),
                            "rank": element_counter}
                    self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
            if not finding or finding[0][0] + finding[0][1] == "":
                final_text += text
                continue
            if final_text != "":
                self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter-1,
                                      only_filled=only_add_if_value)
                final_text = ""
            finding_next = None
            if finding[0][6] + finding[0][7] == "":
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stimmrecht.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            finding = list(finding[0])
            if finding[5] == "":
                finding[5] = "nom."
            if finding[1] == "":
                finding[1] = "1"
            stck = {"kind": finding[5].strip(),
                    "amount": finding[1].strip(),
                    "vote": finding[6].replace(" ", "").strip(),
                    "value": finding[3].strip(),
                    "currency": finding[2].strip(),
                    "rank": element_counter}
            self.ef.add_to_my_obj(element_counter, stck, object_number=element_counter, only_filled=only_add_if_value)
            element_counter += 1
        # match_akt = regex.search(r"\.\s?\-\s?Akt", text)
        # if match_saemtlsakt is not None:
        #    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter, only_filled=only_add_if_value)
        #    element_counter += 1
        #    continue
        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text, object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True
        """
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # add extra splitting elements to each 'je' or 'Je'
        origpost_red_se = regex.sub(r"(Je |je )", r"~~~\1", origpost_red)

        split_text = origpost_red_se.split('~~~')
        # origpost_red = regex.sub(r"(\d\.)", r"\1~~~~", origpost_red)
        only_add_if_value = True

        for entry in split_text:
            if entry == "":
                continue
            match_sb = regex.search(r"Stimmrechtsbeschränkung:.*", entry)
            sbe = None
            if match_sb is not None:
                sbe = match_sb.group()
                sbe = sbe.replace("Stimmrechtsbeschränkung:", "", 1)
                entry = entry.replace(sbe, "").replace("Stimmrechtsbeschränkung:", "", 1)

            self.ef.add_to_my_obj("entry", entry, object_number=element_counter ,only_filled=only_add_if_value)
            self.ef.add_to_my_obj("Stimmrechtsbeschränkung", sbe, object_number=element_counter ,only_filled=only_add_if_value)
            element_counter += 1
        """

    def parse_boersennotiz(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)
        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        found_parenth = None
        origpost_used = origpost_red

        # log all location elements
        only_add_if_value = True
        split_post = regex.split(r'u\.|und|,', origpost_used)
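        # illustrative example (hypothetical text): "Berlin, Hamburg und München"
        # is split into the location entries "Berlin", "Hamburg", "München"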
        for entry in split_post:
            entry_stripped = entry.strip("., ")

            # find additional info in  each line and subtract it
            # find last parenthesis and filter
            #match_parenth = regex.findall(r"(\(.*?\))", entry_stripped)
            #combined_ps = []
            #for res in match_parenth:
                #combined_ps.append(res.strip())
                #origpost_used = origpost_red.replace(found_parenth, "")  # update the origpost used
                # log additional info in last parenthesis

            #self.ef.add_to_my_obj("additional_info", combined_ps, object_number=element_counter,
            #                          only_filled = only_add_if_value)

            #if entry_stripped is None or entry_stripped == "":
                #if match_parenth:
                #    element_counter += 1
            entry_stripped = entry.replace("im Freiverkehr", "").replace("(amtl.)", "").strip("., ")
            if entry_stripped is None or entry_stripped == "":
                continue
            self.ef.add_to_my_obj("location", entry_stripped, object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        return True


    def preprocess_stueckelung_texts(self, content_texts):
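        # merges continuation lines into their predecessor so amount and value end
        # up in one text; illustrative example (hypothetical OCR lines):
        # ["10 000 Stammaktien", "zu je DM 100.-"] becomes
        # ["10 000 Stammaktien  zu je DM 100.-"]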
        final_stueckelung_texts = []

        previous_text_stripped = ""
        for index, current_text in enumerate(content_texts):
            current_text_stripped = current_text.strip()
            if current_text_stripped == "":
                continue

            if current_text_stripped.startswith("zu je") or current_text_stripped.startswith("Zu je"):
                final_stueckelung_texts.append(previous_text_stripped + "  "+current_text_stripped)
                previous_text_stripped = ""
            elif "(" == current_text_stripped[0] and ")" == current_text_stripped[-1]:
                final_stueckelung_texts.append(previous_text_stripped + "  "+current_text_stripped)
                previous_text_stripped = ""
            else:
                final_stueckelung_texts.append(previous_text_stripped)
                previous_text_stripped = current_text_stripped
                if index == len(content_texts)-1:
                    final_stueckelung_texts.append(current_text_stripped)

        final_texts_filtered = []
        for text in final_stueckelung_texts:
            text_stripped = text.strip()
            if text_stripped != "":
                final_texts_filtered.append(text_stripped)

        return final_texts_filtered

    def parse_stueckelung(self, real_start_tag, content_texts, content_lines, feature_lines, segmentation_class):
        # get basic data
        element_counter = 0
        origpost, origpost_red, element_counter, content_texts = \
            cf.add_check_element(self, content_texts, real_start_tag, segmentation_class, element_counter)

        # logme
        self.output_analyzer.log_segment_information(segmentation_class.segment_tag, content_texts, real_start_tag)

        # find last parenthesis and filter
        match_parenth = regex.findall(r"(\(.*?\))", origpost_red)
        found_parenth = None
        origpost_used = origpost_red
        # find additional info in  each line and subtract it
        if match_parenth:
            found_parenth = match_parenth[-1].strip("., ")  # find the last parenthesis group
            origpost_used = origpost_red.replace(found_parenth, "") # update the origpost used

        final_lines = []
        additional_info_final = []
        only_add_if_value = True
        skip = False
        final_text = ""
        final_add_rest = ""
        content_texts = self.preprocess_stueckelung_texts(content_texts)
        for text_index, text in enumerate(content_texts):
            if text.strip() == "":
                continue
            if skip:
                skip = False
                continue
            parse_stck = regex.compile(r"(?P<amount>[\d\s\.]*)\s*(?P<kind>[^\d]*?)[\s]?(?P<nominal>zu je|zuje|zu|je)\s{0,}(?P<currency>[^\d\s]*)\s{0,}(?P<value>[\d\s]*)")
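            # group order: (amount, kind, nominal, currency, value); hypothetical
            # sample "10 000 Aktien zu je DM 100" -> amount="10 000", kind="Aktien",
            # currency="DM", value="100" (after the .strip() calls below)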
            finding = parse_stck.findall(text.replace(" Stücke ", " Aktien ").replace(" Stück ", " Aktie ").replace("DM", " DM").replace("RM", " RM").replace("hfl"," hfl"))

            rest_finding = ""
            if len(finding) >= 1:
                rest_finding = text # get the rest of finding
                subtract_sorted = sorted(finding[0],key=len)
                subtract_sorted.reverse()
                for find_chunk in subtract_sorted:
                    rest_finding = rest_finding.replace(find_chunk, "", 1).strip()
                rest_finding = regex.sub(r"\s{2,}", " ", rest_finding)  # collapse redundant spaces for better subtraction

            if not finding or finding[0][0]+finding[0][1] == "" or finding[0][0]+finding[0][4] == "":
                match_akt = regex.search(r"\.\s?\-\s?Akt", text)
                match_saemtlsakt, err_saemtlsakt = regu.fuzzy_search(
                    r"([Ss]ämtliche [Ss]tammaktien.*|[Ss]ämtliche [Aa]ktien.*|[Ss]ämtliche Namens\-Stammaktien.*)", text, err_number=1)
                if match_saemtlsakt is not None: #and match_akt is not None: @jk is this second condition really necessary ?
                    saemtl_res = match_saemtlsakt.group()
                    self.ef.add_to_my_obj("additional_info", saemtl_res, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    reduced_text = text.replace(saemtl_res, "")
                    final_lines.append(reduced_text)
                    rest_finding = rest_finding.replace(reduced_text,"")
                if "Börse" in text or "Besondere" in text:
                    addendum = "".join(content_texts[text_index:])
                    self.ef.add_to_my_obj("additional_info", addendum, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    element_counter += 1
                    rest_finding = rest_finding.replace("".join(content_texts[text_index:]), "")
                    break
                if "(" in text:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter-1,
                                          only_filled=only_add_if_value)
                    rest_finding = rest_finding.replace(text, "")

                else:
                    rest_finding = rest_finding.replace(text, "")
                    final_text += text
                continue
            finding_next = None
            if finding[0][2] == "" or (("zu" in finding[0][2] or "je" in finding[0][2]) and finding[0][3] == ""):
                #test =  '2 638 514 Inh. - bzw. Namensaktien zuje FF 75.-'
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding[0][3]+finding[0][4] == "":
                if text_index == len(content_texts) - 1:
                    self.ef.add_to_my_obj("additional_info", text, object_number=element_counter,
                                          only_filled=only_add_if_value)
                    continue
                else:
                    finding_next = parse_stck.findall(text + " " + content_texts[text_index + 1])
            if finding_next:
                skip = True
                finding = finding_next
            stck = {"amount": finding[0][0].replace("."," ").strip(),
                     "kind": finding[0][1].replace(" ","").strip(),
                     "nominal": "zu je",
                     "currency": finding[0][3],
                     "value": finding[0][4],
                     "rank": element_counter}
            self.ef.add_to_my_obj("entry", stck, object_number=element_counter, only_filled=only_add_if_value)
            if rest_finding != "":
                final_add_rest += rest_finding + " "
            element_counter += 1
            # match_akt = regex.search(r"\.\s?\-\s?Akt", text)
            # if match_saemtlsakt is not None:
            #     self.ef.add_to_my_obj("additional_info", text, object_number=element_counter, only_filled=only_add_if_value)
            #     element_counter += 1
            #     continue
        if final_text != "":
            self.ef.add_to_my_obj("additional_info", final_text.replace(final_add_rest.strip(".,- "),
                                                                        "", 1).strip(".,- "), object_number=element_counter,
                                  only_filled=only_add_if_value)
            element_counter += 1

        if final_add_rest != "":
            self.ef.add_to_my_obj("additional_info", final_add_rest.strip(".,- "), object_number=element_counter,
                                  only_filled=only_add_if_value)
        return True


class FeatureExtractor(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_FEATURE_EXTRACTOR,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.filter_start_words = [
            "Fernruf:", "Vorstand:", "Fernschreiber:", "von", "Gründung:",
            "Ordnungsnr.", "Ordnungsnr", "Grundkapital:", "Umstellung"
        ]

    def extract_file_features(self, ocromore_data):
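        # minimal usage sketch (assumes ocromore_data carries a 'lines' list whose
        # entries provide 'text', 'words' and 'line_index' as used below):
        #   data = FeatureExtractor().extract_file_features(data)
        #   first_line_features = data['line_features'][0]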
        all_line_features = []
        for line in ocromore_data['lines']:
            current_line_features = self.extract_line_features(line)
            all_line_features.append(current_line_features)

        ocromore_data['line_features'] = all_line_features

        return ocromore_data

    def extract_line_features(self, line):

        final_line_features = {}

        whole_text = line['text']

        self.cpr.print("recognizing text:", whole_text)

        # counters
        counter_special_chars = 0
        counter_alphanumerical_chars = 0
        counter_numbers = 0
        counter_chars = len(whole_text)
        counter_alphabetical = 0
        counter_words = 0
        counters_alphabetical_ratios = []
        counters_wordlengths = []
        counters_numbers = []

        character_index = 0
        # special conditions
        ultimo_is_first_word = False
        first_word_no_table_indicator = False
        starts_with_parenthesis = False
        ends_with_parenthesis = False

        last_xstop = 0
        x_box_sizes = []
        x_gaps = []
        for word_obj in line['words']:
            word_index = word_obj['word_index']
            word_text = word_obj['text']
            hocr_coordinates = word_obj['hocr_coordinates']

            word_xstart = hocr_coordinates[0]
            word_xstop = hocr_coordinates[2]
            word_box_size = word_xstop - word_xstart
            x_box_sizes.append(word_box_size)

            if word_index >= 1:
                x_gap = word_xstop - last_xstop
                x_gaps.append(x_gap)

            #line.data['word_x0']
            if word_text is None or word_text == "":
                continue

            if word_index == 0:
                if word_text in self.filter_start_words:
                    first_word_no_table_indicator = True
                if word_text.lower() == "ultimo":
                    ultimo_is_first_word = True
                if word_text[0] == "(":
                    starts_with_parenthesis = True

            if word_index == len(line['words']) - 1:  # last word of the line
                if word_text[-1] == ")":
                    ends_with_parenthesis = True

            counter_alphabetical_chars_word = 0
            counter_alphanumerical_chars_word = 0
            counter_numbers_word = 0

            counter_words += 1

            word_list = list(word_text)
            for char in word_list:
                if Random.is_special_character(char):
                    counter_special_chars += 1
                elif Random.is_alphanumerical_character(char):
                    counter_alphanumerical_chars += 1
                    counter_alphanumerical_chars_word += 1
                if char.isdigit():
                    counter_numbers += 1
                    counter_numbers_word += 1

            counter_alphabetical_word = counter_alphanumerical_chars_word - counter_numbers_word
            ratio_alphabetical_word = np.round(
                counter_alphabetical_word / len(word_text), 2)
            counters_alphabetical_ratios.append(ratio_alphabetical_word)
            counters_wordlengths.append(len(word_text))
            counters_numbers.append(counter_numbers_word)
            character_index += len(word_text)
            last_xstop = word_xstop

        # get number of spaces
        len_whole_unspace = len(whole_text.replace(" ", ""))
        counter_spaces = counter_chars - len_whole_unspace
        # set alphabetical counter
        counter_alphabetical = counter_alphanumerical_chars - counter_numbers

        if counter_chars == 0:
            self.cpr.printw("no chars in line:", str(line['line_index']),
                            "no features here")
            return False

        special_chars_ratio = counter_special_chars / counter_chars
        alphanumerical_chars_ratio = counter_alphanumerical_chars / counter_chars
        alphabetical_ratio = counter_alphabetical / counter_chars
        spaces_ratio = counter_spaces / counter_chars
        numbers_ratio = counter_numbers / counter_chars

        maximum_x_gap = None
        mean_x_gap = None
        median_x_gap = None

        if len(x_gaps) >= 1:
            maximum_x_gap = max(x_gaps)
            mean_x_gap = np.mean(x_gaps)
            median_x_gap = np.median(x_gaps)

        many_numbers_in_first_word = False
        many_alphabetical_in_middle_words = False
        many_alphabetical_in_last_word = False

        # check some middle and last word conditions
        for counter_index, counter in enumerate(counters_wordlengths):
            if counter_index == 0:
                ctr_numbers = counters_numbers[counter_index]
                numbers_ratio_word = np.round(ctr_numbers / counter, 2)
                if numbers_ratio_word > 0.8:
                    many_numbers_in_first_word = True
            elif counter_index == len(counters_wordlengths) - 1:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[
                        counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_last_word = True

            else:
                if counter >= 4:
                    alphabetical_ratio_word = counters_alphabetical_ratios[
                        counter_index]
                    if alphabetical_ratio_word >= 0.75:
                        many_alphabetical_in_middle_words = True

        final_line_features = LineFeatures(cpr=self.cpr)
        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word

        final_line_features.counter_special_chars = counter_special_chars
        final_line_features.counter_chars = counter_chars
        final_line_features.counter_spaces = counter_spaces
        final_line_features.counter_numbers = counter_numbers
        final_line_features.counter_alphabetical = counter_alphabetical
        final_line_features.counter_alphanumerical_chars = counter_alphanumerical_chars
        final_line_features.counter_words = counter_words

        final_line_features.counters_numbers = counters_numbers
        final_line_features.counters_wordlengths = counters_wordlengths
        final_line_features.counters_alphabetical_ratios = counters_alphabetical_ratios

        final_line_features.numbers_ratio = numbers_ratio
        final_line_features.alphabetical_ratio = alphabetical_ratio
        final_line_features.alphanumerical_chars_ratio = alphanumerical_chars_ratio
        final_line_features.special_chars_ratio = special_chars_ratio
        final_line_features.spaces_ratio = spaces_ratio

        final_line_features.many_alphabetical_in_last_word = many_alphabetical_in_last_word
        final_line_features.many_alphabetical_in_middle_words = many_alphabetical_in_middle_words
        final_line_features.many_numbers_in_first_word = many_numbers_in_first_word
        final_line_features.x_box_sizes = x_box_sizes
        final_line_features.x_gaps = x_gaps

        final_line_features.maximum_x_gap = maximum_x_gap
        final_line_features.mean_x_gap = mean_x_gap
        final_line_features.median_x_gap = median_x_gap

        return final_line_features


class DictionaryHandler(object):
    def __init__(self):
        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_DICTIONARY_HANDLER,
                                    self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL,
                                    leading_tag=self.__class__.__name__)

        self.cpr.print("init dictionary handler")
        self.data_functs = None  # storage for json object
        self.data_titles = None  # storage for json object
        self.texts_functs = None
        self.texts_titles = None
        if self.config.USE_DICTIONARIES_FOR_PERSON_PARSING:
            self.load_dictionaries()
            # get the rows as sorted list of texts longest first
            if self.data_functs is not None:
                check_tf = self.sort_rows(self.get_rows(self.data_functs))
                self.texts_functs = check_tf
            if self.data_titles is not None:
                check_tt = self.sort_rows(self.get_rows(self.data_titles))
                self.texts_titles = check_tt

    def diff_name_title(self, text_to_check):

        len_text_to_check = len(text_to_check)
        name_found = text_to_check
        title_found = ""

        for entry_index, entry in enumerate(self.texts_titles):
            title, tlen = entry
            # accelerate the process, by skipping comparisons which have longer texts
            if tlen > len_text_to_check:
                continue
            # compare the texts
            if title in text_to_check:
                name_found = text_to_check.replace(title, "", 1).strip()
                title_found = title
                break

        return name_found, title_found

    def load_dictionaries(self):
        base_dict_path = self.get_dict_path()

        filepath_titles_dict = os.path.join(base_dict_path, "dict_titles.json")
        filepath_functs_dict = os.path.join(base_dict_path, "dict_functs.json")

        # load titles
        if os.path.exists(filepath_titles_dict):
            with open(filepath_titles_dict) as f:
                self.data_titles = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_titles.json missing at specificied path",
                filepath_titles_dict)

        # load functs
        if os.path.exists(filepath_functs_dict):
            with open(filepath_functs_dict) as f:
                self.data_functs = json.load(f)
        else:
            self.cpr.printex(
                "dictionary dict_functs.json missing at specificied path",
                filepath_functs_dict)

    def get_rows(self, dict_data):
        rows = dict_data['rows']
        final_rows = []
        for entry in rows:
            text = entry[0]
            final_rows.append((text, len(text)))
        return final_rows

    def sort_rows(self, rows):
        #itemgetter(1),
        rows.sort(key=lambda t: len(t[0]), reverse=True)
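        # e.g. [("Dr.", 3), ("Dipl.-Ing.", 10)] -> [("Dipl.-Ing.", 10), ("Dr.", 3)]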
        return rows

    def path(self):
        return os.getcwd()

    def get_dict_path(self):
        complete = os.path.join(self.path(), "additionals", "dictionaries")
        return complete


class SegmentClassifier(object):
    """
    This is the basic handler for classification,
    which gets accessed from root-/outside classes.
    """

    def __init__(self):

        config_handler = ConfigurationHandler(first_init=False)

        self.config = config_handler.get_config()
        self.cpr = ConditionalPrint(self.config.PRINT_SEGMENT_CLASSIFIER, self.config.PRINT_EXCEPTION_LEVEL,
                                    self.config.PRINT_WARNING_LEVEL, leading_tag=self.__class__.__name__)
        self.cpr.print("init segment classifier")

    def classify_file_segments(self, ocromore_data):
        lines = ocromore_data['lines']
        feats = ocromore_data['line_features']
        file_info = ocromore_data['file_info']
        all_file_segments = AllSegments(len(lines), self.cpr, self.config)

        prev_line = None
        prev_text = None
        for current_line_index, current_line in enumerate(lines):
            current_features = feats[current_line_index]
            current_text = current_line['text']
            current_index = current_line['line_index']
            # create a combined lined object with optimized (removed) separation
            combined_line = None
            if prev_line is not None:
                combined_lines = dh.join_separated_lines([prev_text, current_text])
                combined_line = dh.join_joined_lines(combined_lines)
            else:
                combined_line = current_text
            # pass parameters to matching functions
            all_file_segments.match_my_segments(current_line, current_text, current_index, current_features, 
                                                prev_line, combined_line)
            prev_line = current_line
            prev_text = current_text

        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        self.adapt_stop_index_in_last_segment(all_file_segments)

        # does the last steps in segment matching
        all_file_segments.finish_segment_matching(lines, feats, file_info)

        # do again after final step
        if self.config.MATCH_UNTIL_NEXT_START_THEN_STOP_CONDITION:
            self.adapt_non_explicit_indices(all_file_segments)
        else:
            all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        self.adapt_stop_index_in_last_segment(all_file_segments)

        ocromore_data['segmentation'] = all_file_segments

        return ocromore_data


    def adapt_stop_index_in_last_segment(self, all_file_segments):
        """
        Sets the stop_index for the last recognized segment, which
        is a special case and is usually not filled beforehand, because
        there is no next start index
        :param all_file_segments: holder object for segment classes and other info
        :return: None
        """

        # search for last segment
        saved_start_index = -1
        saved_last_segment = None
        for segment in all_file_segments.my_classes:
            # only count segmented segments
            if segment.start_was_segmented is False:
                continue

            if segment.start_line_index >= saved_start_index:
                saved_start_index = segment.start_line_index
                saved_last_segment = segment

        if saved_last_segment is None:
            return

        # adapt the last stop index of last segment
        saved_last_segment.stop_line_index = all_file_segments.number_of_lines-1
        saved_last_segment.stop_was_segmented = True  # todo think about if this is necessary?

    def adapt_non_explicit_indices(self, all_file_segments):

        # update start and explicit stop tags first
        all_file_segments.correct_overlaps_index_field(only_start_tags=True)

        # fill undefined stop regions until next start region
        all_file_segments.fill_start_index_until_next_stop()


class OCRset:
    """
        A storage class for a y_mean value
        and a set of lines which were assigned to each other.
        If the lineset values were not edited, they are initialized with 'False'.
    """
    N_DISTANCE_SHORTEST_TAG = "n_distance_shortest"

    def __init__(self, lines_size, y_mean, msa_handler):
        lineset = []
        for x in range(0, lines_size):
            lineset.append(False)

        self._set_lines = lineset
        self._size = lines_size
        self._y_mean = y_mean  # mean y coordinate of all lines referenced in this set
        self.shortest_distance_line_index = -1
        self._unspaced = False  # indicates the set_lines was unspaced
        self._refspaced = False  # indicates the set_lines was reference spaced
        self._text_unspacer = TextUnspacer()
        self.shortest_distance_line = None  # holder element for recognized shortest distance line
        self._best_msa_text = ""
        self._text_seg = None
        self._is_origin_database = False
        self._database_handler = None
        config_handler = ConfigurationHandler(first_init=False)
        self._config = config_handler.get_config()

        if 'ExceptionInitializing' in self._config:
            print("Exception initializing config, don't print")
            self._cpr = ConditionalPrint(False, False, False)
        else:

            self._cpr = ConditionalPrint(self._config.PRINT_MSA_HANDLER,
                                         self._config.PRINT_EXCEPTION_LEVEL,
                                         self._config.PRINT_WARNING_LEVEL)

        self._msa_handler = msa_handler

    def add_predictor(self, predictor):
        self.predictor = predictor
        self._msa_handler.add_predictor(predictor)

    def is_database_set(self, enabled, database_handler):
        self._is_origin_database = enabled
        self._database_handler = database_handler

    def edit_line_set_value(self, set_index, new_value):
        self._set_lines[set_index] = new_value

    def get_line_set_value_line(self, set_index):
        return self._set_lines[set_index]

    def get_line_set_value_text(self, set_index):
        value_line = self.get_line_set_value_line(set_index)
        value_text = self.get_line_content(value_line)
        return value_text

    def get_msa_best_text(self):
        return self._best_msa_text

    def set_msa_best_text(self, value):
        self._best_msa_text = value

    @property
    def size(self):
        return self._size

    @size.setter
    def size(self, value):
        self._size = value

    @property
    def y_mean(self):
        return self._y_mean

    @y_mean.setter
    def y_mean(self, value):
        self._y_mean = value  # assign the backing field; 'self.y_mean' would recurse

    def calculate_y_mean(self):
        """
        Goes through set elements and calculates y_mean for y_start and y_stop values
        :return:
        """

        acc_counter = 0
        y_start_final = 0
        y_stop_final = 0

        for line in self._set_lines:
            # don't count undefined values for means
            if line is False or line is None:
                continue
            # accumulate y-values
            (x_start, y_start, x_stop, y_stop) = line.coordinates
            y_start_final = y_start_final + y_start
            y_stop_final = y_stop_final + y_stop
            # add number of accumulation count
            acc_counter = acc_counter + 1

        y_start_mean = y_start_final / acc_counter
        y_stop_mean = y_stop_final / acc_counter
        y_mean = (y_start_mean + y_stop_mean) / 2

        self._y_mean = round(y_mean)

    def is_full(self):
        """
        Checks if all lines are defined in the lineset
        :return: True or False
        """
        for line in self._set_lines:
            if line is False:
                return False

        return True

    def print_me(self, diff_only=False):

        lineset_acc = ""
        one_line_is_false = False

        for line in self._set_lines:
            try:
                ocr_text = self.get_line_content(line)
                if ocr_text is False:
                    one_line_is_false = True
                    lineset_acc = lineset_acc + str(ocr_text) + "||"
                else:
                    lineset_acc = lineset_acc + ocr_text + "||"

            except Exception:
                self._cpr.print("problem creating printable lineset")

        lineset_acc = lineset_acc + "||"
        msa_str = str(self._best_msa_text)
        if diff_only is True:
            if one_line_is_false is True:
                self._cpr.print(
                    str(self.y_mean) + "||" + msa_str + "||" +
                    str(self.shortest_distance_line_index) + "||" +
                    lineset_acc)
        else:
            self._cpr.print(
                str(self.y_mean) + "||" + msa_str + "||" +
                str(self.shortest_distance_line_index) + "||" + lineset_acc)

    def calculate_n_distance_keying(self):

        # get the texts
        texts = []
        for line in self._set_lines:
            text = self.get_line_content(line)
            texts.append(text)

        self._n_distance_voter = NDistanceVoter(texts)
        if "ExceptionInitializing" in self._config.keys():
            print("Exception in initializing config using default in c")
            shortest_dist_index = self._n_distance_voter.compare_texts( \
                    take_longest_on_empty_lines = True, \
                    vote_without_spaces = False)
        else:
            shortest_dist_index = self._n_distance_voter.compare_texts( \
                    take_longest_on_empty_lines = self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS, \
                    vote_without_spaces = self._config.NDIST_VOTE_WITHOUT_SPACES)

        # save the result
        self.shortest_distance_line_index = shortest_dist_index
        self.shortest_distance_line = self._set_lines[shortest_dist_index]

    def calculate_n_distance_keying_wordwise(self):
        if self._is_origin_database is False:
            self._cpr.printex(
                "Wordwise keying only possible with database originated ocr_sets"
            )
            raise Exception

        # get maximum word index todo probably will be refactored
        max_word_indices = []
        for line in self._set_lines:
            if line is False or line is None or line.textstr == '':
                max_word_indices.append(0)
            else:
                max_word_index = int(max(line.data["word_idx"]))
                max_word_indices.append(max_word_index)

        max_word_index = max(max_word_indices)
        self._cpr.print("mwi", max_word_index)

        def get_word_at_calc_wordindex(line, word_index):
            accumulated_word = ""
            word_indices = line.data["calc_word_idx"]

            for char_index, char in enumerate(line.data["char"]):
                current_word_index = word_indices[char_index]
                if current_word_index == word_index:
                    accumulated_word += char
                if current_word_index > word_index:
                    break
            return accumulated_word

        max_word_index = 2
        words_mock = [["hallo", "h4llo", "hallo"], ["zwei", None, "2wei"]]
        ndist_voter = NDistanceVoter(None)

        # get corresponding words
        for current_word_index in range(0, max_word_index):
            words = []
            """
            for line in self._set_lines:
                if line is False or line is None:
                    words.append(False)
                else:
                    if current_word_index < int(max(line.data["calc_word_idx"])):
                        current_word = get_word_at_calc_wordindex(line, current_word_index)
                        words.append(current_word)
                    else:
                        words.append(False)
            """

            words = words_mock[current_word_index]
            ndist_voter.set_texts(words)
            wordindex_result = ndist_voter.compare_texts( \
                take_longest_on_empty_lines = self._config.NDIST_VOTE_LONGEST_IF_EMPTY_STRINGS, \
                vote_without_spaces=self._config.NDIST_VOTE_WITHOUT_SPACES)

            ndist_voter.reset()
            self._cpr.print(words[wordindex_result])
            self._cpr.print("--")
            # just assume words is filled here and a 3 word list

        return

    def get_longest_index(self):
        def if_notdef_set_emptystring(value):
            if value is True or value is False or value is None:
                return ""

            return value

        lsval_1 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(0)))
        lsval_2 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(1)))
        lsval_3 = if_notdef_set_emptystring(
            self.get_line_content(self.get_line_set_value_line(2)))

        len_pline_1 = len(lsval_1)
        len_pline_2 = len(lsval_2)
        len_pline_3 = len(lsval_3)
        # max_index_value = max([len_pline_1, len_pline_2, len_pline_3])
        max_index = np.argmax([len_pline_1, len_pline_2, len_pline_3])
        self._cpr.print(max_index)
        return max_index

    def calculate_msa_best(self,
                           take_n_dist_best_index=False,
                           take_longest_as_pivot=False):

        # do a preselection of best element, if the parameter is set to take best n_dist_index as a pivot
        best_index = 1
        if take_longest_as_pivot is True:
            best_index = self.get_longest_index()
        elif take_n_dist_best_index is True:
            best_index = self.get_shortest_n_distance_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        index1 = indices[0]
        index2 = indices[1]

        self._cpr.print("msa selection taking best:", best_index, "others:(",
                        index1, "and", index2, ")")

        try:
            line_1 = self.get_line_content(self._set_lines[index1])
            line_2 = self.get_line_content(
                self._set_lines[best_index])  # should be best
            line_3 = self.get_line_content(self._set_lines[index2])

            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", line_1)
            self._cpr.print("text_B", line_2)
            self._cpr.print("text_C", line_3)

            lines = [line_1, line_2, line_3]

            line_1_ok = not Random.is_false_true_or_none(line_1)
            line_2_ok = not Random.is_false_true_or_none(line_2)
            line_3_ok = not Random.is_false_true_or_none(line_3)
            ok_lines = [line_1_ok, line_2_ok, line_3_ok]
            not_ok_indices = []
            ok_indices = []
            for ok_index, ok in enumerate(ok_lines):
                if ok is True:
                    # not_ok_indices.append(ok_index)
                    ok_indices.append(ok_index)

            ok_len = len(ok_indices)

            if ok_len == 1:
                result = lines[ok_indices[0]]
            elif ok_len == 0:
                result = None
            elif ok_len == 2:
                result = lines[ok_indices[0]]
            else:
                result = self._msa_handler.get_best_of_three(
                    line_1, line_2, line_3)

            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex(
                "ocr_set.py Exception in MSA, just taking line prio exception:",
                e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)

            self._best_msa_text = self.get_line_content(self._set_lines[1])

    def obtain_best_index(self,
                          use_n_dist_pivot,
                          use_longest_pivot,
                          default_best_index=1):
        # do a preselection of best element, if the parameter is set to take best n_dist_index as a pivot
        best_index = 1

        if use_n_dist_pivot is True:
            ldist_best_index = self.get_shortest_n_distance_index()  # this doesn't work in all cases atm
            best_index = ldist_best_index
        if use_longest_pivot is True:
            best_index = self.get_longest_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        other_indices = indices
        return best_index, other_indices

    def obtain_line_info(self, best_index, other_indices):

        line_1 = self._set_lines[other_indices[0]]
        line_2 = self._set_lines[best_index]  # should be best
        line_3 = self._set_lines[other_indices[1]]

        text_1 = self.get_line_content(line_1)
        text_2 = self.get_line_content(line_2)  # should be best
        text_3 = self.get_line_content(line_3)

        self._cpr.print("ocr_set:")
        self._cpr.print("text_A", text_1)
        self._cpr.print("text_B", text_2)
        self._cpr.print("text_C", text_3)

        line_1_ok = not Random.is_false_true_or_none(line_1)
        line_2_ok = not Random.is_false_true_or_none(line_2)
        line_3_ok = not Random.is_false_true_or_none(line_3)
        ok_lines = [line_1_ok, line_2_ok, line_3_ok]

        ok_indices = []
        for ok_index, ok in enumerate(ok_lines):
            if ok is True:
                # not_ok_indices.append(ok_index)
                ok_indices.append(ok_index)

        ok_len = len(ok_indices)

        texts_return = [text_1, text_2, text_3]
        lines_return = [line_1, line_2, line_3]
        lines_return_ok = [line_1_ok, line_2_ok, line_3_ok]

        return texts_return, lines_return, lines_return_ok, ok_len

    def calculate_msa_best_all(self,
                               use_ndist_pivot,
                               use_longest_pivot,
                               use_charconfs,
                               use_wordwise,
                               use_searchspaces,
                               prefered_index=1):

        # get the pivot index and the other indices
        best_index, other_indices = self.obtain_best_index(
            use_ndist_pivot, use_longest_pivot, prefered_index)
        self._cpr.print("msa selection taking best:", best_index, "others:(",
                        other_indices[0], "and", other_indices[1], ")")

        # fetch the lines to process and info which (and how many) lines are ok
        texts, lines, lines_ok, number_lines_ok = self.obtain_line_info(
            best_index, other_indices)

        # do the msa if there is at least one line ok (confidence vote can be done with one line also :))
        if use_wordwise is True:
            if number_lines_ok != 0:
                result, self._text_seg = self._msa_handler.get_best_of_three_wordwise(
                    lines[0], lines[1], lines[2], use_charconfs,
                    use_searchspaces)
            else:
                result = None

        else:
            if number_lines_ok != 0:

                text_1 = self.get_line_content(lines[0])
                text_2 = self.get_line_content(lines[1])  # should be best
                text_3 = self.get_line_content(lines[2])

                result = self._msa_handler.get_best_of_three(
                    text_1,
                    text_2,
                    text_3,
                    line_1=lines[0],
                    line_2=lines[1],
                    line_3=lines[2],
                    use_charconfs=use_charconfs,
                    use_searchspaces=use_searchspaces)
            else:
                result = None

        self._best_msa_text = result

    def calculate_msa_best_charconf(self,
                                    take_n_dist_best_index=False,
                                    take_longest_as_pivot=True):

        # do a preselection of best element, if the parameter is set to take best n_dist_index as a pivot
        best_index = 1

        if take_n_dist_best_index is True:
            ldist_best_index = self.get_shortest_n_distance_index()  # this doesn't work in all cases atm
            best_index = ldist_best_index
        if take_longest_as_pivot is True:
            best_index = self.get_longest_index()

        indices = [0, 1, 2]
        indices.remove(best_index)
        index1 = indices[0]
        index2 = indices[1]

        self._cpr.print("msa selection taking best:", best_index, "others:(",
                        index1, "and", index2, ")")

        try:

            line_1 = self._set_lines[index1]
            line_2 = self._set_lines[best_index]
            line_3 = self._set_lines[index2]

            text_1 = self.get_line_content(line_1)
            text_2 = self.get_line_content(line_2)  # should be best
            text_3 = self.get_line_content(line_3)

            self._cpr.print("ocr_set:")
            self._cpr.print("text_A", text_1)
            self._cpr.print("text_B", text_2)
            self._cpr.print("text_C", text_3)

            lines = [text_1, text_2, text_3]

            line_1_ok = not Random.is_false_true_or_none(text_1)
            line_2_ok = not Random.is_false_true_or_none(text_2)
            line_3_ok = not Random.is_false_true_or_none(text_3)
            ok_lines = [line_1_ok, line_2_ok, line_3_ok]
            not_ok_indices = []
            ok_indices = []
            for ok_index, ok in enumerate(ok_lines):
                if ok is True:
                    # not_ok_indices.append(ok_index)
                    ok_indices.append(ok_index)

            ok_len = len(ok_indices)

            if ok_len == 0:
                result = None
            else:
                result = self._msa_handler.get_best_of_three(text_1, text_2, text_3, use_charconfs=True, \
                                                      line_1=line_1,line_2=line_2,line_3=line_3)

            self._best_msa_text = result
        except Exception as e:
            self._cpr.printex(
                "ocr_set.py Exception in MSA, just taking line prio exception:",
                e)
            tr = inspect.trace()
            self._cpr.printex("trace is:", tr)
            if take_n_dist_best_index is True:
                self._best_msa_text = self.get_line_content(
                    self._set_lines[ldist_best_index])
            else:
                self._best_msa_text = self.get_line_content(
                    self._set_lines[best_index])

    def get_shortest_n_distance_text(self):
        if self.shortest_distance_line_index >= 0:
            line = self.shortest_distance_line
            line_text = self.get_line_content(line)
            return line_text
        else:
            return None

    def set_shortest_n_distance_text(self, value):
        if self.shortest_distance_line_index >= 0:
            sd_line = self.shortest_distance_line
            sd_line_new_value = self.set_line_content(sd_line, value)
            self.set_shortest_n_distance_line(sd_line_new_value)
        else:
            return None

    def get_shortest_n_distance_line(self):
        if self.shortest_distance_line_index >= 0:
            line = self.shortest_distance_line
            return line
        else:
            return None

    def set_shortest_n_distance_line(self, value):
        self.shortest_distance_line = value

    def get_shortest_n_distance_index(self):
        if self.shortest_distance_line_index >= 0:
            return self.shortest_distance_line_index
        else:
            return None

    def print_shortest_n_distance_line(self):
        line = self.get_shortest_n_distance_text()
        if line is not None and line is not False:
            self._cpr.print(line)

    def print_msa_best_line(self):
        msa_text = self._best_msa_text
        if msa_text is not None and msa_text is not False:
            print(msa_text)
        else:
            self._cpr.print(str(msa_text))

    def get_line_content(self, line):
        """
        Helper method to get line content, because ocropus content
        has other access properties. Method behaves differently when
        the current set is a database set
        :param line: line element to check upon
        :return: string with line content, or 'False' if line isn't defined.
        """

        # hint: the attribute checked is created by hocr_line_normalizer
        if line is False:
            return False
        # elif hasattr(line, 'ocr_text_normalized'):

        if self._is_origin_database is False:
            # just the standard behaviour
            if line.ocr_text_normalized is not None:
                return line.ocr_text_normalized
            else:
                return line.ocr_text
        else:
            return line.textstr

    def set_line_content(self, line, value):
        """
        Helper method to set line content, because ocropus content
        has other access properties.
        :param line: line element to set the value to
        :param value: value to set to 'ocr_text_normalized' property
        :return: line or false if line not defined
        """

        # hint: the attribute checked is created by hocr_line_normalizer
        if line is False:
            return False

        line.ocr_text_normalized = value
        return line

    def unspace_lines(self, list_index_to_unspace, unspaced_list_index):

        unspaced_lines = self._text_unspacer.unspace_texts(
            self._set_lines, list_index_to_unspace, unspaced_list_index)

        self._unspaced = True
        self._refspaced = False
        self._set_lines = unspaced_lines

    def refspace_lines(self, list_index_to_adapt, list_index_reference):

        refspaced_lines = self._text_unspacer.refspace_texts(
            self._set_lines, list_index_to_adapt, list_index_reference)

        self._unspaced = False
        self._refspaced = True
        self._set_lines = refspaced_lines
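
    # Usage sketch (assumes an already-populated set): unspace_lines and
    # refspace_lines are alternative spacing passes over the same lines, and
    # the two flags record which one was applied last:
    #
    #     ocr_set.unspace_lines(2, 0)    # _unspaced=True,  _refspaced=False
    #     ocr_set.refspace_lines(2, 0)   # _unspaced=False, _refspaced=True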

# Example no. 13
class SearchSpaceProcessor(object):
    def __init__(self, y_size, x_size, wildcard_character,
                 substitution_character):
        self._y_size = y_size
        self._x_size = x_size
        self._middle_index = Random.find_middle(self._x_size, True)
        self._pre_middle_index = self.get_middle_index() - 1
        self._nex_middle_index = self.get_middle_index() + 1

        self._wildcard_character = wildcard_character
        self._substitution_character = substitution_character
        # groups of characters that the OCR engines frequently confuse;
        # a character may occur in several groups
        self.similar_chars = [
            ['o', 'ö'],
            ['<', 'o'],  # untested: is this really better?
            ['O', 'Ö'],
            ['0', 'O', '9'],
            ['d', 'ö'],
            # ['1', 'l'],
            ['l', 'j', '1'],
            ['I', 'l'],
            ['u', 'ü'],
            ['U', 'Ü', 'O'],
            ['a', 'ä'],
            ['A', 'Ä'],
            [':', ';'],
            ['-', '¬'],
            ['"', "'"],
            ['C', 'G', 'c'],
            # just for testing ...
            ['.', ','],
            [',', ';'],
            ['v', 'V'],
            ['w', 'W'],
            ['i', 'l', 't', '1', '.'],  # 1, l and i are also possible
            ['r', 'n'],
            ['%', 'm'],
            ['&', 'é'],
            ['e', 'é'],
        ]

        config_handler = ConfigurationHandler(first_init=False)
        self._config = config_handler.get_config()
        self._cpr = ConditionalPrint(self._config.PRINT_SEARCH_SPACE_PROCESSOR,
                                     self._config.PRINT_EXCEPTION_LEVEL,
                                     self._config.PRINT_WARNING_LEVEL)

    def get_middle_index(self):
        return self._middle_index

    # todo: similar chars for each char could be preprocessed once at start
    def get_simchars_for_char(self, char):
        simchars_return_array = []

        for simchars in self.similar_chars:
            if char in simchars:
                simchars_return_array.extend(simchars)

        if len(simchars_return_array) >= 1:
            return simchars_return_array

        return [char]
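
    # Usage sketch: a character can occur in several groups, so the lookup
    # returns the union of all matching groups (duplicates included); the
    # constructor arguments below are illustrative assumptions.
    #
    #     ssp = SearchSpaceProcessor(3, 7, '¦', '°')
    #     ssp.get_simchars_for_char('l')
    #     # -> ['l', 'j', '1', 'I', 'l', 'i', 'l', 't', '1', '.']
    #     ssp.get_simchars_for_char('x')
    #     # -> ['x'] (no group matches, the char itself is returned)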

    def get_pre_middle_index(self):
        return self._pre_middle_index

    def get_nex_middle_index(self):
        return self._nex_middle_index

    def get_wildcard_char(self):
        return self._wildcard_character

    def get_substitution_char(self):
        return self._substitution_character

    def get_y_size(self):
        return self._y_size

    def validate_column_features(self,
                                 search_space,
                                 x_index,
                                 reference_char=None,
                                 count_up_similar_references=False):
        counter_whitespaces = 0
        counter_wildcards = 0
        counter_nones = 0
        counter_characters = 0
        counter_reference_char = 0
        counter_same_characters = 0
        counter_dict = {}
        counter_special_characters = 0
        most_occuring_char = None

        otherchar = None
        otherchar_y_index = None
        simchars = None
        if reference_char is not None and count_up_similar_references is True:
            simchars = self.get_simchars_for_char(reference_char)
            if len(simchars) != 1:
                self._cpr.print("evaluate")

        # gather data
        for y_index in range(0, self.get_y_size()):
            row = search_space[y_index]
            column_item = row[x_index]
            if column_item == self.get_wildcard_char():
                counter_wildcards += 1
            elif column_item == ' ':
                counter_whitespaces += 1
            elif column_item in (None, False, True):
                counter_nones += 1
            else:
                if reference_char is not None:

                    if count_up_similar_references is False and column_item == reference_char:
                        counter_reference_char += 1
                    if count_up_similar_references is True:
                        matching = [s for s in simchars if column_item in s]
                        boolmatch = len(matching) >= 1
                        if boolmatch is True:
                            counter_reference_char += 1

                counter_characters += 1
                otherchar = column_item
                otherchar_y_index = y_index

            if column_item is not None:
                if column_item != self._wildcard_character and \
                        column_item != " ":
                    if column_item not in counter_dict:
                        counter_dict.update({column_item: 1})
                    else:
                        counter_dict[column_item] += 1
            if Random.is_special_character(column_item):
                counter_special_characters += 1

        # the highest amount of same characters in this column
        if len(counter_dict.items()) >= 1:
            most_occuring_char, counter_same_characters = max(
                counter_dict.items(), key=operator.itemgetter(1))

        # extract features
        features = []
        counter_whitespace_and_wildcards = counter_whitespaces + counter_wildcards

        if counter_nones == self.get_y_size():
            features.append(ColumnFeatures.ONLY_NONE.value)
        if (counter_wildcards == self.get_y_size() - 1
                and counter_characters == 1):
            features.append(ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value)
            # additional feature, the only char is a special character
            if Random.is_special_character(otherchar):
                features.append(
                    ColumnFeatures.ONE_SPECIALCHAR_REST_WILDCARDS.value)

        if (counter_whitespaces == self.get_y_size() - 1
                and counter_characters == 1):
            features.append(ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value)
        if (counter_whitespace_and_wildcards == self.get_y_size() - 1
                and counter_characters == 1):
            features.append(
                ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value)
            # additional feature, the only char is a special character
            if otherchar != self._wildcard_character and otherchar != " "\
                    and Random.is_special_character(otherchar):
                #print("feature extraction")

                #print(search_space[0])
                #print(search_space[1])
                #print(search_space[2])
                #print("x-index",x_index)
                features.append(
                    ColumnFeatures.
                    ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value)

        if counter_reference_char == self.get_y_size() - 1 and (
                counter_whitespaces == 1 or counter_wildcards == 1):
            features.append(ColumnFeatures.MOSTLY_REFERENCE_CHAR.value)
        if counter_whitespaces == self.get_y_size():
            features.append(ColumnFeatures.ONLY_WHITESPACE.value)
        if counter_wildcards == self.get_y_size():
            features.append(ColumnFeatures.ONLY_WILDCARD.value)
        if counter_whitespace_and_wildcards == self.get_y_size():
            features.append(ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value)
        if counter_reference_char >= 1:
            features.append(ColumnFeatures.CONTAINS_REFERENCE_CHAR.value)
        if counter_same_characters == self.get_y_size():
            if counter_special_characters == self.get_y_size():
                features.append(ColumnFeatures.ONLY_SAME_SPECIAL_CHAR.value)
        if Random.is_special_character(most_occuring_char) \
            and counter_same_characters == self.get_y_size()-1 \
            and most_occuring_char != self._wildcard_character \
            and counter_whitespace_and_wildcards == 1:

            features.append(ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value)

        return features, otherchar, otherchar_y_index
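
    # Illustrative example (3 engines, '¦' assumed as wildcard char): for the
    # middle column (x_index 1) of the toy search space
    #
    #     [['a', 'b', ' '],
    #      ['a', '¦', ' '],
    #      ['a', 'b', ' ']]
    #
    # with reference_char='b' the counts are one wildcard and two reference
    # chars, so MOSTLY_REFERENCE_CHAR and CONTAINS_REFERENCE_CHAR fire, while
    # the ONE_CHAR_* features do not (two characters are present).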

    def shift_from_mid(self,
                       search_space,
                       line_index,
                       to_left,
                       other_substitution_char=None):
        if other_substitution_char is not None:
            used_substitution_char = other_substitution_char
        else:
            used_substitution_char = self.get_substitution_char()

        mid_val = search_space[line_index][self.get_middle_index()]
        possible_shifts = [
            ' ',
            self.get_wildcard_char(), used_substitution_char, None, False,
            True, 0
        ]
        shifted = False
        if to_left is True:
            if search_space[line_index][
                    self.get_pre_middle_index()] in possible_shifts:
                search_space[line_index][self.get_pre_middle_index()] = mid_val
                search_space[line_index][
                    self.get_middle_index()] = used_substitution_char
                shifted = True
        else:
            if search_space[line_index][
                    self.get_nex_middle_index()] in possible_shifts:
                search_space[line_index][self.get_nex_middle_index()] = mid_val
                search_space[line_index][
                    self.get_middle_index()] = used_substitution_char
                shifted = True

        return search_space, shifted
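
    # Sketch of a left shift ('¦' wildcard and '°' substitution char are
    # assumptions): the middle value moves into a free pre-middle cell and
    # leaves the substitution char behind.
    #
    #     row before: [..., ' ', 'x', '¦', ...]   # pre-mid, mid, nex-mid
    #     shift_from_mid(space, row, to_left=True)
    #     row after:  [..., 'x', '°', '¦', ...]   # shifted == True
    #
    # If the pre-middle cell holds a real character, nothing moves and
    # shifted stays False.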

    def shift_from_to(self,
                      search_space,
                      y_index,
                      x_from,
                      x_to,
                      other_substitution_char=None):

        if other_substitution_char is not None:
            used_substitution_char = other_substitution_char
        else:
            used_substitution_char = self.get_substitution_char()

        possible_shifts = [
            ' ',
            self.get_wildcard_char(), used_substitution_char, None, False,
            True, 0
        ]
        swap_val = search_space[y_index][x_from]

        shifted = False

        if search_space[y_index][x_to] in possible_shifts:
            search_space[y_index][x_to] = swap_val
            search_space[y_index][x_from] = used_substitution_char
            shifted = True

        return search_space, shifted

    def set_space_to_value(self,
                           search_space,
                           y_index,
                           x_index,
                           used_substitution_value=None):

        if used_substitution_value is not None:
            used_substitution_char = used_substitution_value
        else:
            used_substitution_char = self.get_substitution_char()

        search_space[y_index][x_index] = used_substitution_char

        shifted = True

        return search_space, shifted

    def process_search_space(self, search_space, search_space_confs,
                             use_similar_chars):
        processed_space = search_space
        processed_space_confs = search_space_confs
        change_done = False

        # self.output_as_scrollbar(search_space) #todo build this in someday

        mid_column_feats, otherchar_mid, oc_mid_index = self.validate_column_features(
            search_space, self.get_middle_index())

        if self._config.MSA_BEST_SEARCHSPACE_MITIGATE_SPACE_HOPS:

            if ColumnFeatures.ONLY_WHITESPACE_OR_WILDCARD.value in mid_column_feats:

                # some char 'hopped' over a whitespace, get the characters back together
                pre_column_feats, otherchar_pre, oc_pre_index = \
                    self.validate_column_features(
                        search_space, self.get_pre_middle_index(),
                        reference_char=None)
                nex_column_feats, otherchar_nex, oc_nex_index = \
                    self.validate_column_features(
                        search_space, self.get_nex_middle_index(),
                        reference_char=None)

                if ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats and \
                    ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats:

                    if otherchar_nex == otherchar_pre and \
                        oc_pre_index != oc_nex_index:

                        processed_space, shifted_longtrans = self.shift_from_to(
                            search_space, oc_pre_index, 0, 2)

                        if shifted_longtrans is True:

                            processed_space_confs, shifted_confs_longtrans = \
                                self.shift_from_to(
                                    search_space_confs, oc_pre_index, 0, 2, 0)
                            change_done = True

                        if change_done:
                            search_space = processed_space
                            search_space_confs = processed_space_confs



        if ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in mid_column_feats \
                or ColumnFeatures.ONE_CHAR_REST_WHITESPACE.value in mid_column_feats \
                    or ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in mid_column_feats:


            pre_column_feats, otherchar_pre, oc_pre_index = \
                self.validate_column_features(
                    search_space, self.get_pre_middle_index(),
                    otherchar_mid, use_similar_chars)
            nex_column_feats, otherchar_nex, oc_nex_index = \
                self.validate_column_features(
                    search_space, self.get_nex_middle_index(),
                    otherchar_mid, use_similar_chars)

            shifted = False
            left_right = None
            if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in pre_column_feats\
                    or (ColumnFeatures.CONTAINS_REFERENCE_CHAR.value in pre_column_feats
                        and ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats):

                left_right = True
                processed_space, shifted = self.shift_from_mid(
                    search_space, oc_mid_index, left_right)
            if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in nex_column_feats \
                    or (ColumnFeatures.CONTAINS_REFERENCE_CHAR.value in nex_column_feats
                        and ColumnFeatures.ONE_CHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats):
                left_right = False
                processed_space, shifted = self.shift_from_mid(
                    search_space, oc_mid_index, left_right)
            if shifted:

                if self._config.MSA_BEST_SEARCHSPACE_QUOTE_NORMALIZATION  \
                        and (otherchar_mid == "'" or otherchar_mid == '"'):

                    # this part merges '' into a single quote character and
                    # corrects the alignment
                    x_middle_index = self.get_middle_index()
                    if left_right is True:
                        delete_index = x_middle_index + 1
                        shift_index = x_middle_index - 1
                    else:
                        delete_index = x_middle_index - 1
                        shift_index = x_middle_index + 1

                    if otherchar_mid == "'":
                        processed_space, shiftedD1 = self.set_space_to_value(
                            search_space, oc_mid_index, shift_index, '"')
                        processed_space, shiftedD2 = self.set_space_to_value(
                            processed_space, oc_mid_index, delete_index)
                        search_space_confs, shiftedD3 = self.set_space_to_value(
                            search_space_confs,
                            oc_mid_index,
                            delete_index,
                            used_substitution_value=0)
                    else:
                        # only push the confidences: the confusion was between
                        # ' and ", and " should be prioritized
                        search_space_confs, shiftedD3 = self.set_space_to_value(
                            search_space_confs,
                            oc_mid_index,
                            shift_index,
                            used_substitution_value=1000)

                processed_space_confs, shifted_confs = self.shift_from_mid(
                    search_space_confs, oc_mid_index, left_right, 0)
                change_done = True
        elif ColumnFeatures.ONLY_WHITESPACE.value in mid_column_feats or ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in mid_column_feats:
            # this case checks for 'far-transitions' of similar chars and does them if possible
            pre_column_feats, otherchar_pre, oc_pre_index = \
                self.validate_column_features(
                    search_space, self.get_pre_middle_index(),
                    otherchar_mid, use_similar_chars)
            nex_column_feats, otherchar_nex, oc_nex_index = \
                self.validate_column_features(
                    search_space, self.get_nex_middle_index(),
                    otherchar_mid, use_similar_chars)
            reference_char = None
            reference_char_y_index = None
            check_index = None

            pre_is_one_char = False
            nex_is_one_char = False
            if ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in pre_column_feats:
                reference_char = otherchar_pre
                reference_char_y_index = oc_pre_index

                pre_is_one_char = True
                check_index = self.get_nex_middle_index()
                check_index_from = self.get_pre_middle_index()

            if ColumnFeatures.ONE_CHAR_REST_WILDCARDS.value in nex_column_feats:
                reference_char = otherchar_nex
                reference_char_y_index = oc_nex_index
                nex_is_one_char = True
                check_index = self.get_pre_middle_index()
                check_index_from = self.get_nex_middle_index()
            if (pre_is_one_char is True and nex_is_one_char is False) \
                    or (pre_is_one_char is False and nex_is_one_char is True):

                other_column_feats, otherchar_other, oc_other_index = \
                    self.validate_column_features(
                        search_space, check_index, reference_char,
                        use_similar_chars)

                if ColumnFeatures.MOSTLY_REFERENCE_CHAR.value in other_column_feats:
                    processed_space, shifted_longtrans = self.shift_from_to(
                        search_space, reference_char_y_index,
                        check_index_from, check_index)

                    if shifted_longtrans is True:

                        processed_space_confs, shifted_confs_longtrans = \
                            self.shift_from_to(
                                search_space_confs, reference_char_y_index,
                                check_index_from, check_index, 0)
                        change_done = True

        if self._config.MSA_BEST_SEARCHSPACE_DROP_SINGLE_CH_NEAR_SC:
            #print("processed space")
            #print(processed_space[0])
            #print(processed_space[1])
            #print(processed_space[2])

            mid_column_feats2, otherchar_mid2, oc_mid_index2 = self.validate_column_features(
                processed_space, self.get_middle_index())

            pre_column_feats2, otherchar_pre2, oc_pre_index2 = \
                self.validate_column_features(
                    processed_space, self.get_pre_middle_index(),
                    reference_char=None)
            nex_column_feats2, otherchar_nex2, oc_nex_index2 = \
                self.validate_column_features(
                    processed_space, self.get_nex_middle_index(),
                    reference_char=None)
            if ColumnFeatures.MOSTLY_SAME_SPECIAL_CHAR.value in mid_column_feats2:

                if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in pre_column_feats2:
                    mid_char_at_oc_index = search_space[oc_pre_index2][
                        self.get_middle_index()]
                    if mid_char_at_oc_index == self.get_wildcard_char() or \
                        mid_char_at_oc_index == " ":

                        processed_space2, shiftedD3 = self.shift_from_to(
                            processed_space, oc_pre_index2,
                            self.get_pre_middle_index(),
                            self.get_middle_index(), self.get_wildcard_char())
                        processed_space_confs2, shiftedD3 = self.shift_from_to(
                            processed_space_confs, oc_pre_index2,
                            self.get_pre_middle_index(),
                            self.get_middle_index(), 0)

                        processed_space = processed_space2
                        processed_space_confs = processed_space_confs2

                if ColumnFeatures.ONE_SPECIALCHAR_REST_WHITESPACE_OR_WILDCARDS.value in nex_column_feats2:
                    mid_char_at_oc_index = search_space[oc_nex_index2][
                        self.get_middle_index()]
                    if mid_char_at_oc_index == self.get_wildcard_char() or \
                            mid_char_at_oc_index == " ":

                        processed_space2, shiftedD3 = self.shift_from_to(
                            processed_space, oc_nex_index2,
                            self.get_nex_middle_index(),
                            self.get_middle_index(), self.get_wildcard_char())
                        processed_space_confs2, shiftedD3 = self.shift_from_to(
                            processed_space_confs, oc_nex_index2,
                            self.get_nex_middle_index(),
                            self.get_middle_index(), 0)

                        processed_space = processed_space2
                        processed_space_confs = processed_space_confs2

        return processed_space, processed_space_confs, change_done
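
    # Usage sketch: the caller is expected to apply process_search_space
    # repeatedly until no further change is reported (the loop below is an
    # assumption about the calling convention, not original code):
    #
    #     change_done = True
    #     while change_done:
    #         space, confs, change_done = ssp.process_search_space(
    #             space, confs, use_similar_chars=True)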

    def output_as_scrollbar(self, search_space, active=False):
        if active is False:
            return
        sys.stdout.write(f"Scrollingbar {search_space[1]} \r")
        sys.stdout.flush()

# Example no. 14
        # fetch additional information for current file (if toggled in info)
        additional_info = add_info_handler.fetch_additional_information_simple(
            file)

        # fetch basic data for current file
        ocromore_data = dh.fetch_ocromore_data(file,
                                               additional_info=additional_info)
        output_analyzer.set_current_data(
            ocromore_data)  # prepare output analyzer

        cpr.print("Checking file:", ocromore_data['file_info'].path)

        # extract features from basic data
        ocromore_data = feature_extractor.extract_file_features(ocromore_data)
        # line segmentation
        ocromore_data = segment_classifier.classify_file_segments(
            ocromore_data)
        # segment parsing
        ocromore_data = segment_parser.parse_segments(ocromore_data)
        # output file synthesis
        segment_parser.write_result_to_output(True, ocromore_data)
        # todo
        # output analysis steps
        output_analyzer.log_segmentation_simple(
            ocromore_data)  # log the recognized segmentation
        output_analyzer.log_parsed_output(

# Example no. 15
class OCRcomparison:
    """
        Storage class for multiple Ocr_Sets
    """
    def __init__(self,
                 predictor=None,
                 vocabulary_checker=None,
                 first_config_init=False):
        self.ocr_sets = []
        self.line_height_information = []
        config_handler = ConfigurationHandler(first_init=first_config_init)
        self.config = config_handler.get_config()

        if 'ExceptionInitializing' in self.config:
            self.cpr = ConditionalPrint(False, False, False)
        else:
            self.cpr = ConditionalPrint(self.config.PRINT_MSA_HANDLER,
                                        self.config.PRINT_EXCEPTION_LEVEL,
                                        self.config.PRINT_WARNING_LEVEL)

        self.predictor = predictor
        self.vocabulary_checker = vocabulary_checker

    def load_predictor(self, predictor):
        self.predictor = predictor

    def add_set(self, set_to_add):
        self.ocr_sets.append(set_to_add)

    def add_line_information(self, line_height_information):
        self.line_height_information.append(line_height_information)

    def set_dataframe_wrapper(self, dataframe_wrapper):
        self._dataframe_wrapper = dataframe_wrapper

    def set_vocabulary_checker(self, vocabulary_checker):
        self.vocabulary_checker = vocabulary_checker

    def sort_set(self):
        """
        Sort the ocr_sets by y_mean values
        :return:
        """
        sorted_sets = sorted(self.ocr_sets, key=lambda my_set: my_set.y_mean)
        self.ocr_sets = sorted_sets
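
    # Sketch: after sort_set the sets iterate top-to-bottom on the page,
    # e.g. y_mean values [310, 120, 205] end up ordered [120, 205, 310];
    # sorted() is stable, so sets with equal y_mean keep their order.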

    def unspace_list(self, list_index_to_unspace, unspaced_list_index):
        """
        apply the unspacing algorithm to one of the lists, taking another
        (non-spaced) list as comparison
        :param list_index_to_unspace: index of the set to unspace
        :param unspaced_list_index: index of the non-spaced set
        :return:
        """

        for current_set in self.ocr_sets:
            current_set.unspace_lines(list_index_to_unspace,
                                      unspaced_list_index)

    def refspace_list(self, list_index_to_adapt, list_index_reference):

        for current_set in self.ocr_sets:
            current_set.refspace_lines(list_index_to_adapt,
                                       list_index_reference)

    def print_sets(self, diff_only=False):
        for current_set in self.ocr_sets:
            current_set.print_me(diff_only)

    def do_n_distance_keying(self, wordwise_keying=False):

        if wordwise_keying is False:
            # the keying is done per line - the standard mode without a database
            for current_set in self.ocr_sets:
                current_set.calculate_n_distance_keying()
        else:
            # the keying is done per word - possible for sets originating from the database
            for current_set in self.ocr_sets:
                current_set.calculate_n_distance_keying_wordwise()
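
    # Usage sketch ('comparison' is a hypothetical OCRcomparison instance):
    #
    #     comparison.do_n_distance_keying()                      # per line
    #     comparison.do_n_distance_keying(wordwise_keying=True)  # per word,
    #     # only meaningful for sets originating from the database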

    def do_msa_best(self):
        for current_set in self.ocr_sets:
            current_set.calculate_msa_best()

    def do_msa_best_with_ndist_pivot(self):
        self.do_n_distance_keying()

        for current_set in self.ocr_sets:
            current_set.calculate_msa_best(True)

    def do_msa_best_with_ndist_pivot_charconf(self):
        self.do_n_distance_keying()

        for current_set in self.ocr_sets:
            current_set.calculate_msa_best_charconf(True)

    def do_msa_best_new(self, use_ndist_pivot, use_longest_pivot,
                        use_charconfs, use_wordwise, use_searchspaces,
                        do_postcorrection):

        if use_ndist_pivot is True:
            self.do_n_distance_keying()

        for current_set in self.ocr_sets:
            current_set.calculate_msa_best_all(use_ndist_pivot,
                                               use_longest_pivot,
                                               use_charconfs, use_wordwise,
                                               use_searchspaces)

        if do_postcorrection is True:
            self.do_postcorrection(True)

        print("done")

    def print_n_distance_keying_results(self):
        self.cpr.print("N_DISTANCE_KEYING_RESULTS ")
        for current_set in self.ocr_sets:
            current_set.print_shortest_n_distance_line()

    def print_msa_best_results(self):
        self.cpr.print("MSA_BEST_RESULTS ")
        for current_set in self.ocr_sets:
            current_set.print_msa_best_line()

    def add_linebreaks(self, previous_line, current_line, previous_line_index,
                       sd_line_index, line_height_info):
        MODE = 'TAKE_CURRENT_LINE_DIST'

        if previous_line is None:
            return None
        if MODE == 'TAKE_CURRENT_LINE_DIST':
            MARGIN = 0  # tolerance margin
            current_lh_info = line_height_info[sd_line_index]
            (xp_start, yp_start, xp_stop, yp_stop) = previous_line.coordinates
            (xc_start, yc_start, xc_stop, yc_stop) = current_line.coordinates

            y_dist = yc_start - yp_stop

            if y_dist <= 0:
                return None

            line_distance = current_lh_info.get_line_distance()
            y_times = (y_dist + MARGIN) / line_distance
            y_times_absolute = TypeCasts.round_to_int(y_times)
            if y_times_absolute > 0:
                generated_text = Random.append_pad_values(
                    "", y_times_absolute, "\n")
                return generated_text
            else:
                return None

        self.cpr.print("Undefined case reached shouldn't happen")
        return None
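
    # Worked example with illustrative numbers, assuming round_to_int rounds
    # to the nearest integer: the previous line ends at y=120, the current
    # line starts at y=185, and the measured line distance is 30:
    #
    #     y_dist  = 185 - 120            # 65
    #     y_times = (65 + MARGIN) / 30   # ~2.17 -> rounds to 2
    #
    # so a string of two "\n" characters is returned and written before the
    # current line's text.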

    def save_n_distance_keying_results_to_file(self,
                                               filename,
                                               mode_add_linebreaks=False):
        file = open(filename, 'w+', encoding="utf-8")

        previous_sd_line = None
        previous_sd_line_index = None
        for current_set in self.ocr_sets:

            sd_text = current_set.get_shortest_n_distance_text()
            # add comparison from previous to actual line break here
            if mode_add_linebreaks:
                sd_line_index = current_set.get_shortest_n_distance_index()
                sd_line = current_set.get_shortest_n_distance_line()
                if sd_line is True or sd_line is False:
                    continue

                additional_breaks = \
                    self.add_linebreaks(previous_sd_line, sd_line, previous_sd_line_index, sd_line_index, self.line_height_information)

                if additional_breaks is not None:
                    file.write(additional_breaks)
                previous_sd_line = sd_line
                previous_sd_line_index = sd_line_index

            # do not print lines which are mostly recognized with no content at the moment
            if sd_text is not None and sd_text is not False:
                file.write(sd_text + "\n")

        file.close()

    def save_dataset_to_file(self,
                             filename,
                             set_index,
                             mode_add_linebreaks=False,
                             other_set=""):

        out_dir = os.path.dirname(filename)
        os.makedirs(out_dir, exist_ok=True)

        file = open(filename, 'w+', encoding="utf-8")

        previous_dataset_line = None
        previous_dataset_line_index = None

        for current_set in self.ocr_sets:
            if other_set == 'msa_best':
                dataset_text = current_set.get_msa_best_text()
            elif other_set == 'ndist_keying':
                dataset_text = current_set.get_shortest_n_distance_text()
            else:
                dataset_text = current_set.get_line_set_value_text(set_index)

            # add comparison from previous to actual line break here
            if mode_add_linebreaks:
                dataset_line = current_set.get_line_set_value_line(set_index)
                if dataset_line is True or dataset_line is False:
                    continue

                additional_breaks = \
                    self.add_linebreaks(previous_dataset_line, dataset_line, previous_dataset_line_index, set_index,
                                        self.line_height_information)

                if additional_breaks is not None:
                    file.write(additional_breaks)
                previous_dataset_line = dataset_line
                previous_dataset_line_index = set_index

            # do not print lines which are mostly recognized with no content at the moment
            if dataset_text is not None and dataset_text is not False:
                file.write(dataset_text + "\n")

        file.close()

    def save_dataset_to_hocr(self,
                             filename,
                             set_index,
                             mode_add_linebreaks=False,
                             other_set=""):
        # TODO: move the constant into the config and/or rework the image path generation
        filename = os.path.normpath(filename).replace("\\", "/")
        IMG_PATH = "/media/sf_ShareVB/many_years_firmprofiles/"
        IMG_FILETYPES = [".jpg"]
        imgdir = "None"
        if IMG_PATH != "":
            imgname = "_".join(
                filename.split("/")[-1].replace("msa_best",
                                                "").split("_")[:-1])
            if imgname[-3:] == "msa":
                imgname = imgname[:-3]
            imgfolder = filename.split("/")[-2]
            imgpath = IMG_PATH + "**/" + imgfolder + "/**/" + imgname + "*"
            imgdirs = list(
                chain.from_iterable(
                    glob.iglob(imgpath + filetype, recursive=True)
                    for filetype in IMG_FILETYPES))
            if imgdirs is not None and len(imgdirs) > 0:
                imgdir = imgdirs[0]

        filename += ".hocr"
        out_dir = os.path.dirname(filename)
        os.makedirs(out_dir, exist_ok=True)

        file = open(filename, 'w+', encoding="utf-8")
        wrote_header = False
        for lidx, current_set in enumerate(self.ocr_sets):
            if lidx == 0:
                for last_set in self.ocr_sets[lidx]._set_lines:
                    if last_set.data["line_x1"] != []:
                        file_cords = last_set.data
                        break
            if other_set == 'msa_best':
                dataset_text = current_set.get_msa_best_text()
                dataset_bbox = None
                name = ["msa", "combined"]
                for lines in current_set._set_lines:
                    ldata = lines.data
                    if dataset_bbox is None and ldata["line_x0"]:
                        dataset_bbox = [
                            min(ldata["line_x0"]),
                            min(ldata["line_y0"]),
                            max(ldata["line_x1"]),
                            max(ldata["line_y1"])
                        ]
                    elif ldata["line_x0"]:
                        if min(ldata["line_x0"]) < dataset_bbox[0]:
                            dataset_bbox[0] = min(ldata["line_x0"])
                        if min(ldata["line_y0"]) < dataset_bbox[1]:
                            dataset_bbox[1] = min(ldata["line_y0"])
                        if max(ldata["line_x1"]) > dataset_bbox[2]:
                            dataset_bbox[2] = max(ldata["line_x1"])
                        if max(ldata["line_y1"]) > dataset_bbox[3]:
                            dataset_bbox[3] = max(ldata["line_y1"])
            else:
                dataset_text = current_set.get_line_set_value_text(set_index)
                dataset_bbox = None
                ldata = current_set._set_lines[set_index].data
                name = current_set._set_lines[set_index].name
                if ldata["line_x0"]:
                    dataset_bbox = [
                        min(ldata["line_x0"]),
                        min(ldata["line_y0"]),
                        max(ldata["line_x1"]),
                        max(ldata["line_y1"])
                    ]

            # do not print lines which are mostly recognized with no content at the moment
            if dataset_text is not None and dataset_text is not False and dataset_bbox:
                if not wrote_header:
                    wrote_header = True
                    hocr_header = f'''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
        <head>
            <title>OCR Results</title>
            <meta http-equiv="content-type" content="text/html; charset=utf-8" />
            <meta name='AKF-OCR' content='{name[0]}-{name[1]}' />
            <meta name='ocr-capabilities' content='ocr_line ocrx_word'/>
        </head>
        <body>
            <div class='ocr_page' title='image {imgdir}; bbox 0 0 {int(file_cords["line_x1"][0])} {int(file_cords["line_y1"][0])}'>\n'''
                    file.write(hocr_header)
                dtext = self._write_line_infos(dataset_bbox, dataset_text,
                                               set_index, other_set, lidx,
                                               current_set)
                file.write(dtext)
            if lidx == len(self.ocr_sets) - 2:
                file.write("\t\t</div>\n\t</body>\n</html>")
                break
        file.close()

    def _write_line_infos(self, dataset_bbox, dataset_text, set_index,
                          other_set, lidx, current_set):
        dtext = f'''            <span class ='ocr_line' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' ><br/>\n'''
        if other_set == "msa_best":
            if current_set._text_seg is None:
                dtext += f'''                <span  class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{dataset_text}</span >\n'''
            else:
                for number, word in current_set._text_seg.items():
                    if number != -1.0:
                        set_index = 2
                        if number in current_set._set_lines[1].word["UID"].keys() and \
                                set(current_set._set_lines[1].word["text"][number]) != set("¦") and \
                                current_set._set_lines[1].data["word_x0"]:
                            set_index = 1
                        elif number in current_set._set_lines[0].word["UID"].keys() and \
                                set(current_set._set_lines[0].word["text"][number]) != set("¦") and \
                                current_set._set_lines[0].data["word_x0"]:
                            set_index = 0
                        dataset_bbox = self._get_wbbox_new(
                            dataset_bbox, number,
                            current_set._set_lines[set_index].data)
                    dtext += f'''                <span  class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{word}</span >\n'''
                    set_index = 0
        else:
            for number, word in current_set._set_lines[set_index].word[
                    "text"].items():
                dataset_bbox = self._get_wbbox(
                    dataset_bbox, number,
                    current_set._set_lines[set_index].word["UID"],
                    current_set._set_lines[set_index].data)
                dtext += f'''                <span  class ='ocrx_word' title='bbox {int(dataset_bbox[0])} {int(dataset_bbox[1])} {int(dataset_bbox[2])} {int(dataset_bbox[3])}' >{word}</span >\n'''
        dtext += f'''            </span>\n'''
        return dtext

    def _get_wbbox(self, bbox, number, nb_dict, data, avg=True):
        wbbox_pos = 0
        for nbkey in nb_dict:
            if nbkey != number:
                if wbbox_pos == 0:
                    wbbox_pos = -1
                wbbox_pos += len(nb_dict[nbkey])
            else:
                wbbox_pos += len(nb_dict[nbkey]) / 2
                break
        if wbbox_pos != 0:
            if number != 0.0:
                bbox[0] = data["word_x0"][int(wbbox_pos)]
            bbox[2] = data["word_x1"][int(wbbox_pos)]
        return bbox

    def _get_wbbox_new(self, bbox, number, data, avg=True):
        uid_arr = np.array(data["UID"])
        wc_arr = np.where(uid_arr == -1)
        nb_arr = np.array(data["word_match"])
        nb_arr = np.delete(nb_arr, wc_arr)
        nb_pos = np.where(nb_arr == number)
        if len(nb_pos[0]) == 0:
            # no position matches this word number; keep the bbox unchanged
            return bbox
        wbbox_pos = nb_pos[0][int(len(nb_pos[0]) / 2)]
        if wbbox_pos != 0:
            bbox[0] = data["word_x0"][int(nb_pos[0][0])]
            bbox[2] = data["word_x1"][int(nb_pos[0][-1])]
        return bbox
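
    # Sketch of the index arithmetic above with toy data (values assumed):
    #
    #     data["UID"]        = [5, -1, 6, 6]
    #     data["word_match"] = [0,  0, 1, 1]
    #
    # The wildcard position (UID == -1) is dropped, leaving word_match
    # [0, 1, 1]; for number == 1 the remaining positions are [1, 2], so the
    # bbox takes word_x0 at index 1 and word_x1 at index 2.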

    def export_text_lines(self):
        """
        Exports the lines of text of the result as list
        :return: list with lines
        """

        return_list = []
        for setindex, current_set in enumerate(self.ocr_sets):
            sd_line = current_set.get_shortest_n_distance_line()

            # do not list lines which are mostly recognized with no content at the moment
            if sd_line is not None and sd_line is not False:
                return_list.append(sd_line)

        return return_list

    def do_vocabulary_correction(self):
        store_last_entry = None
        for current_set in self.ocr_sets:
            msa_best_text = current_set.get_msa_best_text()
            msa_best_text_corrected = ""
            msa_best_ttokenized = msa_best_text.split()

            len_tokens = len(msa_best_ttokenized)
            for word_index, word in enumerate(msa_best_ttokenized):
                #if "Tee" in word:
                #    print("asd")

                if self.config.KEYING_RESULT_VC_IGNORE_SEPERATE_WRITING_CORRECTION:
                    if store_last_entry is not None:
                        # don't correct the first word of a follow-up line back
                        # to the separated word from the previous line
                        store_last_entry = None
                        msa_best_text_corrected += " " + word
                        continue

                    if len_tokens - 1 == word_index:
                        tdash = self.vocabulary_checker.word_trails_with_dash(
                            word)
                        if tdash:
                            store_last_entry = word
                            msa_best_text_corrected += " " + word
                            continue

                word_wo_sc, ratio = self.vocabulary_checker.without_special_chars(
                    word)
                if ratio == 0 or len(word_wo_sc) <= 2:
                    msa_best_text_corrected += " " + word
                    continue

                word_wb, bstart, btrail, changeb = self.vocabulary_checker.remove_and_give_borders(
                    word)
                if changeb:
                    word_correct_vc, suggestions, first_letter_high = self.vocabulary_checker.correct_text(
                        word_wb)
                    if word_correct_vc is None:
                        word_correct = word
                    else:
                        word_correct = bstart + word_correct_vc + btrail
                else:
                    word_correct, suggestions, first_letter_high = self.vocabulary_checker.correct_text(
                        word)

                if word_correct is None:
                    msa_best_text_corrected += " " + word
                else:
                    msa_best_text_corrected += " " + word_correct

            msa_best_text_corrected = msa_best_text_corrected.lstrip(" ")

            if self.config.KEYING_RESULT_VC_PRINTDIFF and msa_best_text_corrected != msa_best_text:
                print("vocab in :", msa_best_text)
                print("vocab out:", msa_best_text_corrected)

            current_set.set_msa_best_text(msa_best_text_corrected)
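
    # Sketch of the per-word flow above (the vocabulary_checker results are
    # assumed for illustration):
    #
    #     "Aktien-"         last token trailing with a dash: kept as-is and
    #                       remembered in store_last_entry
    #     "u."              stripped word is too short (<= 2 chars): kept as-is
    #     "(Qesellschaft)"  borders "(" / ")" removed, the core corrected via
    #                       correct_text, borders re-attached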

    def do_postcorrection(self,
                          postcorrect_keying=False,
                          postcorrect_ndist=False,
                          postcorrect_msa=False,
                          postcorrect_other=False,
                          postcorrection_index=0):
        """
        Do postcorrection steps for a specified list of sets or for the
        resulting lines of the n-distance keying
        :param postcorrect_keying: if True, the lines of the n-distance keying
               are postcorrected; otherwise the target is specified by
               postcorrection_index
        :param postcorrection_index: specifies the list of sets which is
               postcorrected if postcorrect_keying is False
        :return:
        """
        if postcorrect_keying is False:
            return

        for current_set in self.ocr_sets:
            if postcorrect_ndist:
                sd_line_text = current_set.get_shortest_n_distance_text()
                if sd_line_text is not None and sd_line_text is not True and sd_line_text is not False:
                    sd_line_text_corrected = TextCorrector.correct_line_text(
                        sd_line_text)
                    current_set.set_shortest_n_distance_text(
                        sd_line_text_corrected)
            if postcorrect_msa:
                msa_best_text = current_set.get_msa_best_text()
                if msa_best_text is not None and msa_best_text is not True and msa_best_text is not False:
                    msa_best_text_corrected = TextCorrector.correct_line_text(
                        msa_best_text)
                    current_set.set_msa_best_text(msa_best_text_corrected)