def replace_content(self, content, reference_markers):
        """
        Apply every reference marker to the given content.

        The markers are processed in ascending order of their start position.
        Each marker performs its own replacement through
        ``marker.replace_content(content, offset)`` and hands back the updated
        content together with the offset to pass on to the following marker.

        :param content: Text without markers
        :param reference_markers: Iterable of markers offering
            ``get_start_position()``, ``get_end_position()`` and
            ``replace_content(content, offset)``
        :return: The content after all markers have been applied
        :raises RefExError: If a marker touches or overlaps one of its direct
            neighbours (end position >= start position of the next one)
        """
        # Order by occurrence in text
        by_position = sorted(reference_markers,
                             key=lambda m: m.get_start_position())
        total = len(by_position)

        result = content
        offset = 0

        for idx, current in enumerate(by_position):
            # Guard: collision with the marker to the left
            if idx > 0 and (by_position[idx - 1].get_end_position() >=
                            current.get_start_position()):
                raise RefExError('Marker overlaps with previous marker: %s' %
                                 current)

            # Guard: collision with the marker to the right
            if idx + 1 < total and (by_position[idx + 1].get_start_position()
                                    <= current.get_end_position()):
                raise RefExError('Marker overlaps with next marker: %s' %
                                 current)

            # No collision, let the marker rewrite the content
            result, offset = current.replace_content(result, offset)

        return result
Пример #2
0
    def get_law_book_ref_regex(self,
                               law_book_codes,
                               optional=False,
                               group_name=False,
                               to_lower=False):
        """
        Build the law book part of a reference regex as an OR list.

        Example:
            - codes: ['ab', 'cd', 'de']
            - output: ab|cd|de

        :param law_book_codes: Book codes to join; must not be empty
        :param optional: Not supported, must be falsy
        :param group_name: Not supported, must be falsy
        :param to_lower: If truthy, every code is lower-cased before joining
        :return: The codes joined with ``|``
        :raises RefExError: If ``law_book_codes`` is empty
        :raises ValueError: If ``optional`` or ``group_name`` is truthy
        """
        book_count = len(law_book_codes)

        if book_count < 1:
            raise RefExError("Cannot generate regex, law_book_codes are empty")

        # Both flags exist in the signature only; reject them explicitly
        if optional:
            raise ValueError("optional=True not supported")
        if group_name:
            raise ValueError("group_name=True not supported")

        logger.debug("Law book ref with %i books" % len(law_book_codes))

        alternatives = []
        for code in law_book_codes:
            alternatives.append(code.lower() if to_lower else code)

        return "|".join(alternatives)
Пример #3
0
    def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
        """
        The main extraction method: find all law reference markers in content.

        If ``self.law_book_context`` is set, the work is delegated to
        ``extract_law_ref_markers_with_context``. Otherwise every match of the
        law reference regex becomes one ``RefMarker`` whose references are
        resolved by the single-ref or the multi-ref handler, depending on how
        the matched text begins.

        :param content: Plain-text or even HTML
        :return: List of reference markers (the content itself is not changed)
        :raises RefExError: If a match starts with neither "Art"/"Art."/"§"
            nor "§§" followed by whitespace
        """
        logger.debug("Extracting from: %s" % content)

        # Extraction with context available is done in another method
        if self.law_book_context is not None:
            return self.extract_law_ref_markers_with_context(content)

        found_markers = []
        ref_regex = self.get_law_ref_regex(self.get_law_book_codes())

        # Handle each match separately
        for hit in re.finditer(ref_regex, content):
            text = str(hit.group(0)).strip()

            # Single and multi refs are resolved by different handlers
            if re.match(r"^(Art(\.{,1})|§)\s", text):
                resolve = self.handle_single_law_ref
            elif re.match(r"^§§\s", text):
                resolve = self.handle_multiple_law_refs
            else:
                raise RefExError("Unsupported ref beginning: %s" % text)

            refs: List[Ref] = resolve(self.get_law_book_codes(), text, [])

            # TODO: the line is always passed as 0
            new_marker = RefMarker(
                text=text,
                start=hit.start(),
                end=hit.end(),
                line=0,
            )
            new_marker.set_uuid()
            new_marker.set_references(refs)

            found_markers.append(new_marker)

        return found_markers
Пример #4
0
    def handle_single_law_ref(self, law_book_codes, ref_str,
                              law_ids) -> List[Ref]:
        """
        Resolve a marker text that holds exactly one law reference.

        The text is matched with ``get_law_ref_match_single``. On a match, a
        ``Ref`` built from the groups ``book``, ``sect`` and the optional
        ``sect_az`` is appended to ``law_ids``. Without a match only a warning
        is logged.

        :param law_book_codes: Book codes passed on to the matcher
        :param ref_str: Marker text to resolve
        :param law_ids: List that receives the reference (modified in place)
        :return: The same ``law_ids`` list
        :raises RefExError: If the match has no ``book`` or no ``sect`` group
            value
        """
        logger.debug("Single ref found in: %s" % ref_str)

        match = self.get_law_ref_match_single(law_book_codes, ref_str)

        if match is None:
            logger.warning("Law ID could not be matched: %s" % ref_str)
            return law_ids

        # Book is mandatory and stored lower-cased
        raw_book = match.group("book")
        if raw_book is None:
            raise RefExError("Ref book is not set: %s " % ref_str)
        book = raw_book.lower()

        # Section is mandatory as well
        raw_sect = match.group("sect")
        if raw_sect is None:
            raise RefExError("Ref sect is not set")
        section = str(raw_sect)

        # Optional section addon is appended lower-cased
        addon = match.group("sect_az")
        if addon is not None:
            section += addon.lower()

        reference = Ref(ref_type=RefType.LAW, book=book, section=section)
        logger.debug("Law ID: %s" % reference)
        law_ids.append(reference)

        return law_ids
    def get_law_book_ref_regex(self,
                               law_book_codes,
                               optional=False,
                               group_name=False,
                               to_lower=False):
        r"""
        Returns regex for law book part in reference markers.

        The returned pattern is generic: it does not enumerate the given
        book codes. ``law_book_codes`` is only validated (must not be empty)
        and counted for the debug log; ``to_lower`` has no effect here.

        The pattern matches:
        - a capital letter (including Ä, Ü, Ö)
        - up to 20 further letters or hyphens
        - a closing V, G, O or B
        - optionally a whitespace plus roman numerals (e.g. SGB IX)

        TODO book codes should be ordered by reverse string length (SG|SGB X) -> (SGB X|SG), SGG
        - Solution: book code needs to be followed by (\s|\.|,|;|:|"|'|\)|])

        TODO Add refex for ending [A-Z][A-Za-z](V|G|O)
        - start with capital letter
        - end with V, G or O

        :param law_book_codes: Book codes; must not be empty
        :param optional: Not supported, must be falsy
        :param group_name: Not supported, must be falsy
        :param to_lower: Unused by this implementation
        :return: Regex string for a law book code
        :raises RefExError: If ``law_book_codes`` is empty
        :raises ValueError: If ``optional`` or ``group_name`` is truthy
        """
        if len(law_book_codes) < 1:
            raise RefExError('Cannot generate regex, law_book_codes are empty')

        if optional:
            raise ValueError('optional=True not supported')

        if group_name:
            raise ValueError('group_name=True not supported')

        logger.debug('Law book ref with %i books', len(law_book_codes))

        # Raw string: "\s" in a normal literal is an invalid escape sequence
        # (SyntaxWarning on recent Python versions). The value is unchanged.
        return r'([A-ZÄÜÖ][-ÄÜÖäüöA-Za-z]{,20})(V|G|O|B)(?:\s([XIV]{1,5}))?'
    def extract_law_ref_markers(self,
                                content: str,
                                is_html: bool = False) -> List[RefMarker]:
        """
        The main extraction method: find all law reference markers in content.

        Divide and Conquer
        - only simple regex
        - every accepted match is masked in the working copy of the content
          (``RefMarker.replace_content_with_mask``) to avoid multiple matches

        Processing order:
        1. Multi references (two section signs followed by several sections).
        2. Single references, tried with a list of patterns in fixed order.

        If ``self.law_book_context`` is set, the work is delegated to
        ``extract_law_ref_markers_with_context`` (``is_html`` is not passed
        on). As a side effect ``self.word_delimiter`` is overwritten with the
        delimiter regex that fits the content type.

        :param content: Plain-text or even HTML
        :param is_html: If true, the HTML entity of the section sign and
            additional HTML entities as word delimiters are used
        :return: List of reference markers
        :raises RefExError: If a single-ref pattern yields neither a book nor
            a ``next_book`` group value
        """

        if self.law_book_context is not None:
            # Extraction with context available is done in another method
            return self.extract_law_ref_markers_with_context(content)

        # Init
        markers = []

        # Replace special characters if working with html.
        # The quote alternatives are kept in a separate non-raw literal so
        # that the resulting string contains a bare apostrophe.
        if is_html:
            section_sign = '&#167;'
            self.word_delimiter = (
                r'\s|\.|,|;|:|!|\?|\(|\)|\[|\]'
                r'|&#8221;|\&#8216;|\&#8217;|&#60;|&#62;|&#38;'
                r'|&rdquo;|\&lsquo;|\&rsquo;|&lt;|&gt;|&amp;'
                '|"|\'|<|>|&')
        else:
            section_sign = '§'
            self.word_delimiter = (
                r'\s|\.|,|;|:|!|\?|\(|\)|\[|\]'
                '|"|\'|<|>|&')

        # Book code should be followed by a word separator, e.g. space.
        book_look_ahead = '(?=' + self.word_delimiter + ')'

        book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())
        book_regex = re.compile(book_pattern)

        # Any content that may stand between section and book
        any_content = r'([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'

        # --- Multi refs ---
        multi_pattern = (
            section_sign + section_sign +
            r' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s'
            + book_pattern + r')+\s(' + book_pattern + ')' + book_look_ahead)

        # Sections inside a multi marker:
        # - find for <separator §§|,|..> + <section>
        # - ignore Abs, Nr, ...
        # - corresponding book is the closest to right
        sect_before_range = r'([0-9]+)\s(?=bis|und)'
        sect_with_letter = r'([0-9]+)\s?[a-z]'
        sect_plain = r'([0-9]+)'
        section_regex = re.compile(
            '(?P<sep>' + section_sign + section_sign +
            r'|,|;|und|bis)\s?(?P<sect>(' + sect_before_range + '|' +
            sect_with_letter + '|' + sect_plain + '))')

        # finditer keeps scanning the string it was started with; masking
        # below only rebinds the local name for the later single-ref passes.
        for marker_match in re.finditer(multi_pattern, content):
            marker_text = marker_match.group(0)
            refs = []

            logger.debug('Multi Match with: %s', marker_text)

            # Books by position in text
            book_positions = {}
            for book_match in book_regex.finditer(marker_text):
                book_positions[book_match.start()] = book_match.group(0)

            # We cannot work without knowing the book
            if not book_positions:
                logger.error('No book found in marker text: %s', marker_text)
                continue

            ordered_positions = sorted(book_positions)

            for ref_match in section_regex.finditer(marker_text):
                # NOTE(review): for a section directly before "bis"/"und" the
                # group includes the trailing whitespace - confirm that
                # Ref.init_law normalises it.
                sect = ref_match.group('sect')

                logger.debug('Found ref: %s', ref_match.group())

                if len(book_positions) == 1:
                    book = book_positions[ordered_positions[0]]
                else:
                    book = None
                    pos = ref_match.start()

                    for book_pos in ordered_positions:
                        if book_pos > pos:
                            # Take the first book that is right to section position
                            book = book_positions[book_pos]
                            break

                if book is None:
                    logger.error('No book after reference found: %s - %s',
                                 ref_match.group(0), marker_text)
                    continue

                # Check for 'between' (range sections)
                if ref_match.group('sep') == 'bis' and refs:
                    from_sect = refs[-1].section  # last section

                    # Both sections should be integers (no a-z sections)
                    if sect.isdigit() and from_sect.isdigit():
                        for between_sect in range(
                                int(from_sect) + 1, int(sect)):
                            refs.append(
                                Ref.init_law(book=book,
                                             section=str(between_sect)))

                refs.append(Ref.init_law(book=book, section=sect))

            # Prepare marker
            marker = RefMarker(text=marker_text,
                               start=marker_match.start(),
                               end=marker_match.end())
            marker.set_uuid()
            marker.set_references(refs)

            # Check if actual references were found in marker text
            if refs:
                markers.append(marker)

                # Update content to avoid double matching
                content = marker.replace_content_with_mask(content)
            else:
                logger.warning('No references found in marker: %s ',
                               marker_text)

        # --- Single refs ---
        sect_pattern = r'(?P<sect>([0-9]+)(\s?[a-z]?))'
        patterns = [
            # § 3 BGB, § 3d BGB, § 83 d BGB
            section_sign + ' ' + sect_pattern + ' (?P<book>' + book_pattern +
            ')' + book_look_ahead,
            # Abs OR Nr
            # § 42 Abs. 1 Alt. 1 VwGO
            section_sign + ' ' + sect_pattern +
            ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' +
            book_look_ahead,
            # Section, arbitrary detail content, then the book
            section_sign + ' ' + sect_pattern + ' ' + any_content +
            ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
            # Section followed by "i.V.m."/"iVm": the book is expected later
            section_sign + ' ' + sect_pattern + ' ' + any_content +
            r' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
        ]

        markers_waiting_for_book = []  # type: List[RefMarker]

        for pattern in patterns:  # Iterate over all patterns
            # Each pattern scans the content as masked by the previous ones
            for marker_match in re.finditer(pattern, content):
                marker_text = marker_match.group(0)
                groups = marker_match.groupdict()

                if 'book' in groups:
                    book = Ref.clean_book(marker_match.group('book'))
                else:
                    book = None

                ref = Ref.init_law(section=marker_match.group('sect'),
                                   book=None)

                marker = RefMarker(text=marker_text,
                                   start=marker_match.start(),
                                   end=marker_match.end())
                marker.set_uuid()

                # Has this marker a book
                if book is not None:
                    ref.book = book

                    marker.set_references([ref])

                    # Update content to avoid double matching
                    content = marker.replace_content_with_mask(content)

                    markers.append(marker)

                    # Hand the book over to the markers that wait for one
                    for waiting in markers_waiting_for_book:
                        if len(waiting.references) == 1:
                            waiting.references[0].book = book

                            content = waiting.replace_content_with_mask(
                                content)

                            markers.append(waiting)
                    markers_waiting_for_book = []
                elif groups.get('next_book') is not None:
                    marker.set_references([ref])
                    markers_waiting_for_book.append(marker)
                else:
                    raise RefExError('next_book and book are None')

        if markers_waiting_for_book:
            logger.warning('Marker could not be assign to book: %s',
                           markers_waiting_for_book)

        # TODO Art GG

        return markers
    def handle_multiple_law_refs(self, law_book_codes, ref_str, law_ids) -> List[Ref]:
        """
        Resolve a marker text that holds several law references.

        Every match returned by ``get_law_ref_match_multi`` becomes one
        ``Ref``. A match whose ``delimiter`` group is ``bis`` additionally
        expands the sections strictly between the previously remembered
        section and its own section (digits only). Matches without a book get
        a placeholder that is afterwards replaced by the next book found to
        the right in the text.

        Note: the references are appended to ``law_ids`` in reverse order of
        occurrence, because the book is propagated from right to left.

        :param law_book_codes: Book codes passed on to the matcher
        :param ref_str: Marker text to resolve
        :param law_ids: List that receives the references (modified in place)
        :return: The same ``law_ids`` list
        :raises RefExError: If a match has no ``sect`` group value
        """
        # Search for multiple refs
        matches = self.get_law_ref_match_multi(law_book_codes, ref_str)

        refs_tmp = []
        prev_sect = None
        prev_book = None

        logger.debug('Multi refs found in: %s', ref_str)

        # Loop over all results
        for match in matches:

            # If book is not set, use __placeholder__ and replace later
            raw_book = match.group('book')
            book = raw_book.lower() if raw_book is not None else '__book__'

            # Section must exist
            raw_sect = match.group('sect')
            if raw_sect is None:
                raise RefExError('Ref sect is not set')
            sect = str(raw_sect)

            raw_addon = match.group('sect_az')
            if raw_addon is not None:
                sect += raw_addon.lower()

            ref = Ref(ref_type=RefType.LAW, book=book, section=sect)

            logger.debug('Ref found: %s (%s)', ref, match.group(0))

            # Check for section ranges. A range needs a start section: without
            # one (e.g. "bis" on the very first match) the ref is treated like
            # a regular one instead of failing on the missing start.
            if match.group('delimiter') == 'bis' and prev_sect is not None:
                logger.debug('Handle section range - Add ids from %s to %s',
                             prev_sect, sect)
                # TODO how to handle az sects
                prev_sect = re.sub('[^0-9]', '', prev_sect)
                range_end = re.sub('[^0-9]', '', sect)

                for between_sect in range(int(prev_sect) + 1, int(range_end)):
                    refs_tmp.append(Ref(ref_type=RefType.LAW, book=prev_book, section=str(between_sect)))
            else:
                prev_sect = sect
                prev_book = book

            refs_tmp.append(ref)

        logger.debug('All law ids found: %s', refs_tmp)

        # Walk from right to left so that a placeholder takes over the book
        # of the closest reference to its right.
        book = None
        for id_tmp in reversed(refs_tmp):
            if id_tmp.book != '__book__':
                book = id_tmp.book
            elif book is not None:
                id_tmp.book = book
            else:
                # No book to the right of this reference: it is dropped
                logger.warning('Cannot determine law book for %s in: %s',
                               id_tmp, ref_str)
                continue

            law_ids.append(id_tmp)

        return law_ids