def replace_content(self, content, reference_markers):
    """
    Insert all reference markers into the given content.

    Markers are processed in order of their start position. Before a marker
    is applied it is checked against its direct neighbours; an overlap with
    either neighbour aborts the whole operation.

    :param content: Text without markers
    :param reference_markers: Objects providing ``get_start_position()``,
        ``get_end_position()`` and ``replace_content(content, offset)``
    :return: Content with every marker applied
    :raises RefExError: If a marker overlaps with the previous or next marker
    """
    ordered = sorted(reference_markers, key=lambda m: m.get_start_position())
    last_index = len(ordered) - 1

    result = content
    offset = 0

    for idx, current in enumerate(ordered):
        # Neighbour to the left must end before this marker starts
        if idx > 0 and ordered[idx - 1].get_end_position() >= current.get_start_position():
            raise RefExError('Marker overlaps with previous marker: %s' % current)

        # Neighbour to the right must start after this marker ends
        if idx < last_index and ordered[idx + 1].get_start_position() <= current.get_end_position():
            raise RefExError('Marker overlaps with next marker: %s' % current)

        # No overlap: the marker rewrites the text and reports the new offset
        result, offset = current.replace_content(result, offset)

    return result
def get_law_book_ref_regex(self, law_book_codes, optional=False, group_name=False, to_lower=False):
    """
    Build the law book part of a reference regex as an OR list.

    Example:
        - codes: ['ab', 'cd', 'de']
        - output: ab|cd|de

    :param law_book_codes: Non-empty sequence of book codes
    :param optional: Must be falsy (not supported)
    :param group_name: Must be falsy (not supported)
    :param to_lower: Lower-case every code before joining
    :return: Codes joined with ``|``
    :raises RefExError: If ``law_book_codes`` is empty
    :raises ValueError: If ``optional`` or ``group_name`` is set
    """
    if len(law_book_codes) == 0:
        raise RefExError("Cannot generate regex, law_book_codes are empty")

    unsupported = (
        (optional, "optional=True not supported"),
        (group_name, "group_name=True not supported"),
    )
    for is_set, message in unsupported:
        if is_set:
            raise ValueError(message)

    logger.debug("Law book ref with %i books" % len(law_book_codes))

    if to_lower:
        alternatives = [code.lower() for code in law_book_codes]
    else:
        alternatives = list(law_book_codes)

    return "|".join(alternatives)
def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
    """
    Main extraction method: find law reference markers in the content.

    When ``self.law_book_context`` is set, the work is handed over to
    ``extract_law_ref_markers_with_context``. Otherwise every match of the
    law reference regex becomes one marker whose references are resolved by
    the single- or multi-reference handler, depending on how the matched
    text begins.

    :param content: Plain-text or even HTML
    :return: List of reference markers
    :raises RefExError: If a match starts with neither ``Art``/``§`` nor ``§§``
    """
    logger.debug("Extracting from: %s" % content)

    # Extraction with context available is done in another method
    if self.law_book_context is not None:
        return self.extract_law_ref_markers_with_context(content)

    found_markers = []
    ref_regex = self.get_law_ref_regex(self.get_law_book_codes())

    # Every regex match is handled on its own
    for match in re.finditer(ref_regex, content):
        text = str(match.group(0)).strip()

        # The beginning of the match decides between single and multi refs
        if re.match(r"^(Art(\.{,1})|§)\s", text):
            resolved = self.handle_single_law_ref(self.get_law_book_codes(), text, [])
        elif re.match(r"^§§\s", text):
            resolved = self.handle_multiple_law_refs(self.get_law_book_codes(), text, [])
        else:
            raise RefExError("Unsupported ref beginning: %s" % text)

        # TODO line number is not determined yet (always 0)
        new_marker = RefMarker(
            text=text,
            start=match.start(),
            end=match.end(),
            line=0,
        )
        new_marker.set_uuid()
        new_marker.set_references(resolved)
        found_markers.append(new_marker)

    return found_markers
def handle_single_law_ref(self, law_book_codes, ref_str, law_ids) -> List[Ref]:
    """
    Resolve a single law reference and append it to ``law_ids``.

    The reference string is matched with ``get_law_ref_match_single``. Only
    one result is possible; an unmatched string is logged and leaves
    ``law_ids`` unchanged.

    :param law_book_codes: Book codes passed on to the matcher
    :param ref_str: Text of the reference marker
    :param law_ids: List that receives the resolved reference (modified in place)
    :return: The same ``law_ids`` list
    :raises RefExError: If the match has no book or no section
    """
    logger.debug("Single ref found in: %s" % ref_str)

    match = self.get_law_ref_match_single(law_book_codes, ref_str)

    if match is None:
        logger.warning("Law ID could not be matched: %s" % ref_str)
        return law_ids

    # Book is mandatory and stored lower-cased
    matched_book = match.group("book")
    if matched_book is None:
        raise RefExError("Ref book is not set: %s " % ref_str)
    book = matched_book.lower()

    # Section is mandatory as well
    matched_sect = match.group("sect")
    if matched_sect is None:
        raise RefExError("Ref sect is not set")
    sect = str(matched_sect)

    # Optional section addon is appended lower-cased
    addon = match.group("sect_az")
    if addon is not None:
        sect += addon.lower()

    law_id = Ref(ref_type=RefType.LAW, book=book, section=sect)
    logger.debug("Law ID: %s" % law_id)
    law_ids.append(law_id)

    return law_ids
def get_law_book_ref_regex(self, law_book_codes, optional=False, group_name=False, to_lower=False):
    r"""
    Returns regex for law book part in reference markers.

    The returned pattern is generic and does not depend on the individual
    entries of ``law_book_codes`` (the list is only checked for being
    non-empty):

    - starts with a capital character
    - followed by up to 20 letters or hyphens
    - ends with V, G, O or B
    - optional whitespace + roman number (e.g. SGB IX)

    TODO book codes should be ordered by reverse string length (SG|SGB X) -> (SGB X|SG), SGG
        - Solution: book code needs to be followed by (\s|\.|,|;|:|"|'|\)|])

    TODO Add regex for ending [A-Z][A-Za-z](V|G|O)
        - start with capital letter
        - end with V, G or O

    :param law_book_codes: Known book codes; must not be empty
    :param optional: Must be falsy (not supported)
    :param group_name: Must be falsy (not supported)
    :param to_lower: Accepted for compatibility; not used by this implementation
    :return: Regex pattern string for the book part
    :raises RefExError: If ``law_book_codes`` is empty
    :raises ValueError: If ``optional`` or ``group_name`` is set
    """
    if not law_book_codes:
        raise RefExError('Cannot generate regex, law_book_codes are empty')

    if optional:
        raise ValueError('optional=True not supported')

    if group_name:
        raise ValueError('group_name=True not supported')

    logger.debug('Law book ref with %i books' % len(law_book_codes))

    # Raw string: "\s" is not a valid Python escape sequence, so a plain
    # literal triggers an invalid-escape warning on current interpreters.
    return r'([A-ZÄÜÖ][-ÄÜÖäüöA-Za-z]{,20})(V|G|O|B)(?:\s([XIV]{1,5}))?'
def extract_law_ref_markers(self, content: str, is_html: bool = False) -> List[RefMarker]:
    """
    The main extraction method. Takes input content and returns the list of
    extracted reference markers.

    Divide and Conquer
    - only simple regex
    - multi references (two section signs) are matched first, then single
      references
    - accepted matches are replaced with a mask in ``content`` (via
      ``replace_content_with_mask``) to avoid multiple matches

    If ``self.law_book_context`` is set, extraction is delegated to
    ``extract_law_ref_markers_with_context``.

    Side effect: ``self.word_delimiter`` is overwritten on every call.

    :param content: Plain-text or even HTML
    :param is_html: Selects the word delimiter alternation used for HTML input
    :return: List of reference markers
    :raises RefExError: If a single-ref match provides neither a book nor a
        ``next_book`` group
    """
    if self.law_book_context is not None:
        # Extraction with context available is done in another method
        return self.extract_law_ref_markers_with_context(content)

    markers = []

    # NOTE(review): both branches assign the same section sign and the HTML
    # delimiter list repeats several alternatives - presumably HTML entities
    # were decoded somewhere along the way; verify against the intended input.
    if is_html:
        section_sign = '§'
        self.word_delimiter = r'''\s|\.|,|;|:|!|\?|\(|\)|\[|\]|”|\‘|\’|<|>|&|”|\‘|\’|<|>|&|"|'|<|>|&'''
    else:
        section_sign = '§'
        self.word_delimiter = r'''\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|'|<|>|&'''

    # Book code should be followed by a word separator, e.g. space.
    book_look_ahead = '(?=' + self.word_delimiter + ')'

    book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())

    # Any content that may stand between a section and its book
    any_content = r'([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'

    # ------------------------------------------------------------------
    # Multi refs
    # ------------------------------------------------------------------
    multi_pattern = re.compile(
        section_sign + section_sign
        + r' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s'
        + book_pattern + r')+\s(' + book_pattern + ')' + book_look_ahead
    )

    # Sections inside a multi marker:
    # - find <separator §§|,|..> + <section>
    # - ignore Abs, Nr, ...
    # The pattern does not depend on the marker, so it is compiled once.
    sect_before_word = r'([0-9]+)\s(?=bis|und)'
    sect_with_letter = r'([0-9]+)\s?[a-z]'
    sect_plain = r'([0-9]+)'
    multi_sect_pattern = re.compile(
        '(?P<sep>' + section_sign + section_sign + r'|,|;|und|bis)\s?(?P<sect>('
        + sect_before_word + '|' + sect_with_letter + '|' + sect_plain + '))'
    )

    # finditer works on the string object passed here; rebinding ``content``
    # inside the loop does not affect the running iteration.
    for marker_match in multi_pattern.finditer(content):
        marker_text = marker_match.group(0)
        refs = []

        logger.debug('Multi Match with: %s' % marker_text)

        # Books by start position in the marker text. finditer yields matches
        # from left to right and dicts keep insertion order, so iterating the
        # dict visits the books in text order.
        book_positions = {
            book_match.start(): book_match.group(0)
            for book_match in re.finditer(book_pattern, marker_text)
        }

        # We cannot work without knowing the book
        if not book_positions:
            logger.error('No book found in marker text: %s' % marker_text)
            continue

        for ref_match in multi_sect_pattern.finditer(marker_text):
            sect = ref_match.group('sect')

            logger.debug('Found ref: %s' % ref_match.group())

            if len(book_positions) == 1:
                book = next(iter(book_positions.values()))
            else:
                # The corresponding book is the first one to the right of the section
                pos = ref_match.start()
                book = next(
                    (name for start, name in book_positions.items() if start > pos),
                    None,
                )

                if book is None:
                    logger.error('No book after reference found: %s - %s' % (ref_match.group(0), marker_text))
                    continue

            # Check for 'between' (range sections)
            if ref_match.group('sep') == 'bis' and refs:
                from_sect = refs[-1].section  # last section

                # NOTE(review): the first section alternative captures the
                # whitespace in front of "bis"/"und" as part of ``sect``.
                # Unless Ref.init_law normalises the section, isdigit() is
                # False for such a start section and the range is not
                # expanded - TODO confirm.
                # Both sections should be integers (no a-z sections)
                if sect.isdigit() and from_sect.isdigit():
                    for between_sect in range(int(from_sect) + 1, int(sect)):
                        refs.append(Ref.init_law(book=book, section=str(between_sect)))

            refs.append(Ref.init_law(book=book, section=sect))

        # Prepare marker
        marker = RefMarker(text=marker_text, start=marker_match.start(), end=marker_match.end())
        marker.set_uuid()
        marker.set_references(refs)

        # Check if actual references were found in marker text
        if refs:
            markers.append(marker)

            # Update content to avoid double matching
            content = marker.replace_content_with_mask(content)
        else:
            logger.warning('No references found in marker: %s ' % marker_text)

    # ------------------------------------------------------------------
    # Single refs
    # ------------------------------------------------------------------
    sect_pattern = r'(?P<sect>([0-9]+)(\s?[a-z]?))'

    patterns = [
        # § 3 BGB, § 3d BGB, § 83 d BGB
        section_sign + ' ' + sect_pattern + ' (?P<book>' + book_pattern + ')' + book_look_ahead,
        # Abs OR Nr
        # § 42 Abs. 1 Alt. 1 VwGO
        section_sign + ' ' + sect_pattern + ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' + book_look_ahead,
        section_sign + ' ' + sect_pattern + ' ' + any_content + ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
        section_sign + ' ' + sect_pattern + ' ' + any_content + r' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
    ]

    # NOTE(review): only the last pattern yields markers without a book and
    # only the earlier patterns yield a book, so with this pattern order the
    # waiting markers appear never to be resolved - confirm intent.
    markers_waiting_for_book: List[RefMarker] = []

    for pattern in patterns:
        # Each pattern runs on the content as masked by the previous ones
        for marker_match in re.finditer(pattern, content):
            marker_text = marker_match.group(0)
            groups = marker_match.groupdict()

            if 'book' in groups:
                book = Ref.clean_book(marker_match.group('book'))
            else:
                book = None

            ref = Ref.init_law(section=marker_match.group('sect'), book=None)

            marker = RefMarker(text=marker_text, start=marker_match.start(), end=marker_match.end())
            marker.set_uuid()

            # Has this marker a book
            if book is not None:
                ref.book = book

                marker.set_references([ref])

                # Update content to avoid double matching
                content = marker.replace_content_with_mask(content)

                markers.append(marker)

                # Markers waiting for a book take the one just found
                for waiting in markers_waiting_for_book:
                    if len(waiting.references) == 1:
                        waiting.references[0].book = book

                        content = waiting.replace_content_with_mask(content)

                        markers.append(waiting)
                markers_waiting_for_book = []
            elif groups.get('next_book') is not None:
                # groupdict lookup: patterns without a next_book group must
                # end in the RefExError below, not in an IndexError
                marker.set_references([ref])
                markers_waiting_for_book.append(marker)
            else:
                raise RefExError('next_book and book are None')

    if markers_waiting_for_book:
        logger.warning('Marker could not be assign to book: %s' % markers_waiting_for_book)

    # TODO Art GG

    return markers
def handle_multiple_law_refs(self, law_book_codes, ref_str, law_ids) -> List[Ref]:
    """
    Resolve a multi reference marker and append its references to ``law_ids``.

    Every match returned by ``get_law_ref_match_multi`` becomes one reference.
    A match without a book gets a placeholder that is filled afterwards with
    the book of the next reference to its right. A ``bis`` delimiter adds the
    sections lying between the previous section and the current one.

    References are appended in reverse order of collection; references whose
    book cannot be determined are skipped.

    :param law_book_codes: Book codes passed on to the matcher
    :param ref_str: Text of the reference marker
    :param law_ids: List that receives the references (modified in place)
    :return: The same ``law_ids`` list
    :raises RefExError: If a match has no section
    """
    found = self.get_law_ref_match_multi(law_book_codes, ref_str)

    collected = []
    last_sect = None
    last_book = None

    logger.debug('Multi refs found in: %s' % ref_str)

    for m in found:
        # A missing book is marked with a placeholder and replaced later
        matched_book = m.group('book')
        book = '__book__' if matched_book is None else matched_book.lower()

        # Section must exist
        matched_sect = m.group('sect')
        if matched_sect is None:
            raise RefExError('Ref sect is not set')
        sect = str(matched_sect)

        addon = m.group('sect_az')
        if addon is not None:
            sect += addon.lower()

        ref = Ref(ref_type=RefType.LAW, book=book, section=sect)

        logger.debug('Ref found: %s (%s)' % (ref, m.group(0)))

        if m.group('delimiter') != 'bis':
            # Remember where a following range would start
            last_sect = sect
            last_book = book
        else:
            # Section range: add everything between the two bounds
            logger.debug('Handle section range - Add ids from ' + last_sect + ' to ' + sect)

            # TODO how to handle az sects
            last_sect = re.sub('[^0-9]', '', last_sect)
            sect = re.sub('[^0-9]', '', sect)

            first_between = int(last_sect) + 1
            for between_sect in range(first_between, int(sect)):
                collected.append(Ref(ref_type=RefType.LAW, book=last_book, section=str(between_sect)))

        collected.append(ref)

    logger.debug('All law ids found: %s' % collected)

    # Walk from right to left so that a placeholder sees the book that follows it
    known_book = None
    for candidate in reversed(collected):
        if candidate.book != '__book__':
            known_book = candidate.book
        elif known_book is not None:
            candidate.book = known_book
        else:
            # Book cannot be determined: drop the reference
            continue

        law_ids.append(candidate)

    return law_ids