def bulk_index(self, doc, doc_type=None, doc_index=None, doc_id=None):
    """Queue a document for ES bulk indexing and flush when the batch is full.

    :param doc: Django model instance or plain dict to be indexed
    :param doc_type: ES document type (falls back to self.es_type)
    :param doc_index: ES index name (falls back to self.es_index)
    :param doc_id: optional explicit ES document id
    :raises ProcessingError: if no document type is available or the
        document cannot be serialized to JSON
    """
    if doc_type is None:
        if self.es_type is None:
            raise ProcessingError('No ES document type defined')
        doc_type = self.es_type

    if doc_index is None:
        doc_index = self.es_index

    # Bulk API action line: one JSON object describing the operation
    action = {'index': {'_index': doc_index, '_type': doc_type}}
    if doc_id is not None:
        action['index']['_id'] = doc_id
    self.bulk_data += json.dumps(action) + '\n'

    if isinstance(doc, Model):
        # Convert Django model to a plain dict first
        logger.debug('Convert Django model: %s' % doc)

        # Model may opt out of some fields via es_fields_exclude
        if hasattr(doc, 'es_fields_exclude'):
            doc_dict = model_to_dict(doc, exclude=doc.es_fields_exclude)
        else:
            doc_dict = model_to_dict(doc)

        # Model may define a pre_index hook that mutates the dict in place
        pre_index = getattr(doc, "pre_index", None)
        if callable(pre_index):
            pre_index(doc_dict)

        doc_str = json.dumps(doc_dict)
    elif isinstance(doc, dict):
        doc_str = json.dumps(doc)
    else:
        raise ProcessingError('Cannot transform to JSON: %s' % doc)

    self.bulk_data += doc_str + '\n'
    self.bulk_size += 1

    # Flush once the configured batch size is reached
    if self.bulk_size >= self.items_per_batch:
        self.send_data_to_es()
def get_input(self) -> list:
    """Collect input items via the configured selector.

    Applies ``input_start`` as a starting offset and ``input_limit`` as an
    upper bound (a limit of 0 or less means unlimited).

    :raises ProcessingError: if no selector is set or it matches nothing
    :return: list of input content items
    """
    selector = self.input_selector
    if selector is None:
        raise ProcessingError('input_selector is not set')

    items = self.get_input_content_from_selector(selector)
    items = items[self.input_start:]

    if not items:
        raise ProcessingError('Input selector is empty: %s' % selector)

    return items[:self.input_limit] if self.input_limit > 0 else items
def handle_single_law_ref(self, ref_str, law_ids):
    """Resolve a single law reference (e.g. '§ 123 VwGO') to a law id dict.

    :param ref_str: raw reference string found in the text
    :param law_ids: list the resolved law id is appended to
    :raises ProcessingError: if the match lacks a book or section group
    :return: law_ids with exactly one entry appended
    """
    logger.debug('Single ref found in: %s' % ref_str)

    match = self.get_law_ref_match_single(ref_str)

    if match is None:
        # Keep a placeholder entry so failed matches remain visible downstream
        law_ids.append({
            'book': 'not matched',
            'sect': 'NOT MATCHED (single) %s ' % ref_str
        })
        logger.warning('Law ID could not be matched.')
        return law_ids

    if match.group('book') is None:
        raise ProcessingError('Ref book is not set: %s ' % ref_str)
    book = match.group('book').lower()

    if match.group('sect') is None:
        raise ProcessingError('Ref sect is not set')
    sect = str(match.group('sect'))

    # Optional alphabetic section suffix (e.g. '80a')
    if match.group('sect_az') is not None:
        sect += match.group('sect_az').lower()

    law_id = {'book': book, 'sect': sect, 'type': 'law'}
    logger.debug('Law ID: %s' % law_id)
    law_ids.append(law_id)

    return law_ids
def set_es_url(self, url):
    """Parse ES url and store scheme, host, port, index and base url on self.

    Fixes over the naive split: a trailing slash in the path no longer breaks
    index extraction, and URLs without an explicit port no longer crash the
    '%i' formatting (scheme default ports are substituted instead).

    :param url: full ES url, e.g. 'http://localhost:9200/my_index'
    :raises ProcessingError: if no index name can be extracted from the path
    """
    o = urlparse(url)
    self.es_scheme = o.scheme
    self.es_host = o.hostname
    self.es_port = o.port

    # rstrip('/') tolerates a trailing slash ('/index/' -> '/index')
    p = o.path.rstrip('/').split('/')

    if len(p) == 2:
        self.es_index = p[1]
    else:
        raise ProcessingError('Cannot extract index from ES url: %s' % url)

    if self.es_port is None:
        # urlparse returns None when the port is omitted; fall back to the
        # scheme default so the '%i' formatting below cannot raise TypeError
        self.es_port = 443 if self.es_scheme == 'https' else 80

    self.es_url = '%s://%s:%i' % (self.es_scheme, self.es_host, self.es_port)
def handle_input(self, input_content: str) -> None:
    """Parses law XML and creates book and law instances (append to processed_content)

    Skips files whose line count falls outside [min_lines, max_lines].
    The first <norm> node provides the book metadata; every node becomes
    a Law instance that is queued on self.pre_processed_content.

    :param input_content: File path to law XML file
    :raises ProcessingError: if input_content is not an existing file
    :return:
    """
    logger.debug('Reading from %s' % input_content)

    # File exist?
    if not os.path.isfile(input_content):
        raise ProcessingError('Is not file: %s' % input_content)

    # Count lines; use a context manager so the file handle is not leaked
    with open(input_content) as f:
        num_lines = sum(1 for line in f)

    # Skip if lines count is invalid
    if (self.min_lines is not None and self.min_lines >= 0 and num_lines < self.min_lines) \
            or (self.max_lines is not None and 0 <= self.max_lines < num_lines):
        logger.info('Skip - File has invalid line count (%i): %s' % (num_lines, input_content))
        return

    # Parse XML tree
    tree = etree.parse(input_content)
    sort = 0
    docs = []

    # Prepare docs
    for idx, n in enumerate(tree.xpath('norm')):
        # Extract law content (with html tags)
        content = self.get_node_content(n, 'textdaten/text/Content/*')

        if idx == 0:
            # Save book with the first element
            book = self.handle_law_book(n)

        # Append section to book object if section title is found
        section_title = (
            n.xpath('metadaten/gliederungseinheit/gliederungstitel/text()')
            or [None])[0]
        if section_title is not None:
            book.add_section(from_order=sort, title=section_title.strip())

        # Create law object
        doc = Law(
            doknr=n.get('doknr'),
            enbez=(n.xpath('metadaten/enbez/text()') or [None])[0],
            amtabk=(n.xpath('metadaten/amtabk/text()') or [None])[0],
            kurzue=(n.xpath('metadaten/kurzue/text()') or [None])[0],
            title=(n.xpath('metadaten/titel/text()') or [''])[0].strip(),
            order=sort,  # use in frontend for sorting
            content=content,
            footnotes=json.dumps(
                self.get_node_content(n, 'textdaten/fussnoten/Content/*')),
            book=book,
        )

        # Perform processing steps
        # for processor in self.processing_steps:  # type: LawProcessingStep
        #     doc = processor.process(doc)

        # TODO is Verordnung? is Gesetz? strip <pre>?

        # slug (unique); 'ss-' prefix is how slugify renders the section symbol
        slug = slugify(doc.enbez or '')
        if slug[:3] == 'ss-':  # Is section-symbol
            slug = slug[3:]
        doc.slug = slug

        if slug != '':
            logger.debug('Pre-processed: %s' % doc)
            docs.append(doc)
            sort += 1
        else:
            logger.warning('Ignore invalid document (no slug): %s' % doc)

    # Append to queue
    self.pre_processed_content.extend(docs)
def handle_law_book(self, node) -> LawBook:
    """Build and save a LawBook from a law XML <norm> node's metadata.

    Extracts code (jurabk), title, changelog entries and the revision date
    (from the 'Stand' changelog entry) and persists the book.

    :param node: lxml element of a <norm> node
    :raises ProcessingError: if the book cannot be saved
    :return: saved LawBook instance
    """
    jurabk = (node.xpath('metadaten/jurabk/text()') or [None])[0]
    revision_date_str = None
    revision_date = None
    changelog = []

    changelog_comments = node.xpath(
        'metadaten/standangabe/standkommentar/text()')
    changelog_types = node.xpath('metadaten/standangabe/standtyp/text()')

    # zip() pairs each comment with its type and cannot raise IndexError
    # when the two node lists differ in length (the previous
    # enumerate + changelog_types[key] lookup could).
    for changelog_type, comment in zip(changelog_types, changelog_comments):
        changelog.append({'type': changelog_type, 'text': comment})

        if changelog_type == 'Stand':
            revision_date_str = comment

    if revision_date_str is not None:
        match = re.search(
            r'(?P<day>[0-9]{1,2})\.(?P<month>[0-9]{1,2})\.(?P<year>[0-9]{4})',
            revision_date_str)
        if match:
            # Revision date as string (datetime.date is not JSON serializable)
            revision_date = datetime.date(int(match.group('year')),
                                          int(match.group('month')),
                                          int(match.group('day')))
            revision_date = revision_date.strftime('%Y-%m-%d')

    book_title_match = node.xpath('metadaten/langue/text()')
    if book_title_match:
        book_title = book_title_match[0].replace(
            '\n', ' ')  # replace line breaks
    else:
        book_title = None

    book = LawBook(
        title=book_title,
        code=jurabk,
        slug=slugify(jurabk),
        footnotes=json.dumps(
            self.get_node_content(
                node, 'textdaten/fussnoten/Content/*')),  # On book level?
        changelog=json.dumps(changelog)
    )

    if revision_date is not None:
        # TODO raise error if no revision date is provided?
        book.revision_date = revision_date

    try:
        book.save()
    except DatabaseError as e:
        # TODO set latest depending on revision - check first if other books exist?
        raise ProcessingError('Cannot save book: %s' % e)

    return book
def handle_multiple_law_refs(self, ref_str, law_ids):
    """Resolve a multi-reference string (e.g. '§§ 708 Nr. 11, 711 ZPO').

    Matches each partial reference, expands 'bis' (from-to) section
    ranges, and back-fills missing book names: the book is written after
    the sections, so the collected ids are processed in reverse and a
    '__book__' placeholder is replaced by the nearest following book.

    :param ref_str: raw reference string starting with '§§'
    :param law_ids: list to append the resolved id dicts to
    :raises ProcessingError: if a section group is missing or no book
        can be determined for a placeholder entry
    :return: law_ids with the resolved entries appended
    """
    # Search for multiple refs
    mms = self.get_law_ref_match_multi(ref_str)
    ids_tmp = []
    prev_sect = None
    prev_book = None

    logger.debug('Multi refs found in: %s' % ref_str)

    # Loop over all results
    for m in mms:
        # If book is not set, use __placeholder__ and replace later
        if m.group('book') is not None:
            book = m.group('book').lower()
        else:
            book = '__book__'

        # Section must exist
        if m.group('sect') is not None:
            sect = str(m.group('sect'))
        else:
            raise ProcessingError('Ref sect is not set')

        # Optional alphabetic section suffix (e.g. '80a')
        if m.group('sect_az') is not None:
            sect += m.group('sect_az').lower()

        law_id = {'book': book, 'sect': sect, 'type': 'law'}

        logger.debug('Law ID found: %s' % law_id)

        # Check for section ranges
        if m.group('delimiter') == 'bis':
            # NOTE(review): assumes a 'bis' match is always preceded by a
            # non-range match that set prev_sect/prev_book — confirm the
            # multi-ref regex guarantees this, otherwise prev_sect is None
            # and the concatenation below raises TypeError.
            logger.debug('Handle section range - Add ids from ' + prev_sect + ' to ' + sect)

            # TODO how to handle az sects
            prev_sect = re.sub('[^0-9]', '', prev_sect)
            sect = re.sub('[^0-9]', '', sect)

            # NOTE(review): in-between ids get an int 'sect' while all other
            # entries use str — verify downstream consumers handle both.
            for between_sect in range(int(prev_sect) + 1, int(sect)):
                # print(between_sect)
                ids_tmp.append({
                    'book': prev_book,
                    'sect': between_sect,
                    'type': 'law'
                })
        else:
            prev_sect = sect
            prev_book = book

        ids_tmp.append(law_id)

    # law_ids.append('multi = ' + ref_str)

    # handle __book__: walk backwards so each placeholder inherits the
    # book of the nearest reference that follows it in the original text
    logger.debug('All law ids found: %s' % ids_tmp)
    ids_tmp.reverse()
    book = None

    for id_tmp in ids_tmp:
        if id_tmp['book'] != '__book__':
            book = id_tmp['book']
        elif book is not None:
            id_tmp['book'] = book
        else:
            raise ProcessingError(
                'Cannot determine law book (Should never happen): %s' % ref_str)

        law_ids.append(id_tmp)

    return law_ids
def extract_law_refs(self, referenced_by: Case, content: str, key: int = 0):
    """Find law references in content and replace them with reference markers.

    Single refs ('§ 3d AsylG', 'Art. ...') and multi refs
    ('§§ 708 Nr. 11, 711 ZPO') are dispatched to their dedicated handlers.

    Example references this handles:
    § 3d AsylG | § 123 VwGO | §§ 3, 3b AsylG
    § 77 Abs. 1 Satz 1, 1. Halbsatz AsylG
    § 3 Abs. 1 Nr. 1 i.V.m. § 3b AsylG
    § 167 VwGO i.V.m. §§ 708 Nr. 11, 711 ZPO
    §§ 52 Abs. 1; 53 Abs. 2 Nr. 1; 63 Abs. 2 GKG
    §§ 80 a Abs. 3, 80 Abs. 5 VwGO

    TODO all law-book jurabk

    :param referenced_by: case the markers belong to
    :param content: text to scan and rewrite
    :param key: marker numbering offset
    :link https://www.easy-coding.de/Thread/5536-RegExp-f%C3%BCr-Gesetze/
    :return: (rewritten content, list of CaseReferenceMarker)
    """
    logger.debug('Extracting law references')

    markers = []
    matches = list(re.finditer(self.get_law_ref_regex(), content))
    marker_offset = 0

    logger.debug('Current content value: %s' % content)
    logger.debug('Law refs found: %i' % len(matches))

    for match in matches:
        ref_text = str(match.group(0)).strip()

        # Single refs begin with 'Art.' or one '§'; multi refs with '§§'
        if re.match(r'^(Art\.|§)\s', ref_text):
            ids = self.handle_single_law_ref(ref_text, [])
        elif re.match(r'^§§\s', ref_text):
            ids = self.handle_multiple_law_refs(ref_text, [])
        else:
            raise ProcessingError('Unsupported ref beginning: %s' % ref_text)

        marker = CaseReferenceMarker(referenced_by=referenced_by,
                                     text=ref_text,
                                     start=match.start(),
                                     end=match.end(),
                                     line=0)  # TODO line number
        marker.set_uuid()
        marker.set_references(ids)
        markers.append(marker)

        # Replacing shifts later positions; the handler tracks the offset
        content, marker_offset = marker.replace_content(
            content, marker_offset, key + len(markers))

    return content, markers
def find_court(query) -> Court:
    """Resolve a court from a query dict with 'name' and/or 'code'.

    Lookup strategy, in order:
    1. exact code match (EuGH, ...)
    2. exact name match (only when the name contains no whitespace)
    3. court type + state (state names tried with suffix variations)
    4. court type + city

    Example court names:
    - Oberverwaltungsgericht für das Land Schleswig-Holstein
    - VG Magdeburg
    - {"name": "OVG L\u00fcneburg 5. Senat"}

    :param query: Dict(name, code)
    :raises ProcessingError: if the name field is missing or the court type
        cannot be determined
    :raises Court.DoesNotExist: if no court could be resolved
    :return: matching Court instance
    """
    if 'code' in query:
        # Find based on code (EuGH, ...)
        try:
            return Court.objects.get(code=query['code'])
        except Court.DoesNotExist:
            pass

    if 'name' not in query:
        raise ProcessingError('Field name not in query')

    if ' ' not in query['name']:
        # Find based on name if name does not contain whitespaces
        try:
            return Court.objects.get(name=query['name'])
        except Court.DoesNotExist:
            pass

    # Determine type
    court_type = Court.extract_type_code_from_name(query['name'])

    if court_type is None:
        raise ProcessingError('Court type not found')

    location_levels = CourtTypes().get_type(court_type)['levels']

    # Look for states
    if CourtLocationLevel.STATE in location_levels:
        state_id_mapping = {}
        for r in State.objects.values_list('id', 'name'):
            if r[1] != '':
                state_id_mapping[r[1]] = r[0]

                # Add variations, e.g. Hamburg_er, Holstein_isches
                for v in ['es', 'er', 'isches']:
                    state_id_mapping[r[1] + v] = r[0]

        state_id = find_from_mapping(query['name'], state_id_mapping)

        if state_id is not None:
            try:
                logger.debug('Look for state=%i, type=%s' % (state_id, court_type))
                return Court.objects.get(state_id=state_id,
                                         court_type=court_type)
            except Court.DoesNotExist:
                pass

    # Look for cities
    if CourtLocationLevel.CITY in location_levels:
        city_id_mapping = {}
        for r in City.objects.values_list('id', 'name'):
            if r[1] != '':
                city_id_mapping[r[1]] = r[0]

        city_id = find_from_mapping(query['name'], city_id_mapping)

        if city_id is not None:
            try:
                logger.debug('Look for city=%i, type=%s' % (city_id, court_type))
                return Court.objects.get(city_id=city_id,
                                         court_type=court_type)
            except Court.DoesNotExist:
                pass

    # Nothing found. (A previous unreachable `return instance` after this
    # raise referenced an undefined name and was removed.)
    raise Court.DoesNotExist