def get_available_processing_steps(self) -> dict:
    """Load and cache processing-step instances named in settings.

    Imports each package listed under ``settings.PROCESSING_STEPS`` for the
    current model, instantiates its ``ProcessingStep`` class and stores it
    keyed by the last component of the package path.

    :return: dict of step name -> step instance
    :raises ProcessingError: if a package lacks a valid ``ProcessingStep``
    :raises ValueError: if the model has no entry in settings
    """
    if self.available_processing_steps is None:
        # Initialize the cache first (matches original partial-state behavior)
        self.available_processing_steps = {}

        model_name = self.model.__name__

        if model_name not in settings.PROCESSING_STEPS:
            raise ValueError(
                'Model `%s` is missing settings.PROCESSING_STEPS.' % model_name)

        for step_package in settings.PROCESSING_STEPS[model_name]:  # type: str
            module = import_module(step_package)

            if 'ProcessingStep' not in module.__dict__:
                raise ProcessingError(
                    'Processing step package does not contain "ProcessingStep" class: %s'
                    % step_package)

            step_instance = module.ProcessingStep()  # type: BaseProcessingStep

            if not isinstance(step_instance, BaseProcessingStep):
                raise ProcessingError(
                    'Processing step needs to inherit from BaseProcessingStep: %s'
                    % step_package)

            # Key by the last module name from the dotted package path
            self.available_processing_steps[step_package.split('.')[-1]] = step_instance

    return self.available_processing_steps
def get_input(self) -> List[str]:
    """Select files from input_selector recursively and from directory with dir_selector.

    Applies the ``input_start`` offset and, when positive, the
    ``input_limit`` cap to the selected content.

    :raises ProcessingError: if no selector is set or it matches nothing
    """
    if self.input_selector is None:
        raise ProcessingError('input_selector is not set')

    items = self.get_input_content_from_selector(self.input_selector)
    items = items[self.input_start:]

    if not items:
        raise ProcessingError('Input selector is empty: %s' % self.input_selector)

    # A non-positive limit means "no limit"
    return items[:self.input_limit] if self.input_limit > 0 else items
def assign_law_ref(self, raw: Ref, ref: Reference) -> Reference:
    """Find corresponding database item to reference for laws.

    :param raw: extracted raw reference (needs ``book`` and ``section``)
    :param ref: reference object to populate
    :raises ProcessingError: if raw data is missing or no law matches
    """
    if raw.book is None or raw.section is None:
        raise ProcessingError('Reference data is not set')

    candidates = Law.objects.filter(book__slug=raw.book, slug=raw.section)

    if len(candidates) < 1:
        raise ProcessingError(
            'Cannot find ref target in with book=%s; section=%s; for ref=%s'
            % (raw.book, raw.section, raw))

    # Multiple candidates should not occur; use the first match
    ref.law = candidates.first()

    return ref
def save_markers(
        self, markers, referenced_by,
        assign_references=True
) -> Tuple[List[ReferenceMarker], List[Reference]]:
    """Convert module objects into Django objects.

    Persists one marker per extracted ``RefMarker`` and one ``Reference``
    per contained ``Ref``; optionally resolves each reference to its
    database target (law or case). Resolution failures are logged, not
    re-raised.

    :return: tuple of (saved marker objects, saved reference objects)
    """
    stored_markers = []
    stored_refs = []
    n_errors = 0
    n_ok = 0

    for marker in markers:  # type: RefMarker
        marker_obj = self.marker_model(referenced_by=referenced_by,
                                       text=marker.text,
                                       start=marker.start,
                                       end=marker.end)
        marker_obj.save()

        for ref in marker.references:  # type: Ref
            ref_obj = Reference(to=marker.text)

            # Assign references to target items
            if assign_references:
                try:
                    if ref.ref_type == RefType.LAW:
                        ref_obj = self.assign_law_ref(ref, ref_obj)
                    elif ref.ref_type == RefType.CASE:
                        ref_obj = self.assign_case_ref(ref, ref_obj)
                    else:
                        raise ProcessingError(
                            'Unsupported reference type: %s' % ref.ref_type)
                    n_ok += 1
                except ProcessingError as e:
                    logger.error(e)
                    n_errors += 1

            # TODO Should we save references all the time or only on successful matching?
            ref_obj.set_to_hash()
            ref_obj.save()

            # Save in m2m helper
            self.reference_from_content_model(reference=ref_obj,
                                              marker=marker_obj).save()

            stored_refs.append(ref_obj)

        stored_markers.append(marker_obj)

    logger.debug('References: saved=%i; errors=%i' % (n_ok, n_errors))

    return stored_markers, stored_refs
def get_wikipedia_extract(self, query):
    """Fetch the introductory plain-text extract of a Wikipedia page.

    :param query: page title to look up
    :return: extract text of the first page in the API response
    :raises ProcessingError: on non-200 response or missing extract
    """
    # FIX: pass the title via `params` so it is URL-encoded; the previous
    # string interpolation broke for titles containing '&', '?' or spaces.
    res = requests.get(
        'https://' + self.language + '.wikipedia.org/w/api.php',
        params={
            'format': 'json',
            'action': 'query',
            'prop': 'extracts',
            'exintro': '',
            'explaintext': '',
            'titles': query,
        })

    if res.status_code == 200:
        res_obj = res.json()
        for p in res_obj['query']['pages']:
            return res_obj['query']['pages'][p]['extract']

    raise ProcessingError('Cannot get extract')
def get_wikipedia_field(self, query, field='pageid'):
    """Search Wikipedia and return a field of the top search result.

    :param query: search term
    :param field: result field to return (e.g. 'pageid', 'title')
    :raises ProcessingError: on non-200 response or empty result set
    """
    # Get Wikipedia ID from search API.
    # FIX: pass the search term via `params` so it is URL-encoded; the
    # previous string interpolation broke for terms with '&', '?' etc.
    res = requests.get(
        'https://' + self.language + '.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'list': 'search',
            'srsearch': query,
            'utf8': '',
            'format': 'json',
        })

    if res.status_code == 200:
        res_obj = res.json()
        if len(res_obj['query']['search']) > 0:
            return res_obj['query']['search'][0][field]

    raise ProcessingError('Cannot get field')
def get_wikipedia_image(self, query, size=250):
    """Fetch the thumbnail image URL for a Wikipedia page.

    :param query: page title to look up
    :param size: thumbnail width in pixels
    :raises ProcessingError: on non-200 response or missing thumbnail
    """
    # FIX: pass the title via `params` so it is URL-encoded; the previous
    # string interpolation broke for titles containing '&', '?' or spaces.
    res = requests.get(
        'https://' + self.language + '.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'titles': query,
            'prop': 'pageimages',
            'format': 'json',
            'pithumbsize': size,
        })

    if res.status_code == 200:
        res_obj = res.json()
        # print(res_obj['query']['pages'])
        for p in res_obj['query']['pages']:
            if 'thumbnail' in res_obj['query']['pages'][p]:
                return res_obj['query']['pages'][p]['thumbnail']['source']

    raise ProcessingError('Cannot get image')
def from_json_file(file_path):
    """Deserialize the first Django object found in a JSON fixture file.

    :param file_path: path to the JSON fixture
    :raises ProcessingError: if nothing can be deserialized
    """
    with open(file_path) as fixture:
        raw = fixture.read()

    deserialized = serializers.deserialize("json", raw)  # , ignorenonexistent=True)

    # print(len(out))
    try:
        for item in deserialized:
            # Return the first deserialized object only
            return item.object
    except DeserializationError:
        pass

    raise ProcessingError('Cannot deserialize: %s' % file_path)
def set_processing_steps(self, step_list):
    """Selects processing steps from available dict.

    :param step_list: step name, list of step names, or 'all'
    :return: the selected steps when 'all' is requested (kept for
        backward compatibility), otherwise None
    :raises ProcessingError: if a requested step is not available
    """
    # Unset old steps and load available steps
    self.processing_steps = []
    self.get_available_processing_steps()

    if not isinstance(step_list, List):
        step_list = [step_list]

    if 'all' in step_list:
        # FIX: previously the values were only returned, never stored,
        # so self.processing_steps stayed empty for 'all'. Store them
        # and still return the selection for existing callers.
        self.processing_steps = list(self.available_processing_steps.values())
        return self.processing_steps

    for step in step_list:
        if step in self.available_processing_steps:
            self.processing_steps.append(
                self.available_processing_steps[step])
        else:
            raise ProcessingError('Requested step is not available: %s' % step)
def assign_case_ref(self, raw: Ref, ref: Reference) -> Reference:
    """Find corresponding database item to reference for cases.

    :param raw: extracted raw reference (court name and file number)
    :param ref: reference object to populate
    :raises ProcessingError: if no case matches
    """
    candidates = Case.objects.filter(court__aliases__contains=raw.court,
                                     file_number=raw.file_number)
    hits = len(candidates)

    if hits == 0:
        # Not found
        raise ProcessingError(
            'Cannot find ref target in with court=%s; file_number=%s; for ref=%s'
            % (raw.court, raw.file_number, raw))

    # One hit: unambiguous. Multiple candidates: take the first.
    # TODO better heuristic?
    ref.case = candidates.first()

    return ref
def process(self, law: Law) -> Law:
    """
    Read law.content, search for references, add ref marker (e.g. [ref=1]xy[/ref]) to text,
    add ref data to law.

    Ref data should contain position information, for CPA computations ...

    :param law: to be processed
    :return: processed law
    :raises ProcessingError: wrapping any extractor error
    """
    try:
        self.extractor.law_book_context = law.book.code
        law.content, markers = self.extractor.extract(law.content)

        # Remove stale markers before saving the new ones
        LawReferenceMarker.objects.filter(referenced_by=law).delete()
        self.save_markers(markers, law)

        return law
    except RefExError as e:
        # FIX: chain the cause so the original traceback is preserved
        raise ProcessingError(e) from e
def process_content(self):
    """Validate, link, process and persist all pre-processed Law objects.

    Chains each law to its predecessor, runs the configured processing
    steps and tracks success/failure counters. Processing errors are
    logged per document and do not abort the batch.
    """
    for idx, law in enumerate(self.pre_processed_content):  # type: Law
        if idx > 0:
            # .save() is already called by input handler
            law.previous = self.pre_processed_content[idx - 1]

        if not isinstance(law, Law):
            raise ProcessingError('Invalid processing content: %s' % law)

        try:
            law.save()  # First save (steps require id)

            self.call_processing_steps(law)

            law.save()  # Save again

            self.doc_counter += 1
            self.processed_content.append(law)
        except ProcessingError as e:
            # logger.error('ERROR: ES - index already created? % s' % e)
            self.doc_failed_counter += 1
            logger.error(e)
def get_type(self, code):
    """Return the type definition registered for ``code``.

    :param code: court type code to look up
    :raises ProcessingError: if the code is unknown
    """
    # FIX: call get_types() once instead of twice (double lookup before)
    types = self.get_types()

    if code in types:
        return types[code]

    raise ProcessingError('Code not defined: %s' % code)
def find_court(self, query) -> Court:
    """Resolve a Court database entry from a raw name/code query.

    Lookup cascade: exact code match, exact name match (single-word names
    only), then state-level and city-level matching based on the extracted
    court type, and finally a case-insensitive alias search.

    Example court names:

    - Oberverwaltungsgericht für das Land Schleswig-Holstein
    - VG Magdeburg
    - {"name": "OVG L\u00fcneburg 5. Senat"}

    :param query: Dict(name, code)
    :return: matched Court instance
    :raises ProcessingError: if the query has no name or no court type is found
    :raises Court.DoesNotExist: if no unambiguous match is found
    """
    if 'code' in query:
        # Find based on code (EuGH, ...)
        try:
            return Court.objects.get(code=query['code'])
        except Court.DoesNotExist:
            pass

    if 'name' not in query:
        raise ProcessingError('Field name not in query')

    name = query['name']

    if ' ' not in name:
        # Find based on name if name does not contain whitespaces
        try:
            return Court.objects.get(name=name)
        except Court.DoesNotExist:
            pass

    # Determine type
    # print('Find court: %s' % query)
    court_type = Court.extract_type_code_from_name(name)
    # print('Type code: %s' % court_type)

    if court_type is None:
        raise ProcessingError('Court type not found')

    location_levels = CourtTypes().get_type(court_type)['levels']
    # print('Location level: %s' % location_levels)

    # Look for states
    if CourtLocationLevel.STATE in location_levels:
        state_id_mapping = {}
        for r in State.objects.values_list('id', 'name'):
            if r[1] != '':
                state_id_mapping[r[1]] = r[0]
                # Add variations, e.g. Hamburg_er, Holstein_isches
                for v in ['es', 'er', 'isches']:
                    state_id_mapping[r[1] + v] = r[0]

        state_id = find_from_mapping(name, state_id_mapping)

        if state_id is not None:
            try:
                logger.debug('Look for state=%i, type=%s' % (state_id, court_type))
                return Court.objects.get(state_id=state_id, court_type=court_type)
            except Court.DoesNotExist:
                pass

    # Look for cities
    if CourtLocationLevel.CITY in location_levels:
        city_id_mapping = {}
        for r in City.objects.values_list('id', 'name'):
            if r[1] != '':
                city_id_mapping[r[1]] = r[0]

        city_id = find_from_mapping(name, city_id_mapping)
        # print(city_id_mapping)

        if city_id is not None:
            try:
                logger.debug('Look for city=%i, type=%s' % (city_id, court_type))
                return Court.objects.get(city_id=city_id, court_type=court_type)
            except Court.DoesNotExist:
                pass

    # Search by alias (use case-insensitive filter for umlauts)
    candidates = Court.objects.filter(aliases__icontains=name)

    if len(candidates) == 1:
        return candidates.first()
    elif len(candidates) > 1:
        # Multiple candidates found: fuzzy string matching?
        logger.warning('Multiple candidates found')
        # return candidates.first()

    # Nothing found
    raise Court.DoesNotExist
class ProcessingStep(CaseProcessingStep):
    """Assign a Court to each Case based on its raw court metadata.

    Extract raw court names with this command:

    print('\n'.join([json.loads(s)['name'] for s in Case.objects.filter(court=1).values_list('court_raw', flat=True)[:10]]))

    """
    description = 'Assign court to cases'

    # default_court = Court.objects.get(pk=Court.DEFAULT_ID)

    def __init__(self):
        super().__init__()

    def remove_chamber(self, name):
        """Strip the chamber/senate suffix from a court name.

        Returns a tuple of (name without chamber, chamber or None).

        Examples:

        LG Kiel Kammer für Handelssachen
        LG Koblenz 14. Zivilkammer
        OLG Koblenz 2. Senat für Bußgeldsachen
        Schleswig-Holsteinisches Oberlandesgericht Kartellsenat
        Vergabekammer Sachsen-Anhalt

        """
        chamber = None
        # NOTE(review): these patterns are not raw strings; '\s' relies on a
        # deprecated escape and should become r'...' in a behavior change.
        patterns = [
            '\s([0-9]+)(.*)$',
            '\s(Senat|Kammer) für(.*)$',
            '\s([a-zA-Z]+)(senat|kammer)(.*)$',
        ]
        for pattern in patterns:
            pattern = re.compile(pattern)
            match = re.search(pattern, name)
            if match:
                # Cut the matched chamber text out of the name; keep the
                # matched span (stripped) as the chamber label
                name = name[:match.start()] + name[match.end():]
                chamber = match.group(0).strip()
        return name.strip(), chamber

    def find_court(self, query) -> Court:
        """Resolve a Court database entry from a raw name/code query.

        Lookup cascade: exact code match, exact name match (single-word
        names only), state/city-level matching by court type, then a
        case-insensitive alias search.

        Example court names:

        - Oberverwaltungsgericht für das Land Schleswig-Holstein
        - VG Magdeburg
        - {"name": "OVG L\u00fcneburg 5. Senat"}

        :param query: Dict(name, code)
        :return: matched Court instance
        :raises ProcessingError: if the query has no name or no court type
        :raises Court.DoesNotExist: if no unambiguous match is found
        """
        if 'code' in query:
            # Find based on code (EuGH, ...)
            try:
                return Court.objects.get(code=query['code'])
            except Court.DoesNotExist:
                pass

        if 'name' not in query:
            raise ProcessingError('Field name not in query')

        name = query['name']

        if ' ' not in name:
            # Find based on name if name does not contain whitespaces
            try:
                return Court.objects.get(name=name)
            except Court.DoesNotExist:
                pass

        # Determine type
        # print('Find court: %s' % query)
        court_type = Court.extract_type_code_from_name(name)
        # print('Type code: %s' % court_type)

        if court_type is None:
            raise ProcessingError('Court type not found')

        location_levels = CourtTypes().get_type(court_type)['levels']
        # print('Location level: %s' % location_levels)

        # Look for states
        if CourtLocationLevel.STATE in location_levels:
            state_id_mapping = {}
            for r in State.objects.values_list('id', 'name'):
                if r[1] != '':
                    state_id_mapping[r[1]] = r[0]
                    # Add variations, e.g. Hamburg_er, Holstein_isches
                    for v in ['es', 'er', 'isches']:
                        state_id_mapping[r[1] + v] = r[0]

            state_id = find_from_mapping(name, state_id_mapping)

            if state_id is not None:
                try:
                    logger.debug('Look for state=%i, type=%s' % (state_id, court_type))
                    return Court.objects.get(state_id=state_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Look for cities
        if CourtLocationLevel.CITY in location_levels:
            city_id_mapping = {}
            for r in City.objects.values_list('id', 'name'):
                if r[1] != '':
                    city_id_mapping[r[1]] = r[0]

            city_id = find_from_mapping(name, city_id_mapping)
            # print(city_id_mapping)

            if city_id is not None:
                try:
                    logger.debug('Look for city=%i, type=%s' % (city_id, court_type))
                    return Court.objects.get(city_id=city_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Search by alias (use case-insensitive filter for umlauts)
        candidates = Court.objects.filter(aliases__icontains=name)

        if len(candidates) == 1:
            return candidates.first()
        elif len(candidates) > 1:
            # Multiple candidates found: fuzzy string matching?
            logger.warning('Multiple candidates found')
            # return candidates.first()

        # Nothing found
        raise Court.DoesNotExist

    # if 'name' in query and 'code' in query:
    #     candidates = Court.objects.filter(Q(name=query['name']) | Q(code=query['code']))
    #     instance = candidates[0]
    #
    #     if len(candidates) == 0:
    #         raise Court.DoesNotExist
    # elif 'name' in query:
    #     instance = Court.objects.get(name=query['name'])
    #
    # else:
    #     raise ProcessingError('Court fields missing: %s' % query)

    def process(self, case: Case) -> Case:
        """Assign court and chamber to the case from its raw metadata.

        Falls back to Court.DEFAULT_ID on any ProcessingError.
        """
        court = json.loads(case.court_raw)
        try:
            if 'name' not in court:
                raise ProcessingError('court_raw has no `name` field')

            if court['name'] == 'EU':
                court['code'] = 'EuGH'

            # Extract court chamber
            court['name'], case.court_chamber = self.remove_chamber(court['name'])

            # Handle court instance
            # TODO Oberverwaltungsgericht für das Land Schleswig-Holsteins
            case.court = self.find_court(court)
            case.set_slug()

        except ProcessingError as e:
            case.court_id = Court.DEFAULT_ID
            logger.error('Count not assign court: %s - %s' % (e, court))
        # NOTE(review): despite the `-> Case` annotation there is no
        # `return case` here, so callers receive None — confirm whether a
        # return statement was lost.
def handle_law_book(self, node) -> LawBook:
    """Build and save a LawBook from a norm node's metadata.

    Reads the book code (amtabk preferred, jurabk fallback), collects the
    changelog entries, derives the revision date from the 'Stand' entry
    and persists the resulting LawBook.

    :param node: lxml element of the law norm carrying book metadata
    :return: saved LawBook instance
    :raises ProcessingError: if no book code is found or saving fails
    """
    # alternative: amtabk, jurabk
    code_a = node.xpath('metadaten/amtabk/text()')
    if code_a:
        code = code_a[0]
    else:
        code_b = node.xpath('metadaten/jurabk/text()')
        if code_b:
            code = code_b[0]
        else:
            raise ProcessingError('Could not find book_code')

    revision_date_str = None
    revision_date = None
    changelog = []
    changelog_comments = node.xpath(
        'metadaten/standangabe/standkommentar/text()')
    changelog_types = node.xpath('metadaten/standangabe/standtyp/text()')

    # NOTE(review): assumes standtyp and standkommentar lists have equal
    # length — an IndexError is possible if the XML differs; confirm.
    for key, value in enumerate(changelog_comments):
        changelog.append({'type': changelog_types[key], 'text': value})

        if changelog_types[key] == 'Stand':
            revision_date_str = value

    if revision_date_str is not None:
        # print(revision_date_str)
        # [0-9]{2})\.([0-9]{4})
        match = re.search(
            r'(?P<day>[0-9]{1,2})\.(?P<month>[0-9]{1,2})\.(?P<year>[0-9]{4})',
            revision_date_str)
        if match:
            # Revision data as string (datetime.date is not JSON serializable)
            revision_date = datetime.date(int(match.group('year')),
                                          int(match.group('month')),
                                          int(match.group('day')))
            # revision_date = match.group('year') + '-' + match.group('month') + '-' + match.group('day')
            revision_date = revision_date.strftime('%Y-%m-%d')

    book_title_match = node.xpath('metadaten/langue/text()')
    if book_title_match:
        book_title = book_title_match[0].replace(
            '\n', ' ')  # replace line breaks
    else:
        book_title = None

    book = LawBook(
        title=book_title,
        # gliederung=[],
        code=code,
        slug=slugify(code),
        footnotes=json.dumps(
            self.get_node_content(
                node, 'textdaten/fussnoten/Content/*')),  # On book level?
        changelog=json.dumps(changelog)
        # revision_date=revision_date
        # jurabk or amtabk missing?
    )

    if revision_date is not None:
        # TODO raise error if no revision date is provided?
        # raise ValueError('no revision date: %s; %s' % (changelog_comments, changelog_types))
        book.revision_date = revision_date

    try:
        book.save()
    except DatabaseError as e:
        # TODO set latest depending on revision - check first if other books exist?
        raise ProcessingError('Cannot save book: %s' % e)

    return book
def empty_content(self):
    """Deleting court content is intentionally not supported."""
    message = 'Do not delete courts'
    raise ProcessingError(message)
</verweis.norm> <v.abk ersatz="RDG"></v.abk> """ logger.debug('Extract refs for %s' % case) try: # Clean HTML (should be done by scrapers) case.content = html.unescape(case.content) case.content = re.sub(r'</?verweis\.norm[^>]*>', '', case.content) case.content = re.sub(r'</?v\.abk[^>]*>', '', case.content) case.content = CaseReferenceMarker.remove_markers( case.content) # TODO Removal only for legacy reasons # Do not change original content with markers _content, markers = self.extractor.extract(case.content) # Delete old markers CaseReferenceMarker.objects.filter(referenced_by=case).delete() marker_qs, ref_qs = self.save_markers(markers, case, self.assign_refs) return case except RefExError as e: raise ProcessingError(e)
def handle_input(self, input_content: str) -> None:
    """Parses law XML and creates book and law instances (append to processed_content)

    :param input_content: File path to law XML file
    :return:
    """
    logger.debug('Reading from %s' % input_content)

    # File exist?
    if not os.path.isfile(input_content):
        raise ProcessingError('Is not file: %s' % input_content)

    # Count lines
    # FIX: use a context manager — the previous bare open() leaked the
    # file handle.
    with open(input_content) as line_counter:
        num_lines = sum(1 for _ in line_counter)

    # Skip if lines count is invalid
    if (self.min_lines is not None and self.min_lines >= 0 and num_lines < self.min_lines) \
            or (self.max_lines is not None and 0 <= self.max_lines < num_lines):
        logger.info('Skip - File has invalid line count (%i): %s' % (num_lines, input_content))
        return

    # Parse XML tree
    tree = etree.parse(input_content)

    sort = 0
    docs = []

    # Prepare docs
    for idx, n in enumerate(tree.xpath('norm')):
        # Extract law content (with html tags)
        content = self.get_node_content(n, 'textdaten/text/Content/*')

        if idx == 0:
            # Save book with the first element
            book = self.handle_law_book(n)

        # Append section to book object if section title is found
        section_title = (
            n.xpath('metadaten/gliederungseinheit/gliederungstitel/text()')
            or [None])[0]
        if section_title is not None:
            book.add_section(from_order=sort, title=section_title.strip())

        # Create law object
        doc = Law(
            doknr=n.get('doknr'),
            section=(n.xpath('metadaten/enbez/text()') or [None])[0],
            amtabk=(n.xpath('metadaten/amtabk/text()') or [None])[0],
            kurzue=(n.xpath('metadaten/kurzue/text()') or [None])[0],
            title=(n.xpath('metadaten/titel/text()') or [''])[0].strip(),
            order=sort,  # use in frontend for sorting
            content=content,
            footnotes=json.dumps(
                self.get_node_content(n, 'textdaten/fussnoten/Content/*')),
            book=book,
        )

        # Perform processing steps
        # for processor in self.processing_steps:  # type: LawProcessingStep
        #     doc = processor.process(doc)

        # TODO is Verordnung? is Gesetz? strip <pre>?

        # slug (unique)
        slug = slugify(doc.section or '')

        if slug[:3] == 'ss-':
            # Is section-symbol
            slug = slug[3:]

        doc.slug = slug

        if slug != '':
            logger.debug('Pre-processed: %s' % doc)
            docs.append(doc)
            sort += 1
        else:
            logger.warning('Ignore invalid document (no slug): %s' % doc)

    # Append to queue
    self.pre_processed_content.extend(docs)