Пример #1
0
    def get_available_processing_steps(self) -> dict:
        """Load and cache the processing steps configured for ``self.model``.

        The step packages are read from ``settings.PROCESSING_STEPS`` under
        the class name of ``self.model``. Every package is imported, its
        ``ProcessingStep`` class is instantiated and the instance is stored
        under the last component of the package path.

        The result is written to ``self.available_processing_steps`` only
        after all packages were loaded successfully, so a failed call does not
        leave an empty or partial cache behind for later calls.

        :return: dict mapping step name to step instance
        :raises ValueError: if the model has no entry in
            ``settings.PROCESSING_STEPS``
        :raises ProcessingError: if a package has no ``ProcessingStep`` class
            or the class does not inherit from ``BaseProcessingStep``
        """
        if self.available_processing_steps is not None:
            return self.available_processing_steps

        model_name = self.model.__name__

        # Get packages for model type
        if model_name not in settings.PROCESSING_STEPS:
            raise ValueError(
                'Model `%s` is missing settings.PROCESSING_STEPS.' %
                model_name)

        # Collect into a local dict first: the cache attribute must stay
        # unset if any of the packages below turns out to be invalid.
        steps = {}

        for step_package in settings.PROCESSING_STEPS[model_name]:  # type: str
            module = import_module(step_package)

            if 'ProcessingStep' not in module.__dict__:
                raise ProcessingError(
                    'Processing step package does not contain "ProcessingStep" class: %s'
                    % step_package)

            step = module.ProcessingStep()  # type: BaseProcessingStep

            if not isinstance(step, BaseProcessingStep):
                raise ProcessingError(
                    'Processing step needs to inherit from BaseProcessingStep: %s'
                    % step_package)

            # Last module name from package path is the step name
            step_name = step_package.split('.')[-1]
            steps[step_name] = step

        self.available_processing_steps = steps

        return self.available_processing_steps
Пример #2
0
    def get_input(self) -> List[str]:
        """Return the input items chosen by ``self.input_selector``.

        The selector content is fetched with
        ``get_input_content_from_selector``, the first ``self.input_start``
        items are skipped and, if ``self.input_limit`` is positive, the result
        is cut down to that many items.

        :return: selected input items
        :raises ProcessingError: if no selector is set or nothing is left
            after skipping ``input_start`` items
        """
        if self.input_selector is None:
            raise ProcessingError('input_selector is not set')

        selected = self.get_input_content_from_selector(self.input_selector)
        selected = selected[self.input_start:]

        if len(selected) < 1:
            raise ProcessingError('Input selector is empty: %s' %
                                  self.input_selector)

        # A limit of zero (or below) means "no limit"
        if self.input_limit > 0:
            return selected[:self.input_limit]

        return selected
Пример #3
0
    def assign_law_ref(self, raw: Ref, ref: Reference) -> Reference:
        """
        Find corresponding database item to reference for laws

        The law is looked up by book slug (``raw.book``) and law slug
        (``raw.section``) and stored in ``ref.law``.

        :param raw: extracted reference; ``book`` and ``section`` must be set
        :param ref: reference object that receives the law
        :return: the same ``ref`` object with ``law`` assigned
        :raises ProcessingError: if ``raw`` is incomplete or no law matches
        """
        if raw.book is None or raw.section is None:
            raise ProcessingError('Reference data is not set')

        # Multiple candidates should not occur; if they do, the first one is
        # used. ``first()`` returns None for an empty result, so existence
        # check and fetch happen in one step without loading all candidates.
        law = Law.objects.filter(book__slug=raw.book,
                                 slug=raw.section).first()

        if law is None:
            raise ProcessingError(
                'Cannot find ref target in with book=%s; section=%s; for ref=%s'
                % (raw.book, raw.section, raw))

        ref.law = law

        return ref
Пример #4
0
    def save_markers(
        self,
        markers,
        referenced_by,
        assign_references=True
    ) -> Tuple[List[ReferenceMarker], List[Reference]]:
        """Persist extracted markers and their references as Django objects.

        For every marker a ``self.marker_model`` row is saved. For every
        reference of a marker a ``Reference`` is saved together with a
        ``self.reference_from_content_model`` row that links reference and
        marker. References are saved even if no target could be assigned.

        :param markers: extracted markers (with ``text``, ``start``, ``end``
            and ``references``)
        :param referenced_by: object the markers were extracted from
        :param assign_references: if true, try to resolve each reference to a
            law or case; failures are logged and counted, not raised
        :return: tuple of (saved markers, saved references)
        """
        marker_objs = []
        ref_objs = []

        matched = 0
        failed = 0

        for marker in markers:  # type: RefMarker
            marker_obj = self.marker_model(referenced_by=referenced_by,
                                           text=marker.text,
                                           start=marker.start,
                                           end=marker.end)
            marker_obj.save()

            for raw_ref in marker.references:  # type: Ref
                ref_obj = Reference(to=marker.text)

                # Assign references to target items
                if assign_references:
                    try:
                        if raw_ref.ref_type == RefType.LAW:
                            assign = self.assign_law_ref
                        elif raw_ref.ref_type == RefType.CASE:
                            assign = self.assign_case_ref
                        else:
                            raise ProcessingError(
                                'Unsupported reference type: %s' %
                                raw_ref.ref_type)

                        ref_obj = assign(raw_ref, ref_obj)
                        matched += 1
                    except ProcessingError as e:
                        logger.error(e)
                        failed += 1

                # TODO Should we save references all the time or only on successful matching?
                ref_obj.set_to_hash()
                ref_obj.save()

                # Save in m2m helper
                link = self.reference_from_content_model(reference=ref_obj,
                                                         marker=marker_obj)
                link.save()

                ref_objs.append(ref_obj)

            marker_objs.append(marker_obj)

        logger.debug('References: saved=%i; errors=%i' % (matched, failed))

        return marker_objs, ref_objs
Пример #5
0
    def get_wikipedia_extract(self, query):
        """Fetch the plain-text intro extract of a Wikipedia page.

        The request goes to the Wikipedia API of ``self.language``.

        :param query: page title(s) passed as ``titles`` to the API
        :return: extract of the first returned page that has one
        :raises ProcessingError: if the response status is not 200 or the
            response contains no page with an extract
        """
        res = requests.get(
            'https://' + self.language + '.wikipedia.org/w/api.php',
            # Passed as params so that requests URL-encodes the title
            # (characters like `&` or `#` would otherwise break the query)
            params={
                'format': 'json',
                'action': 'query',
                'prop': 'extracts',
                'exintro': '',
                'explaintext': '',
                'titles': query,
            })

        if res.status_code == 200:
            pages = res.json().get('query', {}).get('pages', {})
            for page in pages.values():
                # Missing pages carry no extract; report them as
                # ProcessingError below instead of failing with KeyError
                if 'extract' in page:
                    return page['extract']
        raise ProcessingError('Cannot get extract')
Пример #6
0
    def get_wikipedia_field(self, query, field='pageid'):
        """Return one field of the top Wikipedia search result for ``query``.

        The request goes to the search API of the Wikipedia of
        ``self.language``.

        :param query: search term
        :param field: key to read from the first search result
            (default: ``pageid``)
        :return: value of ``field`` in the first search result
        :raises ProcessingError: if the response status is not 200, there is
            no search result or the result lacks ``field``
        """
        # Get Wikipedia ID from search API
        res = requests.get(
            'https://' + self.language + '.wikipedia.org/w/api.php',
            # Passed as params so that requests URL-encodes the search term
            params={
                'action': 'query',
                'list': 'search',
                'srsearch': query,
                'utf8': '',
                'format': 'json',
            })

        if res.status_code == 200:
            results = res.json().get('query', {}).get('search', [])
            # Unknown fields end in ProcessingError below, not in KeyError
            if len(results) > 0 and field in results[0]:
                return results[0][field]
        raise ProcessingError('Cannot get field')
Пример #7
0
    def get_wikipedia_image(self, query, size=250):
        """Return the thumbnail URL of a Wikipedia page.

        The request goes to the Wikipedia API of ``self.language``.

        :param query: page title(s) passed as ``titles`` to the API
        :param size: requested thumbnail size (``pithumbsize``), formatted as
            integer
        :return: ``source`` of the first returned page thumbnail
        :raises ProcessingError: if the response status is not 200 or no
            returned page has a thumbnail
        """
        res = requests.get(
            'https://' + self.language + '.wikipedia.org/w/api.php',
            # Passed as params so that requests URL-encodes the title
            params={
                'action': 'query',
                'titles': query,
                'prop': 'pageimages',
                'format': 'json',
                'pithumbsize': '%i' % size,
            })

        if res.status_code == 200:
            pages = res.json().get('query', {}).get('pages', {})
            for page in pages.values():
                if 'thumbnail' in page:
                    return page['thumbnail']['source']
        raise ProcessingError('Cannot get image')
Пример #8
0
    def from_json_file(file_path):
        """Deserialize the first object of a JSON file with Django's serializers.

        :param file_path: path to the JSON file
        :return: ``object`` attribute of the first deserialized item
        :raises ProcessingError: if the file contains no object or cannot be
            deserialized (the ``DeserializationError`` is kept as cause)
        """
        # Read everything up front so the file is closed before parsing
        with open(file_path) as f:
            raw_json = f.read()

        try:
            # Only the first object is of interest
            for item in serializers.deserialize("json", raw_json):
                return item.object
        except DeserializationError as e:
            # Keep the underlying error instead of silently discarding it
            raise ProcessingError('Cannot deserialize: %s' % file_path) from e

        raise ProcessingError('Cannot deserialize: %s' % file_path)
Пример #9
0
    def set_processing_steps(self, step_list):
        """Selects processing steps from available dict

        Previously selected steps are discarded. The selection is stored in
        ``self.processing_steps`` and also returned.

        :param step_list: step name or list of step names; if ``'all'`` is
            contained, every available step is selected
        :return: list of selected step instances
        :raises ProcessingError: if a requested step is not available
        """

        # Unset old steps and load available steps
        self.processing_steps = []
        self.get_available_processing_steps()

        if not isinstance(step_list, list):
            step_list = [step_list]

        if 'all' in step_list:
            # Store the selection as well; returning the values alone would
            # leave self.processing_steps empty.
            self.processing_steps = list(
                self.available_processing_steps.values())
            return self.processing_steps

        for step in step_list:
            if step not in self.available_processing_steps:
                raise ProcessingError('Requested step is not available: %s' %
                                      step)

            self.processing_steps.append(self.available_processing_steps[step])

        return self.processing_steps
Пример #10
0
    def assign_case_ref(self, raw: Ref, ref: Reference) -> Reference:
        """
        Find corresponding database item to reference for cases

        The case is looked up by file number (``raw.file_number``) and by a
        court whose aliases contain ``raw.court``; it is stored in
        ``ref.case``.

        :param raw: extracted reference with ``court`` and ``file_number``
        :param ref: reference object that receives the case
        :return: the same ``ref`` object with ``case`` assigned
        :raises ProcessingError: if no case matches
        """

        # One or many candidates are treated alike: the first one wins.
        # ``first()`` returns None for an empty result, so no separate count
        # of the candidates is needed.
        # TODO better heuristic for multiple candidates?
        case = Case.objects.filter(court__aliases__contains=raw.court,
                                   file_number=raw.file_number).first()

        if case is None:
            # Not found
            raise ProcessingError(
                'Cannot find ref target in with court=%s; file_number=%s; for ref=%s'
                % (raw.court, raw.file_number, raw))

        ref.case = case

        return ref
Пример #11
0
    def process(self, law: Law) -> Law:
        """
        Extract references from the law content and store them as markers.

        The extractor runs on law.content with the law's book code as context.
        law.content is replaced by the extractor output (content with ref
        markers, e.g. [ref=1]xy[/ref]), the existing markers of the law are
        deleted and the newly extracted ones are saved.

        Ref data should contain position information, for CPA computations ...

        :param law: to be processed
        :return: processed law
        :raises ProcessingError: if the extractor raises RefExError
        """

        try:
            self.extractor.law_book_context = law.book.code

            marked_content, found_markers = self.extractor.extract(law.content)
            law.content = marked_content

            # Drop markers of earlier runs before the new ones are stored
            old_markers = LawReferenceMarker.objects.filter(referenced_by=law)
            old_markers.delete()

            self.save_markers(found_markers, law)

            return law

        except RefExError as e:
            raise ProcessingError(e)
Пример #12
0
    def process_content(self):
        """Save every pre-processed law and run the processing steps on it.

        Each law is linked to its predecessor in ``self.pre_processed_content``
        via ``previous``, saved, handed to ``call_processing_steps`` and saved
        again. Successful laws are appended to ``self.processed_content`` and
        counted in ``self.doc_counter``; a ``ProcessingError`` during saving or
        processing is logged and counted in ``self.doc_failed_counter``.

        :raises ProcessingError: if an item is not a ``Law`` instance (this
            aborts the whole run)
        """
        for i, content in enumerate(self.pre_processed_content):  # type: Law
            # Validate first so that foreign objects are never modified
            if not isinstance(content, Law):
                raise ProcessingError('Invalid processing content: %s' %
                                      content)

            if i > 0:
                # .save() is already called by input handler
                content.previous = self.pre_processed_content[i - 1]

            try:
                content.save()  # First save (steps require id)

                self.call_processing_steps(content)

                content.save()  # Save again

                self.doc_counter += 1
                self.processed_content.append(content)

            except ProcessingError as e:
                self.doc_failed_counter += 1
                logger.error(e)
Пример #13
0
 def get_type(self, code):
     """Return the entry for ``code`` from ``self.get_types()``.

     :param code: key to look up
     :return: the value stored under ``code``
     :raises ProcessingError: if ``code`` is not defined
     """
     # Fetch the types once instead of once for the check and once for
     # the lookup
     types = self.get_types()

     if code not in types:
         raise ProcessingError('Code not defined: %s' % code)

     return types[code]
Пример #14
0
    def find_court(self, query) -> Court:
        """
        Resolve a court from a query dict, trying several lookups in turn:

        1. by ``code`` (if given),
        2. by exact ``name`` (only if the name contains no space),
        3. by state found in the name plus court type,
        4. by city found in the name plus court type,
        5. by alias (case-insensitive containment, single hit only).

        Example court names:
        - Oberverwaltungsgericht für das Land Schleswig-Holstein
        - VG Magdeburg
        - {"name": "OVG L\u00fcneburg 5. Senat"}

        :param query: Dict(name, code)
        :return: the matching court
        :raises ProcessingError: if ``name`` is missing or no court type can
            be extracted from the name
        :raises Court.DoesNotExist: if none of the lookups yields a court
        """

        # Find based on code (EuGH, ...)
        if 'code' in query:
            try:
                return Court.objects.get(code=query['code'])
            except Court.DoesNotExist:
                pass

        if 'name' not in query:
            raise ProcessingError('Field name not in query')

        name = query['name']

        # Find based on name if name does not contain whitespaces
        if ' ' not in name:
            try:
                return Court.objects.get(name=name)
            except Court.DoesNotExist:
                pass

        # Determine type
        court_type = Court.extract_type_code_from_name(name)

        if court_type is None:
            raise ProcessingError('Court type not found')

        location_levels = CourtTypes().get_type(court_type)['levels']

        # Look for states
        if CourtLocationLevel.STATE in location_levels:
            state_id_mapping = {}
            for pk, state_name in State.objects.values_list('id', 'name'):
                if state_name == '':
                    continue

                state_id_mapping[state_name] = pk

                # Add variations, e.g. Hamburg_er, Holstein_isches
                for suffix in ['es', 'er', 'isches']:
                    state_id_mapping[state_name + suffix] = pk

            state_id = find_from_mapping(name, state_id_mapping)

            if state_id is not None:
                try:
                    logger.debug('Look for state=%i, type=%s' % (state_id, court_type))
                    return Court.objects.get(state_id=state_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Look for cities
        if CourtLocationLevel.CITY in location_levels:
            city_id_mapping = {}
            for pk, city_name in City.objects.values_list('id', 'name'):
                if city_name != '':
                    city_id_mapping[city_name] = pk

            city_id = find_from_mapping(name, city_id_mapping)

            if city_id is not None:
                try:
                    logger.debug('Look for city=%i, type=%s' % (city_id, court_type))
                    return Court.objects.get(city_id=city_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Search by alias (use case-insensitive filter for umlauts)
        candidates = Court.objects.filter(aliases__icontains=name)
        candidate_count = len(candidates)

        if candidate_count == 1:
            return candidates.first()

        if candidate_count > 1:
            # Multiple candidates found: fuzzy string matching?
            logger.warning('Multiple candidates found')

        # Nothing found
        raise Court.DoesNotExist
Пример #15
0
class ProcessingStep(CaseProcessingStep):
    r"""
    Processing step that assigns a court to a case based on ``case.court_raw``.

    Extract raw court names with this command:

    print('\n'.join([json.loads(s)['name'] for s in Case.objects.filter(court=1).values_list('court_raw', flat=True)[:10]]))

    """

    description = 'Assign court to cases'

    # Chamber patterns, applied in this order by remove_chamber().
    # Raw strings keep `\s` a regex escape (it is an invalid string escape).
    _chamber_patterns = (
        re.compile(r'\s([0-9]+)(.*)$'),
        re.compile(r'\s(Senat|Kammer) für(.*)$'),
        re.compile(r'\s([a-zA-Z]+)(senat|kammer)(.*)$'),
    )

    def __init__(self):
        super().__init__()

    def remove_chamber(self, name):
        """
        Split the chamber designation off a raw court name.

        Every pattern that matches is cut out of the name; the chamber is the
        text of the last pattern that matched.

        Examples:

        LG Kiel Kammer für Handelssachen
        LG Koblenz 14. Zivilkammer
        OLG Koblenz 2. Senat für Bußgeldsachen
        Schleswig-Holsteinisches Oberlandesgericht Kartellsenat
        Vergabekammer Sachsen-Anhalt

        :param name: raw court name
        :return: tuple of (stripped name without chamber, chamber or None)
        """

        chamber = None

        for pattern in self._chamber_patterns:
            match = pattern.search(name)
            if match:
                name = name[:match.start()] + name[match.end():]
                chamber = match.group(0).strip()

        return name.strip(), chamber

    def find_court(self, query) -> Court:
        """
        Resolve a court from a query dict, trying several lookups in turn:
        code, exact name (only if the name contains no space), state plus
        court type, city plus court type, alias (single hit only).

        Example court names:
        - Oberverwaltungsgericht für das Land Schleswig-Holstein
        - VG Magdeburg
        - {"name": "OVG L\u00fcneburg 5. Senat"}

        :param query: Dict(name, code)
        :return: the matching court
        :raises ProcessingError: if ``name`` is missing or no court type can
            be extracted from the name
        :raises Court.DoesNotExist: if none of the lookups yields a court
        """

        if 'code' in query:
            # Find based on code (EuGH, ...)
            try:
                return Court.objects.get(code=query['code'])
            except Court.DoesNotExist:
                pass

        if 'name' not in query:
            raise ProcessingError('Field name not in query')

        name = query['name']

        if ' ' not in name:
            # Find based on name if name does not contain whitespaces
            try:
                return Court.objects.get(name=name)
            except Court.DoesNotExist:
                pass

        # Determine type
        court_type = Court.extract_type_code_from_name(name)

        if court_type is None:
            raise ProcessingError('Court type not found')

        location_levels = CourtTypes().get_type(court_type)['levels']

        # Look for states
        if CourtLocationLevel.STATE in location_levels:
            state_id_mapping = {}
            for r in State.objects.values_list('id', 'name'):
                if r[1] != '':
                    state_id_mapping[r[1]] = r[0]

                    # Add variations, e.g. Hamburg_er, Holstein_isches
                    for v in ['es', 'er', 'isches']:
                        state_id_mapping[r[1] + v] = r[0]

            state_id = find_from_mapping(name, state_id_mapping)

            if state_id is not None:
                try:
                    logger.debug('Look for state=%i, type=%s' % (state_id, court_type))
                    return Court.objects.get(state_id=state_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Look for cities
        if CourtLocationLevel.CITY in location_levels:
            city_id_mapping = {}
            for r in City.objects.values_list('id', 'name'):
                if r[1] != '':
                    city_id_mapping[r[1]] = r[0]

            city_id = find_from_mapping(name, city_id_mapping)

            if city_id is not None:
                try:
                    logger.debug('Look for city=%i, type=%s' % (city_id, court_type))
                    return Court.objects.get(city_id=city_id, court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Search by alias (use case-insensitive filter for umlauts)
        candidates = Court.objects.filter(aliases__icontains=name)
        if len(candidates) == 1:
            return candidates.first()
        elif len(candidates) > 1:
            # Multiple candidates found: fuzzy string matching?
            logger.warning('Multiple candidates found')

        # Nothing found
        raise Court.DoesNotExist

    def process(self, case: Case) -> Case:
        """
        Assign court and court chamber to the case from its raw court data.

        ``case.court_raw`` is parsed as JSON. The chamber is split off the
        court name and stored in ``case.court_chamber``; the court is resolved
        with ``find_court`` and the slug is updated. On ``ProcessingError``
        the default court is assigned and the error is logged.

        :param case: case to be processed
        :return: the processed case
        """

        court = json.loads(case.court_raw)

        try:
            if 'name' not in court:
                raise ProcessingError('court_raw has no `name` field')

            if court['name'] == 'EU':
                court['code'] = 'EuGH'

            # Extract court chamber
            court['name'], case.court_chamber = self.remove_chamber(court['name'])

            # Handle court instance
            # TODO Oberverwaltungsgericht für das Land Schleswig-Holsteins
            # NOTE(review): find_court raises Court.DoesNotExist when nothing
            # matches; that is not handled here and propagates to the caller.
            # Confirm whether the default court should be used in that case.
            case.court = self.find_court(court)
            case.set_slug()

        except ProcessingError as e:
            case.court_id = Court.DEFAULT_ID
            logger.error('Could not assign court: %s - %s' % (e, court))

        # Return the case as announced by the signature
        return case
Пример #16
0
    def handle_law_book(self, node) -> LawBook:
        """Create and save a ``LawBook`` from the metadata of a law XML node.

        The book code is taken from ``metadaten/amtabk`` or, if missing, from
        ``metadaten/jurabk``. Change log entries are built by pairing
        ``standkommentar`` with ``standtyp`` values; the entry of type
        ``Stand`` provides the revision date (``d.m.yyyy``), which is set on
        the book only if it can be parsed.

        :param node: XML node that supports ``xpath``
        :return: the saved book
        :raises ProcessingError: if no book code is found or the book cannot
            be saved
        """
        # alternative: amtabk, jurabk
        code_matches = (node.xpath('metadaten/amtabk/text()')
                        or node.xpath('metadaten/jurabk/text()'))

        if not code_matches:
            raise ProcessingError('Could not find book_code')

        code = code_matches[0]

        revision_date_str = None
        revision_date = None
        changelog = []
        changelog_comments = node.xpath(
            'metadaten/standangabe/standkommentar/text()')
        changelog_types = node.xpath('metadaten/standangabe/standtyp/text()')

        # zip() stops at the shorter list, so a comment without a type does
        # not abort the import with an IndexError
        for change_text, change_type in zip(changelog_comments,
                                            changelog_types):
            changelog.append({'type': change_type, 'text': change_text})

            if change_type == 'Stand':
                revision_date_str = change_text

        if revision_date_str is not None:
            match = re.search(
                r'(?P<day>[0-9]{1,2})\.(?P<month>[0-9]{1,2})\.(?P<year>[0-9]{4})',
                revision_date_str)
            if match:
                try:
                    # Revision data as string (datetime.date is not JSON serializable)
                    revision_date = datetime.date(
                        int(match.group('year')),
                        int(match.group('month')),
                        int(match.group('day'))).strftime('%Y-%m-%d')
                except ValueError:
                    # Digits that form no valid calendar date are handled
                    # like a string without date: no revision date
                    revision_date = None

        book_title_match = node.xpath('metadaten/langue/text()')

        if book_title_match:
            book_title = book_title_match[0].replace(
                '\n', ' ')  # replace line breaks
        else:
            book_title = None

        book = LawBook(
            title=book_title,
            code=code,
            slug=slugify(code),
            footnotes=json.dumps(
                self.get_node_content(
                    node, 'textdaten/fussnoten/Content/*')),  # On book level?
            changelog=json.dumps(changelog)
        )

        if revision_date is not None:
            # TODO raise error if no revision date is provided?
            book.revision_date = revision_date

        try:
            book.save()
        except DatabaseError as e:
            # TODO set latest depending on revision - check first if other books exist?
            raise ProcessingError('Cannot save book: %s' % e) from e

        return book
Пример #17
0
 def empty_content(self):
     """Refuse to empty the content: always raises ProcessingError ('Do not delete courts')."""
     raise ProcessingError('Do not delete courts')
Пример #18
0
        </verweis.norm>
        <v.abk ersatz="RDG"></v.abk>
        
        """

        logger.debug('Extract refs for %s' % case)

        try:

            # Clean HTML (should be done by scrapers)
            case.content = html.unescape(case.content)
            case.content = re.sub(r'</?verweis\.norm[^>]*>', '', case.content)
            case.content = re.sub(r'</?v\.abk[^>]*>', '', case.content)

            case.content = CaseReferenceMarker.remove_markers(
                case.content)  # TODO Removal only for legacy reasons

            # Do not change original content with markers
            _content, markers = self.extractor.extract(case.content)

            # Delete old markers
            CaseReferenceMarker.objects.filter(referenced_by=case).delete()

            marker_qs, ref_qs = self.save_markers(markers, case,
                                                  self.assign_refs)

            return case

        except RefExError as e:
            raise ProcessingError(e)
Пример #19
0
    def handle_input(self, input_content: str) -> None:
        """Parses law XML and creates book and law instances (append to pre_processed_content)

        Files whose line count is below ``self.min_lines`` or above
        ``self.max_lines`` are skipped (limits that are None or negative are
        ignored). The book is created from the first ``norm`` element; every
        ``norm`` element becomes a ``Law``. Laws that end up with an empty
        slug are ignored.

        :param input_content: File path to law XML file
        :return:
        :raises ProcessingError: if ``input_content`` is not a file
        """

        logger.debug('Reading from %s' % input_content)

        # File exist?
        if not os.path.isfile(input_content):
            raise ProcessingError('Is not file: %s' % input_content)

        # Count lines (context manager closes the file handle again)
        with open(input_content) as f:
            num_lines = sum(1 for _ in f)

        # Skip if lines count is invalid
        if (self.min_lines is not None and self.min_lines >= 0 and num_lines < self.min_lines) \
                or (self.max_lines is not None and 0 <= self.max_lines < num_lines):
            logger.info('Skip - File has invalid line count (%i): %s' %
                        (num_lines, input_content))
            return

        # Parse XML tree
        tree = etree.parse(input_content)
        sort = 0
        docs = []

        # Prepare docs
        for idx, n in enumerate(tree.xpath('norm')):
            # Extract law content (with html tags)
            content = self.get_node_content(n, 'textdaten/text/Content/*')

            if idx == 0:
                # Save book with the first element
                book = self.handle_law_book(n)

            # Append section to book object if section title is found
            section_title = (
                n.xpath('metadaten/gliederungseinheit/gliederungstitel/text()')
                or [None])[0]
            if section_title is not None:
                book.add_section(from_order=sort, title=section_title.strip())

            # Create law object
            doc = Law(
                doknr=n.get('doknr'),
                section=(n.xpath('metadaten/enbez/text()') or [None])[0],
                amtabk=(n.xpath('metadaten/amtabk/text()') or [None])[0],
                kurzue=(n.xpath('metadaten/kurzue/text()') or [None])[0],
                title=(n.xpath('metadaten/titel/text()') or [''])[0].strip(),
                order=sort,  # use in frontend for sorting
                content=content,
                footnotes=json.dumps(
                    self.get_node_content(n, 'textdaten/fussnoten/Content/*')),
                book=book,
            )

            # TODO is Verordnung? is Gesetz? strip <pre>?
            # slug (unique)
            slug = slugify(doc.section or '')

            if slug.startswith('ss-'):  # Is section-symbol
                slug = slug[3:]

            doc.slug = slug

            if slug != '':
                logger.debug('Pre-processed: %s' % doc)
                docs.append(doc)
                # Only laws that are kept advance the sort position
                sort += 1
            else:
                logger.warning('Ignore invalid document (no slug): %s' % doc)

        # Append to queue
        self.pre_processed_content.extend(docs)