예제 #1
0
    def bulk_index(self, doc, doc_type=None, doc_index=None, doc_id=None):
        """Queue one document for the next Elasticsearch bulk request.

        Two newline-terminated JSON lines are appended to ``self.bulk_data``:
        the ``index`` action line followed by the document source. Once
        ``self.bulk_size`` reaches ``self.items_per_batch`` the buffer is
        handed to ``self.send_data_to_es()``.

        :param doc: Django model instance or plain dict to index
        :param doc_type: ES document type (defaults to ``self.es_type``)
        :param doc_index: ES index name (defaults to ``self.es_index``)
        :param doc_id: Optional explicit document id
        :raises ProcessingError: If no document type is available or ``doc``
            is neither a dict nor a Django model
        """
        if doc_type is None:
            if self.es_type is None:
                raise ProcessingError('No ES document type defined')
            doc_type = self.es_type

        if doc_index is None:
            doc_index = self.es_index

        action = {'index': {'_index': doc_index, '_type': doc_type}}

        if doc_id is not None:
            action['index']['_id'] = doc_id

        # Serialise the document BEFORE touching the buffer: if conversion or
        # JSON encoding fails, no orphaned action line may be left behind,
        # otherwise the action/source pairs of the next bulk request would be
        # shifted by one line.
        if isinstance(doc, dict):
            doc_str = json.dumps(doc)
        elif isinstance(doc, Model):
            logger.debug('Convert Django model: %s', doc)

            # Models may list fields that must not be indexed
            if hasattr(doc, 'es_fields_exclude'):
                doc_dict = model_to_dict(doc, exclude=doc.es_fields_exclude)
            else:
                doc_dict = model_to_dict(doc)

            # Optional hook that receives the dict before it is serialised
            pre_index = getattr(doc, "pre_index", None)
            if callable(pre_index):
                pre_index(doc_dict)

            doc_str = json.dumps(doc_dict)
        else:
            raise ProcessingError('Cannot transform to JSON: %s' % doc)

        # Action line and source line are always appended together
        self.bulk_data += json.dumps(action) + '\n' + doc_str + '\n'
        self.bulk_size += 1

        if self.bulk_size >= self.items_per_batch:
            self.send_data_to_es()
예제 #2
0
    def get_input(self) -> list:
        """Return the input items selected by ``self.input_selector``.

        The items come from ``get_input_content_from_selector``; the first
        ``self.input_start`` items are skipped and, if ``self.input_limit``
        is positive, at most that many items are returned.

        :return: Selected input items
        :raises ProcessingError: If no selector is set or nothing is left
            after skipping ``self.input_start`` items
        """
        selector = self.input_selector
        if selector is None:
            raise ProcessingError('input_selector is not set')

        selected = self.get_input_content_from_selector(selector)
        remaining = selected[self.input_start:]

        if len(remaining) == 0:
            raise ProcessingError('Input selector is empty: %s' %
                                  self.input_selector)

        limit = self.input_limit
        # A limit of zero or below means "no limit"
        return remaining[:limit] if limit > 0 else remaining
예제 #3
0
    def handle_single_law_ref(self, ref_str, law_ids):
        """Turn a single law reference into a law id and append it.

        The reference is matched with ``get_law_ref_match_single``. On a
        match a dict with ``book``, ``sect`` and ``type`` is appended to
        ``law_ids``; without a match a "not matched" placeholder entry is
        appended instead and a warning is logged.

        :param ref_str: Reference text
        :param law_ids: List the resulting id is appended to (modified in place)
        :return: The same ``law_ids`` list
        :raises ProcessingError: If the match has no book or no section
        """
        logger.debug('Single ref found in: %s' % ref_str)

        match = self.get_law_ref_match_single(ref_str)

        # No match: record a placeholder so the reference is not lost
        if match is None:
            law_ids.append({
                'book': 'not matched',
                'sect': 'NOT MATCHED (single) %s ' % ref_str
            })
            logger.warning('Law ID could not be matched.')
            return law_ids

        # Book and section are both mandatory (only a single result possible)
        matched_book = match.group('book')
        if matched_book is None:
            raise ProcessingError('Ref book is not set: %s ' % ref_str)

        matched_sect = match.group('sect')
        if matched_sect is None:
            raise ProcessingError('Ref sect is not set')

        section = str(matched_sect)

        # Optional letter suffix of the section, e.g. the "d" in "3d"
        suffix = match.group('sect_az')
        if suffix is not None:
            section = section + suffix.lower()

        law_id = {'book': matched_book.lower(), 'sect': section, 'type': 'law'}
        logger.debug('Law ID: %s' % law_id)
        law_ids.append(law_id)

        return law_ids
예제 #4
0
    def set_es_url(self, url):
        """Parse an ES url of the form ``scheme://host[:port]/index``.

        Stores scheme, host, port and index name on the instance and builds
        ``self.es_url`` (the url without the index path).

        :param url: Elasticsearch url including the index name as path
        :raises ProcessingError: If the path does not consist of exactly one
            segment (the index name)
        """
        o = urlparse(url)

        self.es_scheme = o.scheme
        self.es_host = o.hostname
        self.es_port = o.port

        p = o.path.split('/')

        # A path of "/<index>" splits into ['', '<index>']
        if len(p) == 2:
            self.es_index = p[1]
        else:
            raise ProcessingError('Cannot extract index from ES url: %s' % url)

        if self.es_port is None:
            # urlparse yields port=None when the url carries no explicit port;
            # formatting None with %i would raise TypeError, so omit the port.
            self.es_url = '%s://%s' % (self.es_scheme, self.es_host)
        else:
            self.es_url = '%s://%s:%i' % (self.es_scheme, self.es_host,
                                          self.es_port)
예제 #5
0
    def handle_input(self, input_content: str) -> None:
        """Parse one law XML file and queue its laws for further processing.

        The first ``norm`` element is passed to ``handle_law_book`` to create
        the book; one ``Law`` is created per ``norm`` element. Laws with a
        non-empty slug are appended to ``self.pre_processed_content``; laws
        without a slug are skipped with a warning. Files whose line count is
        outside ``self.min_lines`` / ``self.max_lines`` are skipped entirely.

        :param input_content: File path to law XML file
        :raises ProcessingError: If ``input_content`` is not a file
        :return:
        """

        logger.debug('Reading from %s' % input_content)

        # File exist?
        if not os.path.isfile(input_content):
            raise ProcessingError('Is not file: %s' % input_content)

        # Count lines; the context manager closes the handle right away
        # instead of leaking it until garbage collection.
        with open(input_content) as input_file:
            num_lines = sum(1 for _ in input_file)

        # Skip if lines count is invalid (negative bounds disable a check)
        if (self.min_lines is not None and self.min_lines >= 0 and num_lines < self.min_lines) \
                or (self.max_lines is not None and 0 <= self.max_lines < num_lines):
            logger.info('Skip - File has invalid line count (%i): %s' %
                        (num_lines, input_content))
            return

        # Parse XML tree
        tree = etree.parse(input_content)
        sort = 0
        docs = []

        # Prepare docs
        for idx, n in enumerate(tree.xpath('norm')):
            # Extract law content (with html tags)
            content = self.get_node_content(n, 'textdaten/text/Content/*')

            if idx == 0:
                # Save book with the first element
                book = self.handle_law_book(n)

            # Append section to book object if section title is found
            section_title = (
                n.xpath('metadaten/gliederungseinheit/gliederungstitel/text()')
                or [None])[0]
            if section_title is not None:
                book.add_section(from_order=sort, title=section_title.strip())

            # Create law object
            doc = Law(
                doknr=n.get('doknr'),
                enbez=(n.xpath('metadaten/enbez/text()') or [None])[0],
                amtabk=(n.xpath('metadaten/amtabk/text()') or [None])[0],
                kurzue=(n.xpath('metadaten/kurzue/text()') or [None])[0],
                title=(n.xpath('metadaten/titel/text()') or [''])[0].strip(),
                order=sort,  # use in frontend for sorting
                content=content,
                footnotes=json.dumps(
                    self.get_node_content(n, 'textdaten/fussnoten/Content/*')),
                book=book,
            )

            # TODO is Verordnung? is Gesetz? strip <pre>?
            # slug (unique)
            slug = slugify(doc.enbez or '')

            if slug[:3] == 'ss-':  # Is section-symbol
                slug = slug[3:]

            doc.slug = slug

            if slug != '':
                logger.debug('Pre-processed: %s' % doc)
                docs.append(doc)
                # The order counter only advances for documents that are kept
                sort += 1
            else:
                logger.warning('Ignore invalid document (no slug): %s' % doc)

        # Append to queue
        self.pre_processed_content.extend(docs)
예제 #6
0
    def handle_law_book(self, node) -> LawBook:
        """Create and save a ``LawBook`` from the metadata of an XML node.

        Reads code (``jurabk``), title (``langue``), book-level footnotes and
        the changelog (``standangabe``) from ``node``. The revision date is
        taken from the last changelog entry of type ``Stand`` that contains a
        ``D.M.YYYY`` date.

        :param node: XML element queried via ``xpath`` for the book metadata
        :return: The saved book instance
        :raises ProcessingError: If saving the book fails with a DatabaseError
        """
        jurabk = (node.xpath('metadaten/jurabk/text()') or [None])[0]

        changelog_comments = node.xpath(
            'metadaten/standangabe/standkommentar/text()')
        changelog_types = node.xpath('metadaten/standangabe/standtyp/text()')

        changelog = []
        revision_date_str = None

        # Pair types with comments by position. zip() stops at the shorter
        # list, so a comment without a matching type is dropped instead of
        # raising IndexError.
        for change_type, comment in zip(changelog_types, changelog_comments):
            changelog.append({'type': change_type, 'text': comment})

            if change_type == 'Stand':
                revision_date_str = comment

        revision_date = None
        if revision_date_str is not None:
            match = re.search(
                r'(?P<day>[0-9]{1,2})\.(?P<month>[0-9]{1,2})\.(?P<year>[0-9]{4})',
                revision_date_str)
            if match:
                try:
                    # Revision date as string (datetime.date is not JSON serializable)
                    revision_date = datetime.date(
                        int(match.group('year')),
                        int(match.group('month')),
                        int(match.group('day'))).strftime('%Y-%m-%d')
                except ValueError:
                    # Digits matched but do not form a real calendar date
                    # (e.g. 31.02.2019): treat like a missing revision date.
                    logger.warning('Ignore invalid revision date: %s' %
                                   revision_date_str)

        book_title_match = node.xpath('metadaten/langue/text()')

        if book_title_match:
            book_title = book_title_match[0].replace(
                '\n', ' ')  # replace line breaks
        else:
            book_title = None

        book = LawBook(
            title=book_title,
            code=jurabk,
            slug=slugify(jurabk),
            footnotes=json.dumps(
                self.get_node_content(
                    node, 'textdaten/fussnoten/Content/*')),  # On book level?
            changelog=json.dumps(changelog)
        )

        if revision_date is not None:
            # TODO raise error if no revision date is provided?
            book.revision_date = revision_date

        try:
            book.save()
        except DatabaseError as e:
            # TODO set latest depending on revision - check first if other books exist?
            # Chain the original error so the database traceback is kept
            raise ProcessingError('Cannot save book: %s' % e) from e

        return book
예제 #7
0
    def handle_multiple_law_refs(self, ref_str, law_ids):
        """Resolve a reference to several sections into individual law ids.

        Every match from ``get_law_ref_match_multi`` yields one id. A match
        whose delimiter is ``bis`` additionally yields ids for all section
        numbers strictly between the previous section and this one. Matches
        without a book take the book of the nearest following match that has
        one. The ids are appended to ``law_ids`` in reverse match order.

        :param ref_str: Reference text
        :param law_ids: List the resulting ids are appended to (modified in place)
        :return: The same ``law_ids`` list
        :raises ProcessingError: If a match has no section, a range has no
            start section, or no book can be determined
        """
        # Search for multiple refs
        mms = self.get_law_ref_match_multi(ref_str)

        ids_tmp = []
        prev_sect = None
        prev_book = None

        logger.debug('Multi refs found in: %s' % ref_str)

        # Loop over all results
        for m in mms:

            # If book is not set, use __placeholder__ and replace later
            book = m.group('book')
            book = book.lower() if book is not None else '__book__'

            # Section must exist
            if m.group('sect') is None:
                raise ProcessingError('Ref sect is not set')
            sect = str(m.group('sect'))

            if m.group('sect_az') is not None:
                sect += m.group('sect_az').lower()

            law_id = {'book': book, 'sect': sect, 'type': 'law'}

            logger.debug('Law ID found: %s' % law_id)

            # Check for section ranges
            if m.group('delimiter') == 'bis':
                if prev_sect is None:
                    # A range needs a preceding section to start from
                    raise ProcessingError(
                        'Section range without start: %s' % ref_str)

                logger.debug('Handle section range - Add ids from %s to %s',
                             prev_sect, sect)
                # TODO how to handle az sects
                # Only the numeric part is used for the range bounds
                range_start = int(re.sub('[^0-9]', '', prev_sect))
                range_end = int(re.sub('[^0-9]', '', sect))

                for between_sect in range(range_start + 1, range_end):
                    ids_tmp.append({
                        'book': prev_book,
                        # str, like every other 'sect' value in the result
                        'sect': str(between_sect),
                        'type': 'law'
                    })
            else:
                prev_sect = sect
                prev_book = book

            ids_tmp.append(law_id)

        logger.debug('All law ids found: %s' % ids_tmp)

        # Walk backwards so that a book named later in the reference fills
        # the placeholders of the sections listed before it.
        book = None
        for id_tmp in reversed(ids_tmp):
            if id_tmp['book'] != '__book__':
                book = id_tmp['book']
            elif book is not None:
                id_tmp['book'] = book
            else:
                raise ProcessingError(
                    'Cannot determine law book (Should never happen): %s' %
                    ref_str)

            law_ids.append(id_tmp)

        return law_ids
예제 #8
0
    def extract_law_refs(self,
                         referenced_by: Case,
                         content: str,
                         key: int = 0):
        """Find law references in ``content`` and replace them with markers.

        Every match of ``get_law_ref_regex`` is resolved to law ids (single
        references start with ``§`` or ``Art.``, multi references with
        ``§§``), wrapped in a ``CaseReferenceMarker`` and substituted into the
        content via the marker's ``replace_content``.

        Examples of supported references:

        § 3d AsylG
        § 123 VwGO
        §§ 3, 3b AsylG
        § 77 Abs. 1 Satz 1, 1. Halbsatz AsylG
        § 3 Abs. 1 AsylG
        § 77 Abs. 2 AsylG
        § 113 Abs. 5 Satz 1 VwGO
        § 3 Abs. 1 Nr. 1 i.V.m. § 3b AsylG
        § 3a Abs. 1 und 2 AsylG
        §§ 154 Abs. 1 VwGO
        § 83 b AsylG
        § 167 VwGO iVm §§ 708 Nr. 11, 711 ZPO
        § 167 VwGO i.V.m. §§ 708 Nr. 11, 711 ZPO
        §§ 167 Abs. 2 VwGO, 708 Nr. 11, 711 ZPO
        §§ 52 Abs. 1; 53 Abs. 2 Nr. 1; 63 Abs. 2 GKG
        § 6 Abs. 5 Satz 1 LBO
        §§ 80 a Abs. 3, 80 Abs. 5 VwGO
        § 1 Satz 2 SbStG
        § 2 ZWStS
        § 6 Abs. 2 S. 2 ZWStS

        TODO all law-book jurabk

        :link https://www.easy-coding.de/Thread/5536-RegExp-f%C3%BCr-Gesetze/

        :param referenced_by: Case the created markers are attached to
        :param content: Text to search for law references
        :param key: Offset added to the running marker count that is passed
            to ``replace_content``
        :raises ProcessingError: If a match starts with neither ``§``,
            ``Art.`` nor ``§§``
        :return: Tuple of (content with replaced references, list of markers)
        """

        logger.debug('Extracting law references')

        markers = []
        matches = list(re.finditer(self.get_law_ref_regex(), content))
        offset = 0

        logger.debug('Current content value: %s' % content)
        logger.debug('Law refs found: %i' % len(matches))

        for found in matches:
            ref_text = str(found.group(0)).strip()

            # Pick the handler for single or multi refs
            if re.match(r'^(Art\.|§)\s', ref_text):
                resolve = self.handle_single_law_ref
            elif re.match(r'^§§\s', ref_text):
                resolve = self.handle_multiple_law_refs
            else:
                raise ProcessingError('Unsupported ref beginning: %s' %
                                      ref_text)

            resolved_ids = resolve(ref_text, [])

            marker = CaseReferenceMarker(referenced_by=referenced_by,
                                         text=ref_text,
                                         start=found.start(),
                                         end=found.end(),
                                         line=0)  # TODO
            marker.set_uuid()
            marker.set_references(resolved_ids)
            markers.append(marker)

            # The marker count (including this marker) is added to the key
            content, offset = marker.replace_content(content, offset,
                                                     key + len(markers))

        return content, markers
예제 #9
0
    def find_court(query) -> Court:
        """Find the court that matches the given query.

        Lookup order: exact ``code``, exact ``name`` (only for names without
        whitespace), then court type combined with a state and finally court
        type combined with a city, both derived from the name.

        Example court names:
        - Oberverwaltungsgericht für das Land Schleswig-Holstein
        - VG Magdeburg
        - {"name": "OVG L\u00fcneburg 5. Senat"}

        NOTE(review): takes no ``self``; presumably declared as a static
        method outside this view - confirm.

        :param query: Dict(name, code)
        :raises ProcessingError: If ``name`` is missing from the query or no
            court type can be extracted from the name
        :raises Court.DoesNotExist: If no lookup finds a court
        :return: The matching court
        """

        if 'code' in query:
            # Find based on code (EuGH, ...)
            try:
                return Court.objects.get(code=query['code'])
            except Court.DoesNotExist:
                pass

        if 'name' not in query:
            raise ProcessingError('Field name not in query')

        if ' ' not in query['name']:
            # Find based on name if name does not contain whitespaces
            try:
                return Court.objects.get(name=query['name'])
            except Court.DoesNotExist:
                pass

        # Determine type
        court_type = Court.extract_type_code_from_name(query['name'])

        if court_type is None:
            raise ProcessingError('Court type not found')

        location_levels = CourtTypes().get_type(court_type)['levels']

        # Look for states
        if CourtLocationLevel.STATE in location_levels:
            state_id_mapping = {}
            for state_id, state_name in State.objects.values_list('id', 'name'):
                if state_name != '':
                    state_id_mapping[state_name] = state_id

                    # Add variations, e.g. Hamburg_er, Holstein_isches
                    for v in ['es', 'er', 'isches']:
                        state_id_mapping[state_name + v] = state_id

            state_id = find_from_mapping(query['name'], state_id_mapping)

            if state_id is not None:
                try:
                    logger.debug('Look for state=%i, type=%s' %
                                 (state_id, court_type))
                    return Court.objects.get(state_id=state_id,
                                             court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Look for cities
        if CourtLocationLevel.CITY in location_levels:
            city_id_mapping = {}
            for city_id, city_name in City.objects.values_list('id', 'name'):
                if city_name != '':
                    city_id_mapping[city_name] = city_id

            city_id = find_from_mapping(query['name'], city_id_mapping)

            if city_id is not None:
                try:
                    logger.debug('Look for city=%i, type=%s' %
                                 (city_id, court_type))
                    return Court.objects.get(city_id=city_id,
                                             court_type=court_type)
                except Court.DoesNotExist:
                    pass

        # Nothing found
        raise Court.DoesNotExist