예제 #1
0
def construct_ziotype_data(session, log_scope,
                           document: etree.ElementBase) -> dict:
    """Build the payload dict for one zaak-informatieobjecttype element.

    The description is taken from the ``naam`` field under ``velden`` and
    passed through ``trim_string``; the sequence number comes from the
    ``volgnummer`` attribute of ``document`` itself; the direction comes
    from the optional ``type`` field, resolved by ``get_choice_field``
    with ``DEFAULT_RICHTING`` as default.

    :param session: import session handed through to the helper functions
    :param log_scope: prefix used by the helpers when they log
    :param document: element carrying a ``volgnummer`` attribute and a
        ``velden`` child
    :return: dict with the keys ``informatieobjecttype_omschrijving``,
        ``volgnummer`` and ``richting``
    """
    velden = document.find("velden")

    # NOTE(review): the log key below is ``resultaattypen`` although this
    # function builds zaakinformatieobjecttype data - confirm it is intended.
    raw_name = find(velden, "naam")
    description = trim_string(session, log_scope, raw_name, 80,
                              "omschrijving", ObjectTypenKeys.resultaattypen)

    # int() raises when the attribute is absent or not numeric.
    sequence_number = int(document.get("volgnummer"))

    direction = get_choice_field(
        session,
        f"{log_scope} richting",
        find(velden, "type", False),
        RichtingChoices.values,
        ObjectTypenKeys.zaakinformatieobjecttypen,
        default=DEFAULT_RICHTING,
    )

    # TODO no mapping for non-required fields
    # "statustype": "http://example.com"
    return {
        "informatieobjecttype_omschrijving": description,
        "volgnummer": sequence_number,
        "richting": direction,
    }
예제 #2
0
 def _get_info(self, link: etree.ElementBase) -> str:
     """Request the resource that a ``link`` element points to.

     The URL is the element's ``href`` attribute. When the URL contains
     both ``"details"`` and ``"ncshipment"`` the request is sent as a POST
     and additionally carries a ``Content-Type`` header equal to the link's
     ``media-type``; every other link is fetched with GET.

     :param link: element providing the ``href`` and ``media-type``
         attributes
     :return: whatever ``http(...)`` returns for the request
     :raises TypeError: if the element has no ``href`` attribute (the
         substring test runs against ``None``)
     """
     href = link.get("href")
     media_type = link.get("media-type")

     # Generator instead of a list: no throwaway list, and short-circuits.
     is_ncdetails = all(key in href for key in ("details", "ncshipment"))

     headers = {
         "Accept": media_type,
         # %-formatting kept on purpose: it behaves differently from an
         # f-string if ``authorization`` is ever a tuple.
         "Authorization": "Basic %s" % self.authorization,
         "Accept-language": "en-CA",
     }
     if is_ncdetails:
         headers["Content-Type"] = media_type

     return http(
         url=href,
         headers=headers,
         method="POST" if is_ncdetails else "GET",
     )
예제 #3
0
 def _cover_from_tuple(self, item: ElementBase, attributes):
     for attr in attributes:
         value = item.get(attr, None)
         if value is None:
             continue
         value = self.http.normalize_uri(value)
         test = self.http.check_url(value)
         if test:
             return value
     return None
예제 #4
0
def construct_statustype_data(session, log_scope,
                              statustype: etree.ElementBase) -> dict:
    """Build the payload dict for one statustype element.

    ``volgnummer`` is read from the element's attribute of the same name
    and converted with ``int``; the remaining values are looked up with
    ``find`` in the element's ``velden`` child.

    :param session: accepted for signature parity; not used here
    :param log_scope: accepted for signature parity; not used here
    :param statustype: element carrying a ``volgnummer`` attribute and a
        ``velden`` child
    :return: dict with ``volgnummer``, ``omschrijving``,
        ``omschrijvingGeneriek`` and ``statustekst``
    """
    velden = statustype.find("velden")

    payload = {"volgnummer": int(statustype.get("volgnummer"))}

    # (payload key, field name, extra positional args for ``find``)
    lookups = (
        ("omschrijving", "naam", ()),
        ("omschrijvingGeneriek", "naam-model", (False,)),
        ("statustekst", "bericht", (False,)),
    )
    for key, field_name, extra in lookups:
        payload[key] = find(velden, field_name, *extra)

    # "informeren": true
    return payload
예제 #5
0
    def run_event(event_node: etree.ElementBase, project_path: str) -> None:
        """Log an event node's description and run its commands as one chain.

        Does nothing when ``event_node`` is ``None``. Otherwise the text of
        every child accepted by ``is_command_node`` has each run of spaces,
        tabs, newlines and carriage returns collapsed to a single space;
        the results are joined with ``' && '`` and handed to
        ``ProcessManager.run_command`` together with ``project_path`` and a
        copy of the current process environment.
        """
        if event_node is None:
            return

        ProcessManager.log.info(event_node.get(XmlAttributeName.DESCRIPTION))

        whitespace_run: re.Pattern = re.compile('[ \t\n\r]+')
        env_snapshot: dict = os.environ.copy()

        steps = [
            whitespace_run.sub(' ', child.text)
            for child in event_node
            if is_command_node(child)
        ]

        ProcessManager.run_command(' && '.join(steps), project_path,
                                   env_snapshot)
예제 #6
0
    def create_child(self, element: etree.ElementBase):
        """
        Create the node object for a child element and attach it to this node.

        Comments and processing instructions are ignored. A ``ref`` element is resolved to the node that
        ``find_node_by_id`` returns for its ``id``. Any other element is instantiated from a class chosen in
        this order: the ``data`` node class when this node statically defines a ``Data`` attribute named like
        the tag, then ``child_lookup(..)``, then the node register. There is no need to call this function
        directly; to customise the lookup, override ``child_lookup(..)`` instead.

        Raises ``IdNotFoundError`` for a ``ref`` whose id is unknown and ``UnknownElementError`` when no node
        class can be found for the tag.
        """
        # Comments and processing instructions never become nodes.
        if element.tag is etree.Comment or element.tag is etree.PI:
            return

        if element.tag != "ref":
            # A Data attribute named after the tag takes precedence over every other lookup.
            if isinstance(inspect.getattr_static(self, element.tag, None), Data):
                from urban_journey.ujml.nodes.data import data as klass
            else:
                # Otherwise ask this (parent) node whether it knows the type.
                klass = self.child_lookup(element)

            # Populate the node register on first use.
            if len(node_register) == 0:
                update_plugins()

            # Fall back to the register when nothing more specific was found.
            if klass is None:
                if element.tag not in node_register:
                    # Node type was not found.
                    raise UnknownElementError(self.file_name, element.sourceline, element.tag)
                klass = node_register[element.tag]

            child = klass(element, self.root)
        else:
            # A ref element stands in for an already existing node.
            referenced_id = element.get("id")
            child = self.find_node_by_id(referenced_id)
            if child is None:
                raise IdNotFoundError(self.file_name, element.sourceline, referenced_id)

        # A node without an "element" attribute did not call super().__init__().
        if not hasattr(child, "element"):
            self.raise_exception(MissingSuperInitError, self.tag, element.tag)

        # Add child
        self.add_child(child)
예제 #7
0
 def parse_background(self, element: ElementBase) -> str:
     """
     Extract a background image URI from an element's inline ``style``.

     The ``style`` attribute is parsed with the parser returned by
     ``make_parser(None)`` and the first item of ``parse_style_attr(style)``
     is walked as the list of declarations:

     * ``background``: the value of the first token whose type is ``'URI'``
       is remembered; scanning of further declarations continues.
     * ``background-image``: the value of the declaration's first token is
       taken and scanning stops.

     :param element: element whose ``style`` attribute is inspected
     :return: ``self.http.normalize_uri(value)`` for the URI found, or ``''``
         when an ``IndexError`` is raised while walking the declarations.
         NOTE(review): when there is no ``style`` attribute or no matching
         declaration, ``normalize_uri`` is called with ``None`` - confirm
         that it accepts that.
     """
     style = element.get('style', None)
     value = None
     if style:
         css = make_parser(None)
         # The IndexError handler covers both the ``[0]`` on the parse result
         # and the ``declaration.value[0]`` lookup further down.
         try:  # do not touch this!
             for declaration in css.parse_style_attr(style)[0]:
                 if declaration.name == 'background':
                     for token in declaration.value:
                         if token.type == 'URI':
                             value = token.value
                             # Leaves only the token loop; later declarations
                             # are still examined and may overwrite ``value``.
                             break
                 if declaration.name == 'background-image':
                     value = declaration.value[0].value
                     # Leaves the declaration loop: this value is final.
                     break
         except IndexError:
             return ''
     return self.http.normalize_uri(value)
예제 #8
0
def get_codes(cmd: ET.ElementBase, attr_name: str) -> List[str]:
    """Return the non-empty comma-separated entries of an attribute.

    A missing attribute is treated as an empty string, so the result is an
    empty list. Entries are not stripped; only empty strings are dropped.
    """
    raw = cmd.get(attr_name, "")
    # filter(None, ...) discards the empty pieces produced by ",," or a
    # leading/trailing comma.
    return list(filter(None, raw.split(",")))
예제 #9
0
def construct_zaaktype_data(session, log_scope, process: etree.ElementBase,
                            processtype_year: int) -> dict:
    """Build the zaaktype payload dict from a ``process`` element.

    Values are read with ``find`` from the ``velden`` child of ``process``
    and post-processed by the helpers ``trim_string``, ``value_or_default``,
    ``get_choice_field``, ``get_duration``, ``get_boolean``, ``get_array``,
    ``get_date`` and ``get_procestype``. ``identificatie`` is the ``id``
    attribute of ``process`` and ``beginGeldigheid`` is
    ``session.job.start_date`` in ISO format.

    :param session: import session; passed to the logging helpers and the
        source of ``job.start_date``
    :param log_scope: prefix for the log messages written by this function
        and its helpers
    :param process: element carrying an ``id`` attribute and a ``velden``
        child
    :param processtype_year: forwarded unchanged to ``get_procestype``
    :return: dict keyed by the zaaktype field names listed in the return
        statement
    """
    fields = process.find("velden")

    # The literal 80 is handed to trim_string for both descriptions.
    # NOTE(review): presumably a maximum length - confirm against trim_string.
    omschrijving = trim_string(
        session,
        log_scope,
        find(fields, "kernomschrijving"),
        80,
        "omschrijving",
        ObjectTypenKeys.zaaktypen,
    )
    omschrijvingGeneriek = trim_string(
        session,
        log_scope,
        find(fields, "model-kernomschrijving", False),
        80,
        "omschrijvingGeneriek",
        ObjectTypenKeys.zaaktypen,
    )
    # "extern" when the category contains that word (case-insensitive),
    # "intern" otherwise.
    # NOTE(review): .lower() is called directly on the result of a find()
    # with a third argument of False - confirm find never returns None here.
    indicatie_intern_of_extern = ("extern" if "extern" in find(
        fields, "zaaktype-categorie", False).lower() else "intern")
    handeling_initiator = value_or_default(
        session,
        f"{log_scope} handelingInitiator",
        find(fields, "zaaktype-naam/structuur/handeling-initiator", False),
        DEFAULT_HANDELING_INITIATOR,
        ObjectTypenKeys.zaaktypen,
    )
    aanleiding = value_or_default(
        session,
        f"{log_scope} aanleiding",
        find(fields, "aanleiding", False),
        DEFAULT_AANLEIDING,
        ObjectTypenKeys.zaaktypen,
    )
    onderwerp = value_or_default(
        session,
        f"{log_scope} onderwerp",
        find(fields, "zaaktype-naam/structuur/onderwerp", False),
        DEFAULT_ONDERWERP,
        ObjectTypenKeys.zaaktypen,
    )
    handeling_behandelaar = value_or_default(
        session,
        f"{log_scope} handeling_behandelaar",
        find(fields, "zaaktype-naam/structuur/handeling-behandelaar", False),
        DEFAULT_HANDELING_BEHANDELAAR,
        ObjectTypenKeys.zaaktypen,
    )

    servicenorm = get_duration(
        find(fields, "afdoeningstermijn"),
        find(fields, "afdoeningstermijn-eenheid"),
    )
    doorlooptijd = get_duration(
        find(fields, "wettelijke-afdoeningstermijn", False),
        find(fields, "wettelijke-afdoeningstermijn-eenheid", False),
    )
    # Fall back to the same two fields that produced ``servicenorm`` when
    # the "wettelijke" variant yields a falsy duration, and log that this
    # substitution happened.
    if not doorlooptijd:
        doorlooptijd = get_duration(
            find(fields, "afdoeningstermijn"),
            find(fields, "afdoeningstermijn-eenheid"),
        )
        session.log_info(
            f'{log_scope} Used "afdoeningstermijn" ({doorlooptijd}) for "Zaaktype.doorlooptijd": Import has no value for "wettelijke-afdoeningstermijn".',
            ObjectTypenKeys.zaaktypen,
        )

    verlengings_termijn = get_duration(
        find(fields, "wettelijke-verdagingstermijn", False),
        find(fields, "wettelijke-verdagingstermijn-eenheid", False),
    )

    # The helper calls inside this literal run in the order written, after
    # all of the statements above.
    return {
        "identificatie":
        process.get("id"),
        "omschrijving":
        omschrijving,
        "omschrijvingGeneriek":
        omschrijvingGeneriek,
        "vertrouwelijkheidaanduiding":
        get_choice_field(
            session,
            f"{log_scope} vertrouwelijkheidaanduiding",
            find(fields, "vertrouwelijkheid", False),
            VertrouwelijkheidsAanduidingen.values,
            ObjectTypenKeys.zaaktypen,
            default=DEFAULT_VERTROUWELIJKHEID,
            required=True,
        ),
        "doel":
        find(fields, "naam"),
        "aanleiding":
        aanleiding,
        "toelichting":
        find(fields, "toelichting-proces", False),
        "indicatieInternOfExtern":
        indicatie_intern_of_extern,
        "handelingInitiator":
        handeling_initiator,
        "onderwerp":
        onderwerp,
        "handelingBehandelaar":
        handeling_behandelaar,
        "doorlooptijd":
        doorlooptijd,
        "opschortingEnAanhoudingMogelijk":
        get_boolean(find(fields, "aanhouden-mogelijk", False)),
        # An extension is reported as possible exactly when an extension
        # term was found.
        "verlengingMogelijk":
        bool(verlengings_termijn),
        "verlengingstermijn":
        verlengings_termijn,
        "trefwoorden":
        get_array(find(fields, "lokale-trefwoorden", False)),  # always empty?
        "publicatieIndicatie":
        get_boolean(find(fields, "publicatie-indicatie", False)),
        "publicatietekst":
        find(fields, "publicatietekst", False),
        "verantwoordingsrelatie":
        get_array(find(fields, "verantwoordingsrelatie",
                       False)),  # always empty?
        "selectielijstProcestype":
        get_procestype(process, processtype_year),
        "referentieproces": {
            "naam": find(fields, "ztc-procestype")
        },
        # Set during `load_data`
        # "catalogus": "",
        "beginGeldigheid":
        session.job.start_date.isoformat(),
        "eindeGeldigheid":
        None,
        "versiedatum":
        get_date(find(fields, "actueel-van")),
        "servicenorm":
        servicenorm,
        "productenOfDiensten": [],
        "gerelateerdeZaaktypen": [],
        "besluittypen": [],
        # "deelzaaktypen": [],
    }
예제 #10
0
    def process_xml_element(self, el: etree.ElementBase, event: str,
                            a: Dict[str, Any]) -> None:
        """
        Fold one XML parse event into the accumulator ``a`` (mutated in place).

        * ``end`` events of the known metadata tags store their value under
          ``a['metadata']``; the ``*_MILLIS`` tags store both the integer
          and the result of ``get_iso_datetime_from_millis``.
        * An element whose tag equals ``a['metadata']['doc_type']`` switches
          ``a['is_data']`` on at ``start`` and off at any other event.
        * While ``a['is_data']`` is truthy, the ``end`` event of any other
          element with text stores a dict under ``a['data'][<lowercased
          tag>]``. It always has a ``text`` entry and, when anything was
          extracted, a ``structured_content`` entry.

        :param el: element delivered by the XML iterator
        :param event: parse event name; the code tests for ``'start'`` and
            ``'end'``
        :param a: accumulator; the code reads and writes the keys
            ``metadata``, ``is_data`` and ``data``
        :return: None
        """
        if el.tag == 'CONTENT' and event == 'end':
            a['metadata']['record_id'] = el.get('RECORDID')

        elif el.tag == 'MASTERIDENTIFER' and event == 'end':
            a['metadata']['title'] = clean_text(el.text)

        elif el.tag == 'TYPE' and event == 'end':
            a['metadata']['doc_type'] = clean_text(el.text)

        elif el.tag == 'DOCUMENTID' and event == 'end':
            a['metadata']['doc_id'] = clean_text(el.text)

        elif el.tag == 'VERSION' and event == 'end':
            a['metadata']['version'] = clean_text(el.text)

        elif el.tag == 'AUTHOR' and event == 'end':
            a['metadata']['author'] = clean_text(el.text)

        elif el.tag == 'ENDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['end_timestamp_millis'] = millis
            a['metadata']['end_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'STARTTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['start_timestamp_millis'] = millis
            a['metadata']['start_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'CREATETIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['create_timestamp_millis'] = millis
            a['metadata']['create_time'] = get_iso_datetime_from_millis(millis)

        elif el.tag == 'LASTMODIFIEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['last_modified_timestamp_millis'] = millis
            a['metadata']['last_modified_time'] = get_iso_datetime_from_millis(
                millis)

        elif el.tag == 'RESOURCEPATH' and event == 'end':
            a['metadata']['doc_location_path'] = clean_text(el.text)

        elif el.tag == 'PUBLISHEDTIMESTAMP_MILLIS' and event == 'end':
            millis = int(clean_text(el.text))
            a['metadata']['published_timestamp_millis'] = millis
            a['metadata']['published_time'] = get_iso_datetime_from_millis(
                millis)

        # NOTE(review): this lookup raises KeyError if no TYPE element has
        # been seen yet - confirm the accumulator is pre-seeded with
        # 'doc_type'.
        elif el.tag == a['metadata']['doc_type']:
            a['is_data'] = (event == 'start')

        elif a['is_data'] and event == 'end' and el.text:
            # treat all text as html
            # lxml will automatically wrap plain text in a para, body and html tags
            structured_content = []
            text_list = []

            # First try the text as JSON; only when that fails is it run
            # through the HTML extractors.
            try:
                maybe_json = json.loads(el.text)
                structured_content.append({'type': 'json', 'json': maybe_json})
            except (JSONDecodeError, ValueError):
                extractors = [
                    ListExtractor(excluded_tags=['table']),
                    TableExtractor(),
                    TextExtractor(excluded_tags=[
                        'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
                    ]),
                    HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
                ]
                stream: IO[AnyStr] = BytesIO(
                    fix_content(el.text).encode('utf-8'))
                for ev, elem in self.element_iterator(stream, html=True):
                    process_html_element(elem, ev, extractors,
                                         structured_content, text_list)

                # re-extract content in single column tables used for layout purposes only
                html = None  # memoize
                # k[n] is the count recorded for structured_content[n]; the
                # sums over k below are used as offsets into text_list.
                k = []
                # NOTE(review): enumerate() keeps walking the list object
                # bound here even after ``structured_content`` is rebound
                # inside the loop, so ``i`` indexes the pre-splice list -
                # confirm this is intended when more than one layout table
                # occurs.
                for i, c in enumerate(structured_content):
                    typ = c['type']
                    if typ in ['text', 'heading']:
                        k.append(1)
                    elif typ == 'list':
                        k.append(len(c.get('items', [])))
                    elif typ == 'table':
                        k.append(
                            len(c.get('head', [])) + len(c.get('body', [])))
                        # A table with exactly one entry in 'fields' is
                        # treated as a layout table and re-extracted.
                        if len(c.get('fields', [])) == 1:
                            if not html:
                                # reset stream to reiterate
                                stream.seek(0)

                                # read stream into str and parse as html
                                html = lxml.html.fromstring(stream.read())

                            # find single column layout table
                            contents = html.xpath(
                                ('/descendant::table[{0}]/tbody/tr/td/*|' +
                                 '/descendant::table[{0}]/tr/td/*').format(
                                     c['index']))
                            # Re-parent the cell contents under a fresh div
                            # and run the same extractors over that subtree.
                            root = etree.Element('div')
                            root.extend(contents)
                            sc = []
                            tl = []
                            for evt, ele in etree.iterwalk(root,
                                                           events=('start',
                                                                   'end')):
                                process_html_element(ele, evt, extractors, sc,
                                                     tl)

                            # Replace the table entry, plus the j entries
                            # directly before it (j = number of items in the
                            # table's 'references'), with the re-extracted
                            # content; splice text_list at the matching
                            # offsets.
                            j = len(c.get('references', []))
                            structured_content = flatten([
                                structured_content[:(i - j)], sc,
                                structured_content[(i + 1):]
                            ])
                            text_list = flatten([
                                text_list[:sum(k[:(i - j)])], tl,
                                text_list[sum(k[:(i + 1)]):]
                            ])

            data = {}
            # A single text entry is stored bare; zero or several entries
            # are stored as the list itself.
            if len(text_list) == 1:
                data['text'] = text_list[0]
            else:
                data['text'] = text_list

            if structured_content:
                data['structured_content'] = structured_content

            a['data'][el.tag.lower()] = data
예제 #11
0
def process_xml_element(
    el: etree.ElementBase,
    event: str,
    accumulator: Dict[str, Any],
    excluded_html_tags: List[str],
) -> Dict[str, Any]:
    """
    Fold one XML parse event into a copy of the accumulated state.

    Stateful (each call builds on the previous result), so calls cannot be
    parallelized. The input accumulator is deep-copied and never mutated.

    :param el: XML element
    :param event: event type [start, end]
    :param accumulator: state accumulated so far; the keys ``metadata``,
        ``is_data`` and ``data`` are read and written
    :param excluded_html_tags: tags forwarded to ``element_iterator``
    :return: the updated copy of the accumulator
    """
    # Tags whose cleaned text is stored verbatim: tag -> metadata key.
    text_fields = {
        'MASTERIDENTIFER': 'title',
        'TYPE': 'doc_type',
        'DOCUMENTID': 'doc_id',
        'VERSION': 'version',
        'AUTHOR': 'author',
        'RESOURCEPATH': 'doc_location_path',
    }
    # Millisecond timestamps: tag -> (integer key, ISO datetime key).
    millis_fields = {
        'ENDTIMESTAMP_MILLIS': ('end_timestamp_millis', 'end_time'),
        'STARTTIMESTAMP_MILLIS': ('start_timestamp_millis', 'start_time'),
        'CREATETIMESTAMP_MILLIS': ('create_timestamp_millis', 'create_time'),
        'LASTMODIFIEDTIMESTAMP_MILLIS': ('last_modified_timestamp_millis',
                                         'last_modified_time'),
        'PUBLISHEDTIMESTAMP_MILLIS': ('published_timestamp_millis',
                                      'published_time'),
    }

    state = deepcopy(accumulator)
    tag = el.tag
    is_end = event == 'end'

    if is_end and tag == 'CONTENT':
        state['metadata']['record_id'] = el.get('RECORDID')

    elif is_end and tag in text_fields:
        state['metadata'][text_fields[tag]] = clean_text(el.text)

    elif is_end and tag in millis_fields:
        millis_key, iso_key = millis_fields[tag]
        millis = int(clean_text(el.text))
        state['metadata'][millis_key] = millis
        state['metadata'][iso_key] = get_iso_datetime_from_millis(millis)

    elif tag == state['metadata']['doc_type']:
        # Entering the document element turns data collection on; any other
        # event for it turns collection off again.
        state['is_data'] = (event == 'start')

    elif state['is_data'] and is_end and el.text:
        # treat all text as html
        # lxml will automatically wrap plain text in a para, body and html tags
        extractors = [
            ListExtractor(excluded_tags=['table']),
            TableExtractor(),
            TextExtractor(excluded_tags=[
                'ul', 'ol', 'table', 'title', 'h1', 'h2', 'h3', 'h4'
            ]),
            HeadingExtractor(excluded_tags=['ul', 'ol', 'table'])
        ]
        collected_structures = []
        collected_texts = []
        stream = BytesIO(fix_content(el.text).encode('utf-8'))
        for html_event, html_el in element_iterator(stream,
                                                    excluded_html_tags,
                                                    html=True):
            structured, text = process_html_element(html_el, html_event,
                                                    extractors)
            collected_structures.extend(structured)
            collected_texts.extend(text)

        # A single text entry is stored bare, anything else as a list.
        data = {
            'text': (collected_texts[0]
                     if len(collected_texts) == 1 else collected_texts)
        }
        if collected_structures:
            data['structured_content'] = collected_structures

        state['data'][tag.lower()] = data

    return state
예제 #12
0
 def _get_attr_as_bool(node: etree.ElementBase,
                       attribute_name: str,
                       default_value: str = 'false') -> bool:
     attr: str = node.get(attribute_name, default=default_value).casefold()
     return any([attr == 'true', attr == '1'])