Пример #1
0
def process(oldnode, informat):
    """Normalize one metadata field element in place.

    Elements whose tag is in the skip list below (``url``, ``doi``,
    ``bibkey``, ``attachment``, ...) are left untouched. ``author`` and
    ``editor`` elements are not normalized themselves; each of their children
    is processed recursively instead. Every other element is run through
    ``html.unescape``, ``curly_quotes`` and ``clean_unicode`` via ``maptext``,
    and ``title``/``booktitle`` elements are finally handed to ``protect``.

    Args:
        oldnode: Element to process. It must offer the ElementTree-style API
            used here: ``tag``, ``attrib``, ``itertext()``, ``len()`` and
            iteration over child elements.
        informat: Input format of the element's text. When it equals
            ``"latex"`` the flattened text is first converted with
            ``latex_to_xml`` and the result is swapped in with
            ``replace_node`` before the remaining steps run.

    Returns:
        None. All work happens through the helpers' effects on ``oldnode``.
    """
    skipped_tags = (
        'url', 'href', 'mrf', 'doi', 'bibtype', 'bibkey', 'revision',
        'erratum', 'attachment', 'paper', 'presentation', 'dataset',
        'software', 'video',
    )
    title_tags = ('title', 'booktitle')

    if oldnode.tag in skipped_tags:
        return

    if oldnode.tag in ('author', 'editor'):
        # Name containers: only their children carry text to normalize.
        for oldchild in oldnode:
            process(oldchild, informat=informat)
        return

    if informat == "latex":
        if len(oldnode) > 0:
            # Child markup is flattened away by itertext() below, so report it.
            logging.error("field has child elements {}".format(', '.join(
                child.tag for child in oldnode)))
        oldtext = ''.join(oldnode.itertext())
        # Request fixed-case handling only for the fields that are passed to
        # protect() further down; previously it was enabled for every field.
        newnode = latex_to_xml(
            oldtext,
            trivial_math=True,
            fixed_case=oldnode.tag in title_tags,
        )
        newnode.tag = oldnode.tag
        newnode.attrib.update(oldnode.attrib)
        # NOTE(review): the code keeps using ``oldnode`` afterwards, so
        # replace_node is presumably an in-place replacement -- confirm.
        replace_node(oldnode, newnode)

    maptext(oldnode, html.unescape)
    maptext(oldnode, curly_quotes)
    maptext(oldnode, clean_unicode)
    if oldnode.tag in title_tags:
        protect(oldnode)
Пример #2
0
def normalize(oldnode, informat):
    """
    Normalizes a single XML field element (such as a child of a paper entry).

    Behaviour by tag:
    - Tags in the skip list below (including 'paper' itself) are left untouched.
    - 'author' and 'editor' elements are not normalized directly; each of their
      child elements is normalized recursively.
    - Any other element is passed, via maptext, through html.unescape,
      curly_quotes and clean_unicode (HTML unescaping, quote/punctuation
      normalization and unicode cleanup, going by the helper names), and
      'title'/'booktitle' elements are finally handed to protect.

    If 'informat' is "latex", the element's flattened text is first converted
    with latex_to_xml and swapped in with replace_node. Note that, per the
    original documentation, these LaTeX operations are not idempotent.

    Returns None.
    """

    untouched_tags = (
        "url",
        "href",
        "mrf",
        "doi",
        "bibtype",
        "bibkey",
        "revision",
        "erratum",
        "attachment",
        "paper",
        "presentation",
        "dataset",
        "software",
        "video",
    )
    name_tags = ("author", "editor")
    title_tags = ("title", "booktitle")

    tag = oldnode.tag
    if tag in untouched_tags:
        return

    if tag in name_tags:
        # Only the parts nested inside a name container hold text.
        for part in oldnode:
            normalize(part, informat=informat)
        return

    if informat == "latex":
        if len(oldnode) > 0:
            # Nested elements are flattened by itertext() below; log them.
            child_tags = ", ".join(child.tag for child in oldnode)
            logging.error("field has child elements {}".format(child_tags))
        flat_text = "".join(oldnode.itertext())
        wants_fixed_case = oldnode.tag in title_tags
        converted = latex_to_xml(
            flat_text, trivial_math=True, fixed_case=wants_fixed_case)
        converted.tag = oldnode.tag
        converted.attrib.update(oldnode.attrib)
        replace_node(oldnode, converted)

    # The cleanup passes run in a fixed order, one maptext call each.
    for text_filter in (html.unescape, curly_quotes, clean_unicode):
        maptext(oldnode, text_filter)

    # Re-read the tag here, after any replacement above has happened.
    if oldnode.tag in title_tags:
        protect(oldnode)