# NOTE(review): Python 2 code ("except X, e" syntax).  This chunk was
# collapsed onto one physical line by extraction, and the final bare
# "except:" handler is truncated here, so the code is kept byte-identical.
# _readInput: validates that a Django request is text/plain UTF-8, parses
# its body as ANVL, and returns a dict of label/value pairs with each key
# and value passed through util.sanitizeXmlSafeCharset; on any problem it
# returns an "error: bad request - ..." string instead of raising.
# NOTE(review): sanitizing keys after ANVL parsing can silently collapse
# two labels that differ only in disallowed characters (acknowledged as an
# accepted edge case in the inline comment below).
def _readInput(request): if "CONTENT_TYPE" in request.META: ct = [w.strip() for w in request.META["CONTENT_TYPE"].split(";")] if ct[0] != "text/plain": return "error: bad request - unsupported content type" if len(ct) > 1 and ct[1].startswith("charset=") and\ ct[1][8:].upper() != "UTF-8": return "error: bad request - unsupported character encoding" try: # We'd like to call sanitizeXmlSafeCharset just once, before the # ANVL parsing, but the problem is that hex-percent-encoded # characters, when decoded, can result in additional disallowed # characters appearing. So we sanitize after ANVL parsing. # Note that it is possible here that two different labels, that # differ in only disallowed characters, will be silently # collapsed into one instead of resulting in an error. But # that's a real edge case, so we don't worry about it. return { util.sanitizeXmlSafeCharset(k):\ util.sanitizeXmlSafeCharset(v)\ for k, v in anvl.parse(request.body.decode("UTF-8")).items() } except UnicodeDecodeError: return "error: bad request - character decoding error" except anvl.AnvlParseException, e: return "error: bad request - ANVL parse error (%s)" % str(e) except:
def _buildDublinCoreRecord(identifier):
    """
    Builds an OAI Dublin Core (oai_dc) record for 'identifier' and returns
    the root <oai_dc:dc> element as an lxml Element.

    The record contains the identifier itself plus whichever of the kernel
    metadata elements (creator, title, publisher, date, type) are present.
    'identifier' is a project object exposing .identifier and
    .kernelMetadata() -- assumed to yield string attributes; TODO confirm.
    """
    root = lxml.etree.Element(
        "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc",
        nsmap={
            "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
            "dc": "http://purl.org/dc/elements/1.1/",
        },
    )
    root.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
        "http://www.openarchives.org/OAI/2.0/oai_dc/ "
        + "http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
    )

    def q(elementName):
        # Qualify a Dublin Core element name with its namespace.
        return "{http://purl.org/dc/elements/1.1/}" + elementName

    lxml.etree.SubElement(root, q("identifier")).text = identifier.identifier
    km = identifier.kernelMetadata()
    for e in ["creator", "title", "publisher", "date", "type"]:
        value = getattr(km, e)
        # Identity test ('is not None') instead of the original '!= None'.
        if value is not None:
            # Create the element once and fill in its text; the previous
            # version created a second element in the except branch, leaving
            # an empty duplicate attached to the record on failure.
            node = lxml.etree.SubElement(root, q(e))
            try:
                node.text = value
            except ValueError:
                # lxml raises ValueError when the text contains characters
                # not accepted by XML; retry with a sanitized copy.
                # sanitizeXmlSafeCharset replaces such characters with
                # spaces.  (Was a bare 'except:', which would also have
                # silently swallowed unrelated errors.)
                node.text = util.sanitizeXmlSafeCharset(value).strip()
    return root
# NOTE(review): this chunk was collapsed onto one physical line by
# extraction and is kept byte-identical.  It contains (1) the tail of a
# DataCite record-building function whose "def" is outside this view
# (schema validation error reporting, <identifier> population, schema
# location normalization, and re-sanitized XML serialization); (2) the
# complete helper _interpolate, which %-interpolates XML-escaped arguments
# into a template; and (3) the beginning of the module-level
# _metadataTemplate string, whose triple-quoted body is cut off at the
# chunk boundary.  Python 2 code ("except Exception, e", u"" literals,
# encoding=unicode).
# exceptions and so replace them. Too, the presence of such # characters can be the source of the problem, so explicitly # exposing them can be a help. assert False, e.message.encode("ASCII", "xmlcharrefreplace") finally: schema[1].release() i.attrib["identifierType"] = type i.text = identifier root.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = ( "http://datacite.org/schema/kernel-%s " + "http://schema.datacite.org/meta/kernel-%s/metadata.xsd") % (version, version) try: # We re-sanitize the document because unacceptable characters can # be (and have been) introduced via XML character entities. return "<?xml version=\"1.0\"?>\n" + util.sanitizeXmlSafeCharset( lxml.etree.tostring(root, encoding=unicode)) except Exception, e: assert False, "XML serialization error: " + str(e) def _interpolate(template, *args): return template % tuple(util.xmlEscape(a) for a in args) _metadataTemplate = u"""<?xml version="1.0" encoding="UTF-8"?> <resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"> <identifier identifierType="%s">%s</identifier> <creators>
def formElementsToDataciteXml(d, shoulder=None, identifier=None):
    """
    The inverse of dataciteXmlToFormElements.  Dictionary entries not
    related to the DataCite metadata schema (Django formset *_FORMS
    entries, etc.) are removed.  Returns the serialized DataCite kernel-4
    XML document as a unicode string.
    """
    # Keep only entries whose key names a DataCite element; drop Django
    # formset management entries (*_FORMS).
    d = {
        k: v
        for (k, v) in d.iteritems()
        if "_FORMS" not in k and any(e in k for e in _elementList)
    }
    d = _addIdentifierInfo(d, shoulder, identifier)
    namespace = "http://datacite.org/schema/kernel-4"
    schemaLocation = "http://schema.datacite.org/meta/kernel-4/metadata.xsd"

    def q(elementName):
        # Qualify an element name with the DataCite kernel namespace.
        return "{%s}%s" % (namespace, elementName)

    def tagName(tag):
        # Strip the "{namespace}" prefix from a qualified tag name.
        return tag.split("}")[1]

    root = lxml.etree.Element(q("resource"), nsmap={None: namespace})
    root.attrib[
        "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
        namespace + " " + schemaLocation)
    # Each form key encodes a path through the XML document, components
    # separated by "-": numeric components index into repeatable element
    # containers; a trailing repeat of the element name (or of its base
    # name before "_") marks the element's text node; any other final
    # component becomes an attribute.  NOTE(review): inferred from the
    # traversal below -- confirm against dataciteXmlToFormElements.
    for key, value in d.items():
        value = util.sanitizeXmlSafeCharset(value).strip()
        if value == "":
            # Empty values contribute nothing to the document.
            continue
        node = root
        while len(key) > 0:
            k, remainder = key.split("-", 1) if "-" in key else (key, "")
            if k in _elements or ("_" in k and k.split("_", 1)[0] in _elements):
                if tagName(node.tag) in _repeatableElementContainers:
                    # Inside a repeatable container the next key component
                    # is the 0-based child index; create any missing
                    # siblings up to that index.
                    i, remainder = remainder.split("-", 1)
                    i = int(i)
                    while len(node) <= i:
                        lxml.etree.SubElement(node, q(k))
                    node = node[i]
                    if remainder == k:
                        remainder = ""
                else:
                    # Non-repeatable element: reuse the child if it already
                    # exists, otherwise create it.
                    n = node.find(q(k))
                    if n != None:
                        node = n
                    else:
                        node = lxml.etree.SubElement(node, q(k))
                    if "_" in k and remainder == k.split("_", 1)[0]:
                        remainder = ""
                if remainder == "":
                    if k == "geoLocationPolygon":
                        # Replace the placeholder node with the polygon
                        # element built from the serialized value.
                        parent = node.getparent()
                        parent.insert(
                            parent.index(node) + 1,
                            geometry_util.polygonToDatacite(value)[0],
                        )
                        parent.remove(node)
                    else:
                        node.text = value
            else:
                # Component is not a known element: treat it as an
                # attribute of the current node.
                node.attrib[k] = value
            key = remainder

    def sortValue(node):
        # Sort key: (canonical element order, repetition number); a "_N"
        # tag suffix supplies the repetition number.
        v = tagName(node.tag)
        m = re.match(".*_(\d+)$", v)
        if m:
            return (_elements[v.split("_", 1)[0]], int(m.group(1)))
        else:
            return (_elements[v], 0)

    def sortChildren(node):
        # Recursively order children into the schema's canonical element
        # order.  Repeatable containers and geoLocationPolygon keep their
        # insertion order.
        if (tagName(node.tag) not in _repeatableElementContainers
                and tagName(node.tag) != "geoLocationPolygon"):
            children = node.getchildren()
            children.sort(key=lambda c: sortValue(c))
            for i, c in enumerate(children):
                node.insert(i, c)
        for c in node.iterchildren():
            sortChildren(c)

    sortChildren(root)
    # Strip the "_N" numbering suffixes that distinguished repeated
    # elements inside numbered containers, restoring the schema tag names.
    for tag in _numberedElementContainers:
        for node in root.xpath("//N:" + tag, namespaces={"N": namespace}):
            for t in _numberedElementContainers[tag]:
                for n in node.xpath(
                        "*[substring(local-name(), 1, %d) = '%s']"
                        % (len(t) + 1, t + "_")):
                    n.tag = n.tag.rsplit("_", 1)[0]
    return lxml.etree.tostring(root, encoding=unicode)
# NOTE(review): this chunk was collapsed onto one physical line by
# extraction and is kept byte-identical.  It contains (1) the tail of a
# Crossref deposit-normalizing function whose "def" is outside this view
# (sets the resource text to "(:tba)", asserts single <doi>/<timestamp>
# subelements under <doi_data>, normalizes the schemaLocation, and returns
# the re-sanitized serialized document); and (2) the complete module-level
# constant _titlePaths: XPaths, relative to a <doi_data> element, at which
# the corresponding resource title may be found in the Crossref 4.3.4
# deposit schema.  Python 2 code ("except Exception, e").
resource = resource[0] resource.text = "(:tba)" assert doiData.find("N:collection/N:item/N:doi", namespaces=ns) == None,\ "<doi_data> element contains more than one <doi> subelement" e = doiData.find("N:timestamp", namespaces=ns) if e != None: doiData.remove(e) assert doiData.find("N:timestamp", namespaces=ns) == None,\ "<doi_data> element contains more than one <timestamp> subelement" # Normalize schema declarations. root.attrib[_schemaLocation] =\ namespace + " " + (_schemaLocationTemplate % version) try: # We re-sanitize the document because unacceptable characters can # be (and have been) introduced via XML character entities. return _addDeclaration( util.sanitizeXmlSafeCharset( lxml.etree.tostring(root, encoding="unicode"))) except Exception, e: assert False, "XML serialization error: " + str(e) # In the Crossref deposit schema, version 4.3.4, the <doi_data> # element can occur in 20 different places. An analysis shows that # the resource title corresponding to the DOI being defined can be # found by one or more of the following XPaths relative to the # <doi_data> element. _titlePaths = [ "../N:titles/N:title", "../N:titles/N:original_language_title", "../N:proceedings_title", "../N:full_title", "../N:abbrev_title" ]