# NOTE(review): Python 2 code ("except X, e" syntax).  This chunk was
# collapsed onto one physical line by extraction, and the final bare
# "except:" handler is truncated here, so the code is kept byte-identical.
# _readInput: validates that a Django request is text/plain UTF-8, parses
# its body as ANVL, and returns a dict of label/value pairs with each key
# and value passed through util.sanitizeXmlSafeCharset; on any problem it
# returns an "error: bad request - ..." string instead of raising.
# NOTE(review): sanitizing keys after ANVL parsing can silently collapse
# two labels that differ only in disallowed characters (acknowledged as an
# accepted edge case in the inline comment below).
def _readInput(request): if "CONTENT_TYPE" in request.META: ct = [w.strip() for w in request.META["CONTENT_TYPE"].split(";")] if ct[0] != "text/plain": return "error: bad request - unsupported content type" if len(ct) > 1 and ct[1].startswith("charset=") and\ ct[1][8:].upper() != "UTF-8": return "error: bad request - unsupported character encoding" try: # We'd like to call sanitizeXmlSafeCharset just once, before the # ANVL parsing, but the problem is that hex-percent-encoded # characters, when decoded, can result in additional disallowed # characters appearing. So we sanitize after ANVL parsing. # Note that it is possible here that two different labels, that # differ in only disallowed characters, will be silently # collapsed into one instead of resulting in an error. But # that's a real edge case, so we don't worry about it. return { util.sanitizeXmlSafeCharset(k):\ util.sanitizeXmlSafeCharset(v)\ for k, v in anvl.parse(request.body.decode("UTF-8")).items() } except UnicodeDecodeError: return "error: bad request - character decoding error" except anvl.AnvlParseException, e: return "error: bad request - ANVL parse error (%s)" % str(e) except:
def _buildDublinCoreRecord(identifier):
    """
    Builds an OAI Dublin Core (oai_dc) record for 'identifier' and returns
    the root <oai_dc:dc> element as an lxml Element.

    The record contains the identifier itself plus whichever of the kernel
    metadata elements (creator, title, publisher, date, type) are present.
    'identifier' is a project object exposing .identifier and
    .kernelMetadata() -- assumed to yield string attributes; TODO confirm.
    """
    root = lxml.etree.Element(
        "{http://www.openarchives.org/OAI/2.0/oai_dc/}dc",
        nsmap={
            "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
            "dc": "http://purl.org/dc/elements/1.1/",
        },
    )
    root.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
        "http://www.openarchives.org/OAI/2.0/oai_dc/ "
        + "http://www.openarchives.org/OAI/2.0/oai_dc.xsd"
    )

    def q(elementName):
        # Qualify a Dublin Core element name with its namespace.
        return "{http://purl.org/dc/elements/1.1/}" + elementName

    lxml.etree.SubElement(root, q("identifier")).text = identifier.identifier
    km = identifier.kernelMetadata()
    for e in ["creator", "title", "publisher", "date", "type"]:
        value = getattr(km, e)
        # Identity test ('is not None') instead of the original '!= None'.
        if value is not None:
            # Create the element once and fill in its text; the previous
            # version created a second element in the except branch, leaving
            # an empty duplicate attached to the record on failure.
            node = lxml.etree.SubElement(root, q(e))
            try:
                node.text = value
            except ValueError:
                # lxml raises ValueError when the text contains characters
                # not accepted by XML; retry with a sanitized copy.
                # sanitizeXmlSafeCharset replaces such characters with
                # spaces.  (Was a bare 'except:', which would also have
                # silently swallowed unrelated errors.)
                node.text = util.sanitizeXmlSafeCharset(value).strip()
    return root
# NOTE(review): this chunk was collapsed onto one physical line by
# extraction and is kept byte-identical.  It contains (1) the tail of a
# DataCite record-building function whose "def" is outside this view
# (schema validation error reporting, <identifier> population, schema
# location normalization, and re-sanitized XML serialization); (2) the
# complete helper _interpolate, which %-interpolates XML-escaped arguments
# into a template; and (3) the beginning of the module-level
# _metadataTemplate string, whose triple-quoted body is cut off at the
# chunk boundary.  Python 2 code ("except Exception, e", u"" literals,
# encoding=unicode).
# exceptions and so replace them. Too, the presence of such # characters can be the source of the problem, so explicitly # exposing them can be a help. assert False, e.message.encode("ASCII", "xmlcharrefreplace") finally: schema[1].release() i.attrib["identifierType"] = type i.text = identifier root.attrib["{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = ( "http://datacite.org/schema/kernel-%s " + "http://schema.datacite.org/meta/kernel-%s/metadata.xsd") % (version, version) try: # We re-sanitize the document because unacceptable characters can # be (and have been) introduced via XML character entities. return "<?xml version=\"1.0\"?>\n" + util.sanitizeXmlSafeCharset( lxml.etree.tostring(root, encoding=unicode)) except Exception, e: assert False, "XML serialization error: " + str(e) def _interpolate(template, *args): return template % tuple(util.xmlEscape(a) for a in args) _metadataTemplate = u"""<?xml version="1.0" encoding="UTF-8"?> <resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"> <identifier identifierType="%s">%s</identifier> <creators>
def formElementsToDataciteXml(d, shoulder=None, identifier=None):
    """
    The inverse of dataciteXmlToFormElements.  Dictionary entries not
    related to the DataCite metadata schema (Django formset *_FORMS
    entries, etc.) are removed.  Returns the serialized DataCite kernel-4
    XML document as a unicode string.
    """
    # Keep only entries whose key names a DataCite element; drop Django
    # formset management entries (*_FORMS).
    d = {
        k: v
        for (k, v) in d.iteritems()
        if "_FORMS" not in k and any(e in k for e in _elementList)
    }
    d = _addIdentifierInfo(d, shoulder, identifier)
    namespace = "http://datacite.org/schema/kernel-4"
    schemaLocation = "http://schema.datacite.org/meta/kernel-4/metadata.xsd"

    def q(elementName):
        # Qualify an element name with the DataCite kernel namespace.
        return "{%s}%s" % (namespace, elementName)

    def tagName(tag):
        # Strip the "{namespace}" prefix from a qualified tag name.
        return tag.split("}")[1]

    root = lxml.etree.Element(q("resource"), nsmap={None: namespace})
    root.attrib[
        "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation"] = (
        namespace + " " + schemaLocation)
    # Each form key encodes a path through the XML document, components
    # separated by "-": numeric components index into repeatable element
    # containers; a trailing repeat of the element name (or of its base
    # name before "_") marks the element's text node; any other final
    # component becomes an attribute.  NOTE(review): inferred from the
    # traversal below -- confirm against dataciteXmlToFormElements.
    for key, value in d.items():
        value = util.sanitizeXmlSafeCharset(value).strip()
        if value == "":
            # Empty values contribute nothing to the document.
            continue
        node = root
        while len(key) > 0:
            k, remainder = key.split("-", 1) if "-" in key else (key, "")
            if k in _elements or ("_" in k and k.split("_", 1)[0] in _elements):
                if tagName(node.tag) in _repeatableElementContainers:
                    # Inside a repeatable container the next key component
                    # is the 0-based child index; create any missing
                    # siblings up to that index.
                    i, remainder = remainder.split("-", 1)
                    i = int(i)
                    while len(node) <= i:
                        lxml.etree.SubElement(node, q(k))
                    node = node[i]
                    if remainder == k:
                        remainder = ""
                else:
                    # Non-repeatable element: reuse the child if it already
                    # exists, otherwise create it.
                    n = node.find(q(k))
                    if n != None:
                        node = n
                    else:
                        node = lxml.etree.SubElement(node, q(k))
                    if "_" in k and remainder == k.split("_", 1)[0]:
                        remainder = ""
                if remainder == "":
                    if k == "geoLocationPolygon":
                        # Replace the placeholder node with the polygon
                        # element built from the serialized value.
                        parent = node.getparent()
                        parent.insert(
                            parent.index(node) + 1,
                            geometry_util.polygonToDatacite(value)[0],
                        )
                        parent.remove(node)
                    else:
                        node.text = value
            else:
                # Component is not a known element: treat it as an
                # attribute of the current node.
                node.attrib[k] = value
            key = remainder

    def sortValue(node):
        # Sort key: (canonical element order, repetition number); a "_N"
        # tag suffix supplies the repetition number.
        v = tagName(node.tag)
        m = re.match(".*_(\d+)$", v)
        if m:
            return (_elements[v.split("_", 1)[0]], int(m.group(1)))
        else:
            return (_elements[v], 0)

    def sortChildren(node):
        # Recursively order children into the schema's canonical element
        # order.  Repeatable containers and geoLocationPolygon keep their
        # insertion order.
        if (tagName(node.tag) not in _repeatableElementContainers
                and tagName(node.tag) != "geoLocationPolygon"):
            children = node.getchildren()
            children.sort(key=lambda c: sortValue(c))
            for i, c in enumerate(children):
                node.insert(i, c)
        for c in node.iterchildren():
            sortChildren(c)

    sortChildren(root)
    # Strip the "_N" numbering suffixes that distinguished repeated
    # elements inside numbered containers, restoring the schema tag names.
    for tag in _numberedElementContainers:
        for node in root.xpath("//N:" + tag, namespaces={"N": namespace}):
            for t in _numberedElementContainers[tag]:
                for n in node.xpath(
                        "*[substring(local-name(), 1, %d) = '%s']"
                        % (len(t) + 1, t + "_")):
                    n.tag = n.tag.rsplit("_", 1)[0]
    return lxml.etree.tostring(root, encoding=unicode)
# NOTE(review): this chunk was collapsed onto one physical line by
# extraction and is kept byte-identical.  It contains (1) the tail of a
# Crossref deposit-normalizing function whose "def" is outside this view
# (sets the resource text to "(:tba)", asserts single <doi>/<timestamp>
# subelements under <doi_data>, normalizes the schemaLocation, and returns
# the re-sanitized serialized document); and (2) the complete module-level
# constant _titlePaths: XPaths, relative to a <doi_data> element, at which
# the corresponding resource title may be found in the Crossref 4.3.4
# deposit schema.  Python 2 code ("except Exception, e").
resource = resource[0] resource.text = "(:tba)" assert doiData.find("N:collection/N:item/N:doi", namespaces=ns) == None,\ "<doi_data> element contains more than one <doi> subelement" e = doiData.find("N:timestamp", namespaces=ns) if e != None: doiData.remove(e) assert doiData.find("N:timestamp", namespaces=ns) == None,\ "<doi_data> element contains more than one <timestamp> subelement" # Normalize schema declarations. root.attrib[_schemaLocation] =\ namespace + " " + (_schemaLocationTemplate % version) try: # We re-sanitize the document because unacceptable characters can # be (and have been) introduced via XML character entities. return _addDeclaration( util.sanitizeXmlSafeCharset( lxml.etree.tostring(root, encoding="unicode"))) except Exception, e: assert False, "XML serialization error: " + str(e) # In the Crossref deposit schema, version 4.3.4, the <doi_data> # element can occur in 20 different places. An analysis shows that # the resource title corresponding to the DOI being defined can be # found by one or more of the following XPaths relative to the # <doi_data> element. _titlePaths = [ "../N:titles/N:title", "../N:titles/N:original_language_title", "../N:proceedings_title", "../N:full_title", "../N:abbrev_title" ]