def _detectAndValidate(self, *args, **kwargs):
    """Validate the embedded record of an upload document against all schemas.

    Every positional and keyword argument is inspected. For each lxml
    ``_ElementTree`` found (normally exactly one: the complete upload
    document) the text of its ``record`` part is parsed into an lxml node,
    because the metadata to validate is only available there as text and not
    as an lxml object. Each ``(name, xpath, schema)`` entry of
    ``self._xmlSchemas`` is then applied to that parsed record.

    Raises:
        ValidateException: when an XPath selects nothing (which includes a
            document without a ``record`` part) or when validation leaves
            entries in the schema's error log. The exception is handed to
            ``self.do.logException`` before it is raised.
    """
    # list(...) around values(): a dict view cannot be concatenated to a list.
    allArguments = list(args) + list(kwargs.values())
    for arg in allArguments:
        if not isinstance(arg, _ElementTree):  # Should be only one...
            continue
        record_lxml = None
        record_parsed = False
        for strName, strXPath, schema in self._xmlSchemas:
            if not record_parsed:
                # The record part only needs to be located and parsed once
                # per document, not once per schema.
                record_part = arg.xpath(
                    "//document:document/document:part[@name='record']/text()",
                    namespaces=self._namespacesMap)
                if record_part:
                    record_lxml = fromstring(record_part[0])
                record_parsed = True
            if record_lxml is None:
                # No record part at all: report it as a missing mandatory
                # part instead of failing with an IndexError.
                xml = []
            else:
                xml = record_lxml.xpath(strXPath, namespaces=self._namespacesMap)
            if len(xml) > 0:
                schema.validate(xml[0])
                if schema.error_log:
                    exception = ValidateException(
                        formatXSDException(strName + " is NOT valid.", None, schema))
                    self.do.logException(exception)
                    # Sends ValidateException back to the Harvester, stops
                    # processing this record.
                    raise exception
            else:
                exception = ValidateException(
                    formatExceptionLine("Mandatory " + strName + " NOT found."))
                self.do.logException(exception)
                # Sends ValidateException back to the Harvester, stops
                # processing this record.
                raise exception
def _getDescriptiveMetadata(self, lxmlNode):
    """Return the normalised descriptiveMetadata ``didl:Item`` markup as a string.

    The descriptiveMetadata item is located by its ``rdf:type/@rdf:resource``
    statement, falling back to an unprefixed ``@resource`` attribute and then
    to a ``dip:ObjectType`` statement; each fallback that matches is logged.
    The first located item that contains a ``mods:mods`` element is wrapped
    in a fresh item that always uses the rdf namespace form.

    Raises:
        ValidateException: EXCEPTION7 when no descriptiveMetadata item is
            found, EXCEPTION6 when none of the found items contains MODS.
    """
    # NOTE(review): the MODS search is read as a for/else (raise only after
    # all candidates were tried); nesting was reconstructed from a
    # whitespace-collapsed source.
    item_xpath = '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/%s="info:eu-repo/semantics/descriptiveMetadata"]'
    # (statement selector, message to log when this selector was needed)
    lookups = (
        ('rdf:type/@rdf:resource', None),
        ('rdf:type/@resource', LOGGER3),      # no rdf namespace on the attribute
        ('dip:ObjectType/text()', LOGGER4),   # dip namespace
    )
    candidates = []
    for selector, notice in lookups:
        candidates = lxmlNode.xpath(item_xpath % selector, namespaces=self._nsMap)
        if candidates:
            if notice is not None:
                self.do.logMsg(self._uploadid, notice, prefix=STR_DIDL)
            break

    if not candidates:
        raise ValidateException(
            formatExceptionLine(EXCEPTION7, prefix=STR_DIDL))

    # Look for the first descriptiveMetadata item containing MODS.
    for candidate in candidates:
        mods_nodes = candidate.xpath('self::didl:Item//mods:mods', namespaces=self._nsMap)
        if not mods_nodes:
            continue
        return """<didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <rdf:type rdf:resource="info:eu-repo/semantics/descriptiveMetadata"/> </didl:Statement> </didl:Descriptor> %s%s<didl:Component> <didl:Resource mimeType="application/xml"> %s </didl:Resource> </didl:Component> </didl:Item>""" % (
            self._getIdentifierDescriptor(candidate),
            self._getDateModifiedDescriptor(candidate),
            tostring(mods_nodes[0]))

    raise ValidateException(
        formatExceptionLine(EXCEPTION6, prefix=STR_DIDL))
def _validateNames(self, modsNode):
    """Clean up and validate the ``mods:name`` children of ``modsNode`` in place.

    For every name: roleTerm texts are stripped, empty nameParts are removed,
    and the name itself is removed when it has no marcrelator code roleTerm
    or no nameParts left.

    Raises:
        ValidateException: EXCEPTION4 when the first marcrelator role code of
            a kept name is rejected by ``__isValidRoleTerm``; EXCEPTION5 when
            the ``//mods:mods/mods:name`` query finds no name afterwards.
    """
    mods_ns = self._nsMap['mods']
    name_tag = '{%s}name' % mods_ns
    role_term_path = './/{%s}roleTerm' % mods_ns
    name_part_tag = '{%s}namePart' % mods_ns

    for name in modsNode.iterfind(name_tag):
        for role_term in name.iterfind(role_term_path):
            if role_term.text:
                role_term.text = role_term.text.strip()

        role = name.xpath(
            "self::mods:name/mods:role/mods:roleTerm[@type='code' and @authority='marcrelator']/text()",
            namespaces=self._nsMap)

        for name_part in name.iterfind(name_part_tag):
            # Remove empty nameParts.
            if not (name_part.text and name_part.text.strip()):
                name.remove(name_part)

        if not role or name.find(name_part_tag) is None:
            # No marcrelator code roleTerm (or an empty one), or no nameParts
            # left: drop this name element.
            modsNode.remove(name)
        elif not self.__isValidRoleTerm(role[0]):
            raise ValidateException(formatExceptionLine(
                EXCEPTION4 + role[0], prefix=STR_MODS))

    remaining = modsNode.xpath("//mods:mods/mods:name", namespaces=self._nsMap)
    if not remaining:
        raise ValidateException(formatExceptionLine(EXCEPTION5, prefix=STR_MODS))
def testValidationErrors(self):
    """A ValidateException registered for 'add' must produce a 'fail' response.

    Checks the operation status plus the diagnostic uri, details and message
    of the returned update response.
    """
    self.observer.exceptions['add'] = ValidateException('Some <Exception>')
    headers, result = self.performRequest(self.createRequestBody())
    self.assertTrue(
        """<ucp:operationStatus>fail</ucp:operationStatus>""" in result,
        result)
    diag = parse(StringIO(result))
    # assertEqual instead of the deprecated assertEquals alias.
    self.assertEqual(
        "info:srw/diagnostic/12/12",
        xpathFirst(
            diag,
            '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:uri/text()'
        ))
    self.assertEqual(
        "Some <Exception>",
        xpathFirst(
            diag,
            '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:details/text()'
        ))
    self.assertEqual(
        "Invalid data: record rejected",
        xpathFirst(
            diag,
            '/srw:updateResponse/srw:diagnostics/diag:diagnostic/diag:message/text()'
        ))
def _normaliseRecord(self, lxmlNode):
    """Replace the first ``mods:mods`` element of ``lxmlNode`` by its normalised form.

    Steps: locate the first MODS element, run it through the normalisation
    functions (their string results are concatenated), parse that string and
    put it in the place of the original element.

    Returns:
        The same ``lxmlNode``, now containing the normalised MODS.

    Raises:
        ValidateException: EXCEPTION1 when no MODS element is present.
    """
    mods_hits = lxmlNode.xpath('(//mods:mods)[1]', namespaces=self._nsMap)
    # Normalisation functions to apply, in order.
    normalisers = [self._convertFullMods2GHMods]

    if not mods_hits:
        # Should never happen at runtime: the record should have been
        # validated up front.
        raise ValidateException(formatExceptionLine(EXCEPTION1, prefix=STR_MODS))

    original_mods = mods_hits[0]
    normalised_xml = ''.join(normalise(original_mods) for normalise in normalisers)
    # Put the normalised MODS back in place of the original element.
    original_mods.getparent().replace(original_mods, etree.fromstring(normalised_xml))
    return lxmlNode
def testCollectLogWithErrors(self):
    """The log collector must record error type and message for failing requests.

    Covers three cases: a generic Exception registered for 'delete', a
    ValidateException registered for 'add' (additionally logged as invalid),
    and a request body that cannot be parsed as XML.
    """
    self.observer.exceptions['delete'] = Exception('Some <Exception>')
    requestBody = self.createRequestBody(action=DELETE,
                                         recordIdentifier='idDelete')
    headers, result = self.performRequest(requestBody)
    # assertEqual throughout: assertEquals is a deprecated alias and the last
    # part of this test already used assertEqual.
    self.assertEqual(
        dict(sruRecordUpdate=dict(delete=['idDelete'],
                                  errorType=['Exception'],
                                  errorMessage=["Some <Exception>"])),
        self.logCollector)

    self.observer.exceptions['add'] = ValidateException('Nee')
    requestBody = self.createRequestBody(action=CREATE,
                                         recordIdentifier='idAdd')
    headers, result = self.performRequest(requestBody)
    self.assertEqual(
        dict(sruRecordUpdate=dict(add=['idAdd'],
                                  invalid=['idAdd'],
                                  errorType=['ValidateException'],
                                  errorMessage=["Nee"])),
        self.logCollector)

    headers, result = self.performRequest(
        '<srw:updateRequest>Will raise XMLSyntaxError')
    sru_error = self.logCollector['sruRecordUpdate']
    self.assertEqual(['XMLSyntaxError'], sru_error['errorType'])
    self.assertTrue(sru_error['errorMessage'][0].startswith(
        'Namespace prefix srw on updateRequest is not defined, line 1, column 19'
    ))
def _tlOrigininfo(self, childNode): hasDateIssued = False ## Select all children from originInfo having 'encoding' attribute: children = childNode.xpath( "self::mods:originInfo/child::*[@encoding='w3cdtf' or @encoding='iso8601']", namespaces=self._nsMap) if len(children) > 0: for child in children: if self._validateISO8601(child.text): child.text = self._granulateDate(child.text) child.set('encoding', 'w3cdtf') if child.tag == ('{%s}dateIssued') % self._nsMap['mods']: hasDateIssued = True else: child.getparent().remove(child) if not hasDateIssued: raise ValidateException( formatExceptionLine(EXCEPTION7, prefix=STR_MODS)) for child in childNode.xpath("self::mods:originInfo/mods:publisher", namespaces=self._nsMap): if not child.text: child.getparent().remove(child) return childNode if len(childNode) > 0 else None
def _isValidTitleInfoTag(self, lxmlNode):
    """Check the titles of a titleInfo element and prune its blank subtitles.

    Returns:
        Always ``True`` when no exception is raised.

    Raises:
        ValidateException: EXCEPTION3 when a ``title`` child has no text or
            only whitespace.
    """
    mods_ns = self._nsMap['mods']

    for title in lxmlNode.iterfind('{%s}title' % mods_ns):
        if not (title.text and title.text.strip()):
            raise ValidateException(formatExceptionLine(EXCEPTION3, prefix=STR_MODS))

    for subtitle in lxmlNode.iterfind('{%s}subTitle' % mods_ns):
        if subtitle.text and subtitle.text.strip():
            continue
        # Blank subtitle: remove it from its parent.
        subtitle.getparent().remove(subtitle)

    return True
def _normalizeTitleinfo(self, modsNode):
    """Validate every ``titleInfo`` child of ``modsNode``.

    Each titleInfo is passed to ``self._isValidTitleInfoTag``; a titleInfo
    for which that check returns a false value is removed from ``modsNode``.

    Raises:
        ValidateException: EXCEPTION2 when ``modsNode`` has no titleInfo
            child at all.
    """
    title_info_tag = '{%s}titleInfo' % self._nsMap['mods']
    found_any = False
    for title_info in modsNode.iterfind(title_info_tag):
        found_any = True
        if self._isValidTitleInfoTag(title_info):
            continue
        modsNode.remove(title_info)
    if found_any:
        return
    raise ValidateException(formatExceptionLine(EXCEPTION2, prefix=STR_MODS))
def _detectAndValidate(self, *args, **kwargs):
    """Validate every lxml ``_ElementTree`` argument against all configured schemas.

    Every positional and keyword argument is inspected; for each
    ``_ElementTree`` found (normally exactly one) every
    ``(name, xpath, schema)`` entry of ``self._xmlSchemas`` is applied: the
    XPath selects the node to validate and the schema validates it.

    Raises:
        ValidateException: when an XPath selects nothing (mandatory part
            missing) or when validation leaves entries in the schema's error
            log. The exception is handed to ``self.do.logException`` before
            it is raised.
    """
    # list(...) around values(): a dict view cannot be concatenated to a list.
    allArguments = list(args) + list(kwargs.values())
    for arg in allArguments:
        if not isinstance(arg, _ElementTree):  # Should be only one...
            continue
        for strName, strXPath, schema in self._xmlSchemas:
            # Run the XPath on this XML argument.
            xml = arg.xpath(strXPath, namespaces=self._namespacesMap)
            if len(xml) > 0:
                schema.validate(xml[0])
                if schema.error_log:
                    exception = ValidateException(
                        formatXSDException(strName + " is NOT valid.", None, schema))
                    self.do.logException(exception)
                    raise exception
            else:
                exception = ValidateException(
                    formatExceptionLine("Mandatory " + strName + " NOT found."))
                self.do.logException(exception)
                raise exception
def _getHumanStartPage(self, lxmlNode):
    """Return the normalised humanStartPage ``didl:Item`` markup, or ``""``.

    The humanStartPage item is located by its ``rdf:type/@rdf:resource``
    statement, falling back to an unprefixed ``@resource`` attribute and then
    to a ``dip:ObjectType`` statement; each fallback that matches is logged.
    When no item is found this is logged and an empty string is returned.

    Raises:
        ValidateException: EXCEPTION11 when the item's Resource has no
            ``@ref`` or the reference is not accepted by ``comm.isURL``.
    """
    didl_hsp_item = lxmlNode.xpath(
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]',
        namespaces=self._nsMap)
    if len(didl_hsp_item) == 0:
        didl_hsp_item = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]',
            namespaces=self._nsMap)
        if len(didl_hsp_item) > 0:
            self.do.logMsg(self._uploadid, LOGGER9, prefix=STR_DIDL)
    if len(didl_hsp_item) == 0:
        didl_hsp_item = lxmlNode.xpath(
            '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]',
            namespaces=self._nsMap)
        if len(didl_hsp_item) > 0:
            self.do.logMsg(self._uploadid, LOGGER10, prefix=STR_DIDL)
    if len(didl_hsp_item) == 0:
        self.do.logMsg(self._uploadid, LOGGER11, prefix=STR_DIDL)
        return ""

    uriref = didl_hsp_item[0].xpath(
        'self::didl:Item/didl:Component/didl:Resource/@ref',
        namespaces=self._nsMap)
    mimetype = didl_hsp_item[0].xpath(
        'self::didl:Item/didl:Component/didl:Resource/@mimeType',
        namespaces=self._nsMap)

    if len(mimetype) == 0:
        self.do.logMsg(self._uploadid, LOGGER13, prefix=STR_DIDL)
        # A missing mimeType is only logged, so processing is meant to go on;
        # previously the return statement then failed with an IndexError.
        # NOTE(review): 'text/html' is assumed as the default for a human
        # start page -- confirm it matches what LOGGER13 announces.
        mimetype_value = "text/html"
    else:
        mimetype_value = mimetype[0]
        if not comm.isMimeType(mimetype_value):
            self.do.logMsg(self._uploadid,
                           LOGGER12 + mimetype_value,
                           prefix=STR_DIDL)

    # Validate the same (stripped) value that is written to the output.
    if len(uriref) == 0 or not comm.isURL(uriref[0].strip()):
        raise ValidateException(
            formatExceptionLine(EXCEPTION11, prefix=STR_DIDL))

    return """<didl:Item> <didl:Descriptor> <didl:Statement mimeType="application/xml"> <rdf:type rdf:resource="info:eu-repo/semantics/humanStartPage"/> </didl:Statement> </didl:Descriptor> <didl:Component> <didl:Resource ref="%s" mimeType="%s"/> </didl:Component> </didl:Item>""" % (
        escapeXml(comm.urlQuote(uriref[0].strip())),
        escapeXml(mimetype_value))
def _validateGenre(self, modsNode):
    """Keep only the first recognised ``genre`` of ``modsNode``, fully qualified.

    Each ``genre`` child is matched against the keys of ``GENRES_SEMANTIEK``
    (substring match on the stripped, lower-cased text). The first genre that
    matches gets the mapped value as its text; every other genre element is
    removed from ``modsNode``.

    Raises:
        ValidateException: EXCEPTION6 when no genre matched.
    """
    fqGenre = None
    genre_tag = '{' + self._nsMap.get('mods') + '}genre'
    ## Loop all 'genre' elements as separate nodes:
    for genre in modsNode.iterfind(genre_tag):
        # Only look further while no valid genre has been accepted yet; every
        # genre after the accepted one is removed regardless of its text.
        if fqGenre is None and genre.text:
            normalized = genre.text.strip().lower()
            # items() instead of the Python-2-only iteritems().
            for key, value in GENRES_SEMANTIEK.items():
                if key in normalized:  # found a (lowercased) genre
                    fqGenre = value
                    break
            if fqGenre is not None:
                genre.text = fqGenre
                continue
        modsNode.remove(genre)
    if fqGenre is None:
        raise ValidateException(formatExceptionLine(EXCEPTION6, prefix=STR_MODS))
def add(self, lxmlNode, **kwargs):
    """Collect index fields from a meresco document and pass them on.

    This is a generator: after filling ``self._fieldslist`` it yields the
    result of ``self.all.add(fieldslist=..., **kwargs)``.

    ``lxmlNode`` is expected to hold a ``document:document`` whose ``meta``
    and ``record`` parts contain XML as text; both are parsed here.

    Raises:
        ValidateException: when the meta part has no collection.
    """
    self._fieldslist = []  # reset the fieldslist
    self._nids_aut_enriched.clear()  # Empty the set.
    self._record_pids.clear()
    # A complete meresco:document arrives here as an lxml node:
    # self.uploadid = kwargs['identifier']
    # Get meta, header and metadata part(='long') from the normdoc:
    e_metapart = etree.fromstring(
        lxmlNode.xpath(
            '/document:document/document:part[@name="meta"]/text()',
            namespaces=namespacesmap)[0])
    wcp_collection = e_metapart.xpath(
        '/meta:meta/meta:repository/meta:collection/text()',
        namespaces=namespacesmap)
    if not wcp_collection:
        raise ValidateException(
            "Collection is missing from metapart! Please add collection in WCP."
        )
    self._wcp_collection = wcp_collection[0]
    e_recordpart = etree.fromstring(
        lxmlNode.xpath(
            '/document:document/document:part[@name="record"]/text()',
            namespaces=namespacesmap)[0])
    # Add known metapart fields for all records:
    for field, xpad in metaFieldNamesToXpath.iteritems():
        self._fieldslist.append(
            (field, e_metapart.xpath(xpad, namespaces=namespacesmap)[0]))
        if self._verbose:
            print 'addField:', field.upper(), "-->", e_metapart.xpath(
                xpad, namespaces=namespacesmap)[0]
    record = None
    # The record root depends on the collection: collections listed in
    # WCPNODCOLLECTION use persoon/activiteit/organisatie, all others use
    # the normalized knaw_long element.
    if self._wcp_collection in WCPNODCOLLECTION:
        record = e_recordpart.xpath(
            '//prs:persoon | //prj:activiteit | //org:organisatie',
            namespaces=namespacesmap)
    else:
        record = e_recordpart.xpath('//norm:normalized/long:knaw_long',
                                    namespaces=namespacesmap)
    self._fillFieldslist(record[0], '')  # Add fields by path in xml.
    self._addAuthorsAndNamesFields(record[0])
    for field, xpad in fieldNamesXpathMap.iteritems():  # Add fields by xPath
        self._findAndAddToFieldslist(record[0], field, xpad)
    if self._wcp_collection in WCPEDUCOLLECTION:
        # Look up name identifiers for the pids in self._record_pids.
        nidlist = self.all.lookupNameIds(pidlist=self._record_pids)
        for generator in nidlist:
            for nid in generator:
                # Each nid is split on ':' into the two arguments passed to
                # the factory.
                splitted = nid.split(":", 2)
                nameId = NameIdentifierFactory.factory(
                    splitted[0], splitted[1])
                if nameId.is_valid():
                    self._nids_aut_enriched.add(nameId.get_idx_id())
                    self._nids_aut_enriched.add(nameId.get_id())
        for nid in self._nids_aut_enriched:
            self._fieldslist.append(('nids_aut_enriched', nid))
            if self._verbose:
                print 'addField:', 'nids_aut_enriched'.upper(), "-->", nid
    # Ready filling fieldslist, now call add method:
    yield self.all.add(fieldslist=self._fieldslist, **kwargs)
def _checkOriginInfoDateIssued(self, modsNode):
    """Require at least one ``mods:originInfo/mods:dateIssued`` element.

    Raises:
        ValidateException: EXCEPTION7 when the
            ``//mods:mods/mods:originInfo/mods:dateIssued`` query finds nothing.
    """
    date_issued = modsNode.xpath(
        "//mods:mods/mods:originInfo/mods:dateIssued",
        namespaces=self._nsMap)
    if date_issued:
        return
    raise ValidateException(formatExceptionLine(EXCEPTION7, prefix=STR_MODS))
def __init__(self, lxmlNode, uploadId):
    """Detect the metadata format of ``lxmlNode`` and store it with its namespace.

    Detection order:

    * DIDL container present: MODS inside -> DIDLM30 when any ``rdf:*``
      element exists, otherwise DIDLM23; exactly one ``oai_dc:dc`` -> DIDLDC.
    * No DIDL: MODS -> DIDLM36, ``oai_dc:dc`` -> OAIDC, ``org:organisatie``
      -> ORG, ``proj:activiteit`` -> PROJ, ``prs:persoon`` -> PRS,
      ``datacite:resource`` -> DATACITE.

    Args:
        lxmlNode: the parsed metadata to inspect.
        uploadId: only used in the error message.

    Raises:
        ValidateException: when none of the known formats is detected.
    """
    nsmap = Namespaces.NAMESPACEMAP

    def count(path):
        # XPath count() yields a float; callers compare integers.
        return int(lxmlNode.xpath("count(%s)" % path, namespaces=nsmap))

    md_format = None
    # Check for DIDL container, max. 1 according to EduStandaard.
    if len(lxmlNode.xpath('//didl:DIDL[1]', namespaces=nsmap)) > 0:
        if count("//mods:mods") >= 1:
            # Found MODS: the presence of the rdf namespace differentiates
            # between the known DIDL/MODS versions.
            if count("//rdf:*") > 0:
                md_format = MetadataFormat.DIDLM30
            else:
                md_format = MetadataFormat.DIDLM23
        elif count("//oai_dc:dc") == 1:
            # OAI_DC container inside DIDL.
            md_format = MetadataFormat.DIDLDC
    else:
        # No DIDL: first matching stand-alone root wins.
        standalone_formats = (
            ("//mods:mods", MetadataFormat.DIDLM36),           # Full MODS (MODS only)
            ("//oai_dc:dc", MetadataFormat.OAIDC),             # plain DC
            ("//org:organisatie", MetadataFormat.ORG),         # NOD organization
            ("//proj:activiteit", MetadataFormat.PROJ),        # NOD project
            ("//prs:persoon", MetadataFormat.PRS),             # NOD person
            ("//datacite:resource", MetadataFormat.DATACITE),  # DataCite
        )
        for path, candidate in standalone_formats:
            if count(path) >= 1:
                md_format = candidate
                break

    if md_format is None:
        raise ValidateException(
            "No known EduStandaard format was found in the metadata for uploadid: %s! This record cannot be processed."
            % (uploadId))
    self._format = md_format
    # Look the namespace up for the detected format; the previous code passed
    # the builtin ``format`` function here instead of ``md_format``.
    self._namespace = Namespaces.getNamespace(md_format)
def _getObjectfiles(self, lxmlNode):
    """Return the normalised ``didl:Item`` markup for all objectFile items.

    objectFile items are located by their ``rdf:type/@rdf:resource``
    statement, falling back to an unprefixed ``@resource`` attribute and then
    to a ``dip:ObjectType`` statement; each fallback that matches is logged.
    For every item a new ``didl:Item`` is built containing, when available:
    identifier, access rights, modification date, description, publication
    version, and the resources that have both ``@mimeType`` and ``@ref``.

    Returns:
        The concatenated item markup; an empty string when the DIDL has no
        objectFile items.

    Raises:
        ValidateException: EXCEPTION8 when access rights are missing,
            EXCEPTION12 when their value matches no known key, EXCEPTION9
            when a resource reference is not a URL, EXCEPTION10 when an item
            ends up without any resource.
    """
    objectfiles = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]',
        namespaces=self._nsMap)
    if len(objectfiles) == 0:
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) > 0:
            self.do.logMsg(self._uploadid, LOGGER6, prefix=STR_DIDL)
    if len(objectfiles) == 0:
        objectfiles = lxmlNode.xpath(
            '//didl:DIDL/didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]',
            namespaces=self._nsMap)
        if len(objectfiles) > 0:
            self.do.logMsg(self._uploadid, LOGGER7, prefix=STR_DIDL)

    parts = []
    for objectfile in objectfiles:
        #1: Define correct ObjectFile descriptor:
        parts.append(
            '<didl:Item><didl:Descriptor><didl:Statement mimeType="application/xml"><rdf:type rdf:resource="info:eu-repo/semantics/objectFile"/></didl:Statement></didl:Descriptor>')

        #2: Identifier (formally mandatory, although often not implemented):
        pi = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
            namespaces=self._nsMap)
        if len(pi) > 0:
            parts.append(descr_templ % ('<dii:Identifier>' +
                                        escapeXml(pi[0].strip()) +
                                        '</dii:Identifier>'))

        #3: Access rights are mandatory and must match a known key:
        arights = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:accessRights/text()',
            namespaces=self._nsMap)
        if len(arights) == 0:
            raise ValidateException(
                formatExceptionLine(EXCEPTION8, prefix=STR_DIDL))
        given_rights = arights[0].strip().lower()
        # items() instead of the Python-2-only iteritems().
        for key, value in accessRights.items():
            if key in given_rights:
                parts.append(descr_templ % ('<dcterms:accessRights>' + value +
                                            '</dcterms:accessRights>'))
                break
        else:
            # No known access-rights key occurs in the given value.
            raise ValidateException(
                formatExceptionLine(arights[0] + EXCEPTION12,
                                    prefix=STR_DIDL))

        #4: Modification date (formally mandatory, although often not
        #   implemented); only emitted when it is valid ISO 8601:
        modified = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
            namespaces=self._nsMap)
        if len(modified) > 0 and comm.isISO8601(modified[0]):
            parts.append(descr_templ % ('<dcterms:modified>' +
                                        modified[0].strip() +
                                        '</dcterms:modified>'))

        #5: 'file' description:
        descr = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/dc:description/text()',
            namespaces=self._nsMap)
        if len(descr) > 0:
            parts.append(descr_templ % ('<dc:description>' +
                                        escapeXml(descr[0].strip()) +
                                        '</dc:description>'))

        #6: Embargo (dcterms:available) and dateSubmitted/issued descriptors
        #   are deliberately skipped: not in EduStandaard.

        #7: Published version (author/publisher). Both may be available;
        #   only the first rdf:type resource is considered.
        pubVersion = objectfile.xpath(
            'self::didl:Item/didl:Descriptor/didl:Statement/rdf:type/@rdf:resource',
            namespaces=self._nsMap)
        if len(pubVersion) > 0:
            given_version = pubVersion[0].strip().lower()
            for key, value in pubVersions.items():
                if key in given_version:
                    parts.append(descr_templ %
                                 ('<rdf:type rdf:resource="' + value + '"/>'))
                    break

        #8: MANDATORY resources and mimetypes:
        didl_resources = objectfile.xpath(
            'self::didl:Item/didl:Component/didl:Resource[@mimeType and @ref]',
            namespaces=self._nsMap)
        resources = []
        for resource in didl_resources:
            mimeType = resource.xpath('self::didl:Resource/@mimeType',
                                      namespaces=self._nsMap)
            uri = resource.xpath('self::didl:Resource/@ref',
                                 namespaces=self._nsMap)
            ## Both mimeType and URI are needed (mimeType is required by the
            ## DIDL schema, @ref is not).
            if len(mimeType) > 0 and len(uri) > 0:
                if not comm.isMimeType(mimeType[0]):
                    # An unknown mimetype is only logged, not rejected.
                    self.do.logMsg(self._uploadid,
                                   LOGGER8 + mimeType[0],
                                   prefix=STR_DIDL)
                if not comm.isURL(uri[0].strip()):
                    raise ValidateException(
                        formatExceptionLine(EXCEPTION9 + uri[0],
                                            prefix=STR_DIDL))
                resources.append(
                    """<didl:Resource mimeType="%s" ref="%s"/>""" %
                    (escapeXml(mimeType[0].strip()),
                     escapeXml(comm.urlQuote(uri[0].strip()))))
        if not resources:
            raise ValidateException(
                formatExceptionLine(EXCEPTION10, prefix=STR_DIDL))
        parts.append("""<didl:Component> %s </didl:Component>""" %
                     (''.join(resources)))
        parts.append('</didl:Item>')
    return ''.join(parts)
def _getTopItem(self, lxmlNode):
    """Return the opening markup of the normalised top-level ``didl:Item``.

    The result holds the persistent identifier, a modification date and the
    pid resource (mimetype + location). The ``didl:Item`` element is left
    open in the returned string.

    Raises:
        ValidateException:
            EXCEPTION1 / EXCEPTION0 - identifier missing / not a URN:NBN;
            EXCEPTION3 / EXCEPTION2 - top-level modified date missing / not
            ISO 8601;
            EXCEPTION4 / EXCEPTION5 - resource location missing / not a URL.
    """
    ## Wrappers:
    pid, modified, mimetype, pidlocation = '', '', "application/xml", ''

    #1: Get persistentIdentifier:
    pidlist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dii:Identifier/text()',
        namespaces=self._nsMap)
    if len(pidlist) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION1, prefix=STR_DIDL))
    pid = pidlist[0].strip()
    if not comm.isURNNBN(pid):
        raise ValidateException(
            formatExceptionLine(EXCEPTION0 + pid, prefix=STR_DIDL))

    #2: Get toplevel modificationDate; it must be present and valid:
    tl_modified = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)
    if len(tl_modified) == 0:
        raise ValidateException(
            formatExceptionLine(EXCEPTION3, prefix=STR_DIDL))
    if not comm.isISO8601(tl_modified[0]):
        raise ValidateException(
            formatExceptionLine(EXCEPTION2 + tl_modified[0],
                                prefix=STR_DIDL))

    ## Get all modified dates and take the most recent one for the top item:
    all_modified = lxmlNode.xpath(
        '//didl:Item/didl:Descriptor/didl:Statement/dcterms:modified/text()',
        namespaces=self._nsMap)
    if len(all_modified) > 0:
        # Maps a sortable "date time" key to the date text as it was given.
        datedict = {}
        for date in all_modified:
            if comm.isISO8601(date.strip()):
                pd = parseDate(date.strip())
                datedict["%s %s" % (str(pd.date()), str(pd.time()))] = date.strip()
        if datedict:
            # Greatest key == most recent date.
            modified = datedict[max(datedict)]
        if not tl_modified[0].strip() == modified:
            # The top-level date is not the most recent one: log only.
            self.do.logMsg(self._uploadid, LOGGER1, prefix=STR_DIDL)

    #3: Get PidResourceMimetype (an unknown mimetype is only logged):
    mimetypelist = lxmlNode.xpath(
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@mimeType',
        namespaces=self._nsMap)
    if len(mimetypelist) > 0:
        mimetype = mimetypelist[0].strip()
        if not comm.isMimeType(mimetype):
            self.do.logMsg(self._uploadid,
                           LOGGER2 + mimetype,
                           prefix=STR_DIDL)

    #4: Get PidResourceLocation.
    # A comma was missing after the '.../didl:Resource/text()' expression, so
    # it was concatenated with the next literal into a single XPath that can
    # never match; both fallbacks are separate arguments again.
    # NOTE(review): this re-enables the Resource/text() fallback -- confirm
    # how _findAndBindFirst treats whitespace-only matches.
    pidlocation = self._findAndBindFirst(
        lxmlNode,
        '%s',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/@ref',
        '//didl:DIDL/didl:Item/didl:Component/didl:Resource/text()',
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/humanStartPage"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 2.3.1
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@rdf:resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/rdf:type/@resource="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref',  #fallback DIDL 3.0, without @rdf:resource
        '//didl:Item/didl:Item[didl:Descriptor/didl:Statement/dip:ObjectType/text()="info:eu-repo/semantics/objectFile"]/didl:Component/didl:Resource/@ref'  #fallback DIDL 2.3.1
    ).strip()
    if pidlocation == '':
        raise ValidateException(
            formatExceptionLine(EXCEPTION4, prefix=STR_DIDL))
    if not comm.isURL(pidlocation):
        raise ValidateException(
            formatExceptionLine(EXCEPTION5 + pidlocation, prefix=STR_DIDL))

    # The ref attribute is XML-escaped like the other resource references in
    # this normaliser, so a URL containing '&' cannot break the markup.
    return """<didl:Item> <didl:Descriptor><didl:Statement mimeType="application/xml"><dii:Identifier>%s</dii:Identifier></didl:Statement></didl:Descriptor> <didl:Descriptor><didl:Statement mimeType="application/xml"><dcterms:modified>%s</dcterms:modified></didl:Statement></didl:Descriptor> <didl:Component><didl:Resource mimeType="%s" ref="%s"/></didl:Component>""" % (
        escapeXml(pid), modified, escapeXml(mimetype),
        escapeXml(comm.urlQuote(pidlocation)))