def parse_metadata(self):
    """Request the target URL as XML and record the schema locations it declares.

    The metadata source name is chosen from ``self.link_type``. The target
    URL is then content-negotiated with an XML accept type; when XML comes
    back, every ``xsi:schemaLocation`` attribute in the document is split on
    whitespace and the resulting tokens are stored in ``self.namespaces``.

    Returns:
        tuple: ``(source_name, dc_core_metadata)``. ``dc_core_metadata`` is
        always ``None`` here because no XML-to-metadata mapping is applied
        in this method yet (see the TODO below).
    """
    XSI = "http://www.w3.org/2001/XMLSchema-instance"
    source_names = self.getEnumSourceNames()
    if self.link_type == 'embedded':
        source_name = source_names.LINKED_DATA.value
    elif self.link_type == 'guessed':
        source_name = source_names.GUESSED_XML.value
    elif self.link_type == 'negotiated':
        source_name = source_names.XML_NEGOTIATED.value
    else:
        source_name = source_names.TYPED_LINK.value
    dc_core_metadata = None

    requestHelper = RequestHelper(self.target_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    neg_source, xml_response = requestHelper.content_negotiate('FsF-F2-01M')
    if requestHelper.getHTTPResponse() is not None:
        self.logger.info(
            'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.format(
                source_name))
        if neg_source != 'xml':
            self.logger.info(
                'FsF-F2-01M : Expected XML but content negotiation responded -: '
                + str(neg_source))
        else:
            tree = lxml.etree.XML(xml_response)
            schema_locations = tree.xpath("//*/@xsi:schemaLocation",
                                          namespaces={'xsi': XSI})
            if schema_locations:
                # Collect the tokens of ALL schemaLocation attributes in
                # document order (duplicates removed). Previously each
                # attribute overwrote the former one, so only an arbitrary
                # single attribute survived. str.split() also drops the
                # empty tokens produced by runs of whitespace.
                namespaces = []
                for schema_location in dict.fromkeys(schema_locations):
                    namespaces.extend(schema_location.split())
                self.namespaces = namespaces
            # TODO: implement some XSLT to handle the XML..
    return source_name, dc_core_metadata
def evaluate(self):
    """Assess identifier persistence (metric FsF-F1-02D) and fill ``self.result``.

    Steps performed:

    1. Pick the URL to check: ``self.fuji.pid_url`` when an identifier
       scheme is known, or the raw identifier when the scheme is ``'url'``.
    2. Request that URL, remember landing page details on ``self.fuji`` and
       collect signposting links from the ``Link`` response header.
    3. If no PID scheme is known yet, look for a ``cite-as`` signposting
       link and adopt its scheme when ``IdentifierHelper`` reports it as
       persistent.
    4. Score the metric: 0.5 / maturity 1 for a persistent scheme, the full
       score / maturity 3 when the metadata is also accessible.

    The method returns nothing; results are written to ``self.result``,
    ``self.output``, ``self.score`` and several ``self.fuji`` attributes.
    """
    self.result = Persistence(id=self.metric_number,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    check_url = None
    signposting_pid = None
    if self.fuji.id_scheme is not None:
        check_url = self.fuji.pid_url
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id

    if check_url:
        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        requestHelper = RequestHelper(check_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.html_xml)
        _, self.fuji.extruct_result = requestHelper.content_negotiate(
            'FsF-F1-02D', ignore_html=False)
        if 'html' not in str(requestHelper.content_type):
            self.logger.info(
                'FsF-F2-01M :Content type is ' +
                str(requestHelper.content_type) +
                ', therefore skipping embedded metadata (microdata, RDFa) tests'
            )
            self.fuji.extruct_result = {}
        if not isinstance(self.fuji.extruct_result, dict):
            self.fuji.extruct_result = {}
        r = requestHelper.getHTTPResponse()
        response_status = requestHelper.response_status
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            # In case the test has been repeated because a PID has been
            # found in the metadata.
            if self.fuji.repeat_pid_check:
                if self.fuji.landing_url != self.fuji.input_id:
                    self.logger.warning(
                        'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL'
                    )
                    self.logger.warning(
                        'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL'
                    )
            if self.fuji.landing_url not in [
                    'https://datacite.org/invalid.html'
            ]:
                if response_status == 200:
                    # Identify signposting links in the response header.
                    header_link_string = requestHelper.getHTTPResponse(
                    ).getheader('Link')
                    if header_link_string is not None:
                        self.logger.info(
                            'FsF-F1-02D : Found signposting links in response header of landingpage'
                        )
                        for preparsed_link in header_link_string.split(','):
                            found_type, type_match = None, None
                            found_rel, rel_match = None, None
                            found_formats, formats_match = None, None
                            parsed_link = preparsed_link.strip().split(';')
                            found_link = parsed_link[0].strip()
                            for link_prop in parsed_link[1:]:
                                # Parameters normally follow "; " so the
                                # leading blank has to be removed before
                                # the prefix checks can ever match.
                                link_prop = str(link_prop).strip()
                                if link_prop.startswith('rel="'):
                                    rel_match = re.search(
                                        r'rel="(.*?)"', link_prop)
                                elif link_prop.startswith('type="'):
                                    type_match = re.search(
                                        r'type="(.*?)"', link_prop)
                                elif link_prop.startswith('formats="'):
                                    formats_match = re.search(
                                        r'formats="(.*?)"', link_prop)
                            if type_match:
                                found_type = type_match[1]
                            if rel_match:
                                found_rel = rel_match[1]
                            if formats_match:
                                found_formats = formats_match[1]
                            if found_link:
                                # [1:-1] drops the enclosing "<" and ">".
                                self.fuji.signposting_header_links.append({
                                    'url': found_link[1:-1],
                                    'type': found_type,
                                    'rel': found_rel,
                                    'profile': found_formats
                                })
                    # Check if there is a cite-as signposting link.
                    if self.fuji.pid_scheme is None:
                        signposting_pid_link = self.fuji.get_signposting_links(
                            'cite-as')
                        if signposting_pid_link:
                            signposting_pid = signposting_pid_link[0].get(
                                'url')
                        if signposting_pid:
                            # An instance is required here; the bare class
                            # carries no information about this identifier.
                            signidhelper = IdentifierHelper(signposting_pid)
                            found_id = signidhelper.preferred_schema
                            if signidhelper.is_persistent:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id
                    up = urlparse(self.fuji.landing_url)
                    self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                        uri=up)
                    self.fuji.landing_html = requestHelper.getResponseContent()
                    self.fuji.landing_content_type = requestHelper.content_type
                    self.output.resolved_url = self.fuji.landing_url
                    # URL is active, although the identifier is not
                    # necessarily based on a PID scheme.
                    self.output.resolvable_status = True
                    self.logger.info(
                        'FsF-F1-02D : Object identifier active (status code = 200)'
                    )
                    self.fuji.isMetadataAccessible = True
                else:
                    # Any non-200 status (401/402/403 included) is handled
                    # the same way.
                    self.fuji.isMetadataAccessible = False
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
            else:
                self.logger.warning(
                    "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                    .format(code=self.fuji.landing_url))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                .format(check_url))
            if response_status in [401, 402, 403]:
                self.logger.warning(
                    "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                    .format(code=response_status))
    else:
        self.logger.warning(
            "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
            .format(self.fuji.id))

    if self.fuji.pid_scheme is not None:
        if signposting_pid is None:
            idhelper = IdentifierHelper(self.fuji.id)
            self.fuji.pid_url = idhelper.identifier_url
        else:
            # signposting_pid is the URL string taken from the cite-as
            # link; indexing it would only keep its first character.
            self.fuji.pid_url = signposting_pid
        self.output.pid_scheme = self.fuji.pid_scheme
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
        self.score.earned = 0.5
        self.maturity = 1
        if self.fuji.isMetadataAccessible:
            # Identifier is based on a persistence scheme AND resolvable.
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
            self.maturity = 3
            self.result.test_status = 'pass'
            self.score.earned = self.total_score
        self.logger.log(
            self.fuji.LOG_SUCCESS,
            'FsF-F1-02D : Persistence identifier scheme -: {}'.format(
                self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(
                self.fuji.id_scheme))

    self.result.score = self.score
    self.result.maturity = self.maturity
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def evaluate(self):
    """Assess identifier persistence (metric FsF-F1-02D) and fill ``self.result``.

    Steps performed:

    1. Build the URL to check from the PID (via ``idutils.to_url``) or use
       the identifier itself when its scheme is ``'url'``.
    2. Request that URL, remember landing page details on ``self.fuji`` and
       collect signposting links from the ``Link`` response header.
    3. If no PID scheme is known yet, look for a ``cite-as`` signposting
       link and adopt its scheme when it is listed in
       ``Mapper.VALID_PIDS.value``.
    4. Score the metric; the full score is earned only when a persistent
       scheme is present and the metadata is accessible.

    The method returns nothing; results are written to ``self.result``,
    ``self.output``, ``self.score`` and several ``self.fuji`` attributes.
    """
    self.result = Persistence(id=self.fuji.count,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    # Initialised up front: without a PID scheme or a URL scheme the
    # variable used to stay unbound and the request below crashed.
    check_url = None
    if self.fuji.pid_scheme is not None:
        check_url = idutils.to_url(self.fuji.id, scheme=self.fuji.pid_scheme)
    elif self.fuji.id_scheme == 'url':
        check_url = self.fuji.id

    signposting_pid = None
    if check_url:
        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        requestHelper = RequestHelper(check_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.html)
        _, self.fuji.extruct_result = requestHelper.content_negotiate(
            'FsF-F1-02D')
        r = requestHelper.getHTTPResponse()
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            if r.status == 200:
                # Identify signposting links in the response header.
                header_link_string = requestHelper.getHTTPResponse(
                ).getheader('Link')
                if header_link_string is not None:
                    self.logger.info(
                        'FsF-F1-02D : Found signposting links in response header of landingpage'
                    )
                    for preparsed_link in header_link_string.split(','):
                        found_type, type_match = None, None
                        found_rel, rel_match = None, None
                        parsed_link = preparsed_link.strip().split(';')
                        found_link = parsed_link[0].strip()
                        for link_prop in parsed_link[1:]:
                            # Parameters normally follow "; " so the
                            # leading blank has to be removed before the
                            # prefix checks can ever match.
                            link_prop = str(link_prop).strip()
                            if link_prop.startswith('rel="'):
                                rel_match = re.search(r'rel="(.*?)"',
                                                      link_prop)
                            elif link_prop.startswith('type="'):
                                type_match = re.search(r'type="(.*?)"',
                                                       link_prop)
                        if type_match:
                            found_type = type_match[1]
                        if rel_match:
                            found_rel = rel_match[1]
                        if found_link:
                            # [1:-1] drops the enclosing "<" and ">".
                            self.fuji.signposting_header_links.append({
                                'url': found_link[1:-1],
                                'type': found_type,
                                'rel': found_rel
                            })
                # Check if there is a cite-as signposting link.
                if self.fuji.pid_scheme is None:
                    signposting_pid_link = self.fuji.get_signposting_links(
                        'cite-as')
                    if signposting_pid_link:
                        signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        # Pass the whole URL; indexing the string would
                        # hand over its first character only.
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid)
                        # Prefer a specific scheme over the generic 'url'.
                        if len(found_ids) > 1 and 'url' in found_ids:
                            found_ids.remove('url')
                        found_id = found_ids[0] if found_ids else None
                        if (found_id is not None
                                and found_id in Mapper.VALID_PIDS.value):
                            self.logger.info(
                                'FsF-F1-02D : Found object identifier in signposting header links'
                            )
                            self.fuji.pid_scheme = found_id
                up = urlparse(self.fuji.landing_url)
                self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                    uri=up)
                self.fuji.landing_html = requestHelper.getResponseContent()
                self.output.resolved_url = self.fuji.landing_url
                # URL is active, although the identifier is not necessarily
                # based on a PID scheme.
                self.output.resolvable_status = True
                self.logger.info(
                    'FsF-F1-02D : Object identifier active (status code = 200)'
                )
                self.fuji.isMetadataAccessible = True
            else:
                # Any non-200 status (401/402/403 included) is handled the
                # same way. The response exposes ``status`` (used above),
                # not ``status_code``.
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=r.status))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from: {}"
                .format(check_url))
    else:
        # Nothing to request: neither a PID nor a URL identifier.
        self.fuji.isMetadataAccessible = False
        self.logger.warning(
            'FsF-F1-02D : Resource inaccessible, could not identify an actionable representation for the given identifier -: {}'
            .format(self.fuji.id))

    if self.fuji.pid_scheme is not None:
        if signposting_pid is None:
            self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                               scheme=self.fuji.pid_scheme)
        else:
            # signposting_pid is already the complete URL string.
            self.fuji.pid_url = signposting_pid
        self.output.pid_scheme = self.fuji.pid_scheme
        self.result.test_status = 'pass'
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
        if self.fuji.isMetadataAccessible:
            # Identifier is based on a persistence scheme AND resolvable.
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
            self.score.earned = self.total_score
        self.logger.log(
            self.fuji.LOG_SUCCESS,
            'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                self.fuji.id_scheme))

    self.result.score = self.score
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def parse_metadata(self):
    """Request the target URL as XML and map it to metadata.

    The metadata source name is chosen from ``self.link_type``. The target
    URL is content-negotiated with an XML accept type. Known envelopes
    (OAI-PMH, METS, OGC CSW ``GetRecordsResponse`` /
    ``GetRecordByIdResponse``) are unpacked to the contained metadata
    element. The mapping to apply is then selected from that element's
    root tag (DDI codeBook, Dublin Core, MODS, EML, ISO 19115) or, failing
    that, from its namespace (DataCite). For METS, values mapped from the
    envelope itself fill in keys missing from the mapped result.

    Side effects: ``self.namespaces`` is replaced by the tokens of all
    ``xsi:schemaLocation`` attributes (when any exist) and the root
    namespace of the metadata element is appended to it.

    Returns:
        tuple: ``(source_name, xml_metadata)``; ``xml_metadata`` is
        ``None`` when no mapping could be selected or no XML was received.
    """
    xml_metadata = None
    xml_mapping = None
    metatree = None
    envelope_metadata = {}
    XSI = "http://www.w3.org/2001/XMLSchema-instance"

    source_names = self.getEnumSourceNames()
    if self.link_type == 'embedded':
        source_name = source_names.LINKED_DATA.value
    elif self.link_type == 'guessed':
        source_name = source_names.GUESSED_XML.value
    elif self.link_type == 'negotiated':
        source_name = source_names.XML_NEGOTIATED.value
    else:
        # Covers 'linked' as well as any unknown link type.
        source_name = source_names.TYPED_LINK.value

    requestHelper = RequestHelper(self.target_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    neg_source, xml_response = requestHelper.content_negotiate('FsF-F2-01M')
    if requestHelper.getHTTPResponse() is not None:
        self.logger.info(
            'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.format(
                source_name))
        if neg_source != 'xml':
            self.logger.info(
                'FsF-F2-01M : Expected XML but content negotiation responded -: '
                + str(neg_source))
        else:
            parser = lxml.etree.XMLParser(strip_cdata=False)
            tree = lxml.etree.XML(xml_response, parser)
            root_element = tree.tag
            # Unpack known envelope formats to the contained metadata.
            if root_element.endswith('}OAI-PMH'):
                self.logger.info(
                    "FsF-F2-01M : Found OAI-PMH type XML envelope, unpacking 'metadata' element for further processing"
                )
                metatree = tree.find('.//{*}metadata/*')
            elif root_element.endswith('}mets'):
                self.logger.info(
                    "FsF-F2-01M : Found METS type XML envelope, unpacking all 'mods' elements for further processing"
                )
                envelope_metadata = self.get_mapped_xml_metadata(
                    tree, Mapper.XML_MAPPING_METS.value)
                metatree = tree.find('.//{*}dmdSec/{*}mdWrap/{*}xmlData/*')
            elif root_element.endswith('}GetRecordsResponse'):
                self.logger.info(
                    "FsF-F2-01M : Found OGC CSW GetRecords type XML envelope, unpacking 'SearchResults' element for further processing"
                )
                metatree = tree.find('.//{*}SearchResults/*')
            elif root_element.endswith('}GetRecordByIdResponse'):
                self.logger.info(
                    'FsF-F2-01M : Found OGC CSW GetRecordByIdResponse type XML envelope, unpacking metadata element for further processing'
                )
                metatree = tree.find('.//*')
            else:
                metatree = tree

            if metatree is not None:
                root_namespace = None
                schema_locations = metatree.xpath(
                    "//*/@xsi:schemaLocation", namespaces={'xsi': XSI})
                if schema_locations:
                    # Collect the tokens of ALL schemaLocation attributes
                    # in document order (duplicates removed) instead of
                    # keeping an arbitrary single attribute. str.split()
                    # also drops empty tokens from runs of whitespace.
                    namespaces = []
                    for schema_location in dict.fromkeys(schema_locations):
                        namespaces.extend(schema_location.split())
                    self.namespaces = namespaces

                # Split a Clark-notation tag "{namespace}local" in two.
                nsmatch = re.match(r'^\{(.+)\}(.+)$', metatree.tag)
                if nsmatch:
                    root_namespace = nsmatch[1]
                    root_element = nsmatch[2]
                    self.namespaces.append(root_namespace)
                else:
                    # Un-namespaced element: use its own tag so that a
                    # stale envelope tag is never compared below.
                    root_element = metatree.tag

                if root_element == 'codeBook':
                    xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value
                    self.logger.info(
                        'FsF-F2-01M : Identified DDI codeBook XML based on root tag'
                    )
                elif root_element == 'dc':
                    xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value
                    self.logger.info(
                        'FsF-F2-01M : Identified Dublin Core XML based on root tag'
                    )
                elif root_element == 'mods':
                    xml_mapping = Mapper.XML_MAPPING_MODS.value
                    self.logger.info(
                        'FsF-F2-01M : Identified MODS XML based on root tag'
                    )
                elif root_element == 'eml':
                    xml_mapping = Mapper.XML_MAPPING_EML.value
                    self.logger.info(
                        'FsF-F2-01M : Identified EML XML based on root tag'
                    )
                elif root_element == 'MD_Metadata':
                    xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value
                    self.logger.info(
                        'FsF-F2-01M : Identified ISO 19115 XML based on root tag'
                    )
                elif root_namespace:
                    if 'datacite.org/schema' in root_namespace:
                        xml_mapping = Mapper.XML_MAPPING_DATACITE.value
                        self.logger.info(
                            'FsF-F2-01M : Identified DataCite XML based on namespace'
                        )

    if xml_mapping and metatree is not None:
        xml_metadata = self.get_mapped_xml_metadata(metatree, xml_mapping)

    # Envelope values only fill gaps in an existing mapping result. The
    # None check avoids a TypeError when no inner mapping was selected.
    if envelope_metadata and xml_metadata is not None:
        for envelope_key, envelope_values in envelope_metadata.items():
            xml_metadata.setdefault(envelope_key, envelope_values)

    return source_name, xml_metadata