Exemplo n.º 1
0
    def parse_metadata(self):
        """Request the target URL as XML and record its declared schema locations.

        The source name is chosen from ``self.link_type``. The target URL is
        then requested with an XML accept type; if the negotiated content is
        XML, every ``xsi:schemaLocation`` attribute in the document is split
        on whitespace and the resulting tokens are stored in
        ``self.namespaces``.

        Returns:
            tuple: ``(source_name, dc_core_metadata)``. ``dc_core_metadata`` is
            always ``None`` here because the mapping of the XML content is not
            implemented yet (see the TODO below).
        """
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        if self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        else:
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        dc_core_metadata = None
        requestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.getHTTPResponse() is None:
            return source_name, dc_core_metadata

        self.logger.info(
            'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.
            format(source_name))
        if neg_source != 'xml':
            self.logger.info(
                'FsF-F2-01M : Expected XML but content negotiation responded -: '
                + str(neg_source))
            return source_name, dc_core_metadata

        try:
            tree = lxml.etree.XML(xml_response)
        except (lxml.etree.XMLSyntaxError, ValueError) as parse_error:
            # Malformed XML (or a str carrying an encoding declaration) must
            # not abort the whole assessment; report it and return no metadata.
            self.logger.warning(
                'FsF-F2-01M : Failed to parse XML response -: ' +
                str(parse_error))
            return source_name, dc_core_metadata

        schema_locations = set(
            tree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI}))
        if schema_locations:
            # Collect the tokens of ALL schemaLocation attributes; assigning
            # inside the loop would keep only one arbitrary attribute.
            # sorted() makes the resulting order deterministic.
            namespaces = []
            for schema_location in sorted(schema_locations):
                # str.split() splits on whitespace runs and drops the empty
                # tokens that a single-character split would produce.
                namespaces.extend(schema_location.split())
            self.namespaces = namespaces
        #TODO: implement some XSLT to handle the XML..

        return source_name, dc_core_metadata
    def evaluate(self):
        """Assess metric FsF-F1-02D (identifier persistence and resolvability).

        Steps performed:

        1. Choose the URL to check from ``self.fuji.pid_url`` or, for plain
           URL identifiers, from ``self.fuji.id``.
        2. Request that URL, record the landing page URL, origin, content and
           content type on ``self.fuji``, and collect signposting links from
           the ``Link`` response header into
           ``self.fuji.signposting_header_links``.
        3. If no PID scheme is known yet, try to derive one from a ``cite-as``
           signposting link.
        4. Score the metric: 0.5 for a persistent scheme, the full score when
           the metadata is accessible as well.

        The outcome is stored in ``self.result`` (score, maturity, metric
        tests and output); nothing is returned.
        """
        self.result = Persistence(id=self.metric_number,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))
        check_url = None
        signposting_pid = None
        if self.fuji.id_scheme is not None:
            check_url = self.fuji.pid_url
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id
        if check_url:
            # ======= RETRIEVE METADATA FROM LANDING PAGE =======
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html_xml)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D', ignore_html=False)
            if 'html' not in str(requestHelper.content_type):
                self.logger.info(
                    'FsF-F2-01M :Content type is ' +
                    str(requestHelper.content_type) +
                    ', therefore skipping embedded metadata (microdata, RDFa) tests'
                )
                self.fuji.extruct_result = {}
            if not isinstance(self.fuji.extruct_result, dict):
                self.fuji.extruct_result = {}
            r = requestHelper.getHTTPResponse()
            response_status = requestHelper.response_status

            if r:
                self.fuji.landing_url = requestHelper.redirect_url
                # In case the test has been repeated because a PID has been
                # found in metadata.
                if (self.fuji.repeat_pid_check
                        and self.fuji.landing_url != self.fuji.input_id):
                    self.logger.warning(
                        'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL'
                    )
                    self.logger.warning(
                        'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL'
                    )
                if self.fuji.landing_url not in [
                        'https://datacite.org/invalid.html'
                ]:
                    if response_status == 200:
                        # identify signposting links in header
                        header_link_string = r.getheader('Link')
                        if header_link_string is not None:
                            self.logger.info(
                                'FsF-F1-02D : Found signposting links in response header of landingpage'
                            )
                            # NOTE(review): splitting on ',' also splits URLs
                            # that contain a comma; kept as in the original.
                            for preparsed_link in header_link_string.split(
                                    ','):
                                type_match = None
                                rel_match = None
                                formats_match = None
                                parsed_link = preparsed_link.strip().split(';')
                                found_link = parsed_link[0].strip()
                                for link_prop in parsed_link[1:]:
                                    # Parameters normally follow '; ' so the
                                    # leading blank must be removed before
                                    # the prefix tests below can match.
                                    link_prop = str(link_prop).strip()
                                    if link_prop.startswith('rel="'):
                                        rel_match = re.search(
                                            r'rel="(.*?)"', link_prop)
                                    elif link_prop.startswith('type="'):
                                        type_match = re.search(
                                            r'type="(.*?)"', link_prop)
                                    elif link_prop.startswith('formats="'):
                                        formats_match = re.search(
                                            r'formats="(.*?)"', link_prop)
                                if found_link:
                                    self.fuji.signposting_header_links.append({
                                        # drop the enclosing '<' and '>'
                                        'url': found_link[1:-1],
                                        'type': type_match[1] if type_match else None,
                                        'rel': rel_match[1] if rel_match else None,
                                        'profile': formats_match[1] if formats_match else None,
                                    })

                        #check if there is a cite-as signposting link
                        if self.fuji.pid_scheme is None:
                            signposting_pid_link = self.fuji.get_signposting_links(
                                'cite-as')
                            if signposting_pid_link:
                                signposting_pid = signposting_pid_link[0].get(
                                    'url')
                            if signposting_pid:
                                # Build a helper instance for the cite-as URL;
                                # reading the attributes off the class itself
                                # would not inspect the identifier at all.
                                signidhelper = IdentifierHelper(
                                    signposting_pid)
                                found_id = signidhelper.preferred_schema
                                if signidhelper.is_persistent:
                                    self.logger.info(
                                        'FsF-F1-02D : Found object identifier in signposting header links'
                                    )
                                    self.fuji.pid_scheme = found_id

                        up = urlparse(self.fuji.landing_url)
                        self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                            uri=up)
                        self.fuji.landing_html = requestHelper.getResponseContent(
                        )
                        self.fuji.landing_content_type = requestHelper.content_type

                        self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                        self.output.resolvable_status = True
                        self.logger.info(
                            'FsF-F1-02D : Object identifier active (status code = 200)'
                        )
                        self.fuji.isMetadataAccessible = True
                    else:
                        # Any non-200 status (401/402/403 included) is
                        # reported the same way.
                        self.fuji.isMetadataAccessible = False
                        self.logger.warning(
                            "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                            .format(code=response_status))
                else:
                    self.logger.warning(
                        "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                        .format(code=self.fuji.landing_url))

            else:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                    .format(check_url))
                if response_status in [401, 402, 403]:
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
        else:
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
                .format(self.fuji.id))

        if self.fuji.pid_scheme is not None:
            if signposting_pid is None:
                idhelper = IdentifierHelper(self.fuji.id)
                self.fuji.pid_url = idhelper.identifier_url
            elif isinstance(signposting_pid, str):
                # The cite-as URL is a plain string; indexing it with [0]
                # would store only its first character.
                self.fuji.pid_url = signposting_pid
            else:
                self.fuji.pid_url = signposting_pid[0]
            self.output.pid_scheme = self.fuji.pid_scheme

            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
            self.score.earned = 0.5
            self.maturity = 1
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
                self.maturity = 3
                self.result.test_status = 'pass'
                self.score.earned = self.total_score  # identifier should be based on a persistence scheme and resolvable

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme -: {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.maturity = self.maturity
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
Exemplo n.º 3
0
    def evaluate(self):
        """Assess metric FsF-F1-02D (identifier persistence and resolvability).

        The URL to check is derived from the PID (via ``idutils.to_url``) or,
        for plain URL identifiers, taken from ``self.fuji.id``. The URL is
        requested as HTML; on a 200 response the landing page URL, origin and
        content are recorded on ``self.fuji`` and signposting links from the
        ``Link`` response header are appended to
        ``self.fuji.signposting_header_links``. If no PID scheme is known, a
        ``cite-as`` signposting link is inspected for one.

        The outcome is stored in ``self.result`` (score, metric tests and
        output); nothing is returned.
        """
        self.result = Persistence(id=self.fuji.count,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))

        # Start from None so an identifier that is neither a PID nor a URL
        # does not leave check_url unbound.
        check_url = None
        if self.fuji.pid_scheme is not None:
            check_url = idutils.to_url(self.fuji.id,
                                       scheme=self.fuji.pid_scheme)
        elif self.fuji.id_scheme == 'url':
            check_url = self.fuji.id

        signposting_pid = None
        r = None
        requestHelper = None
        if check_url:
            # ======= RETRIEVE METADATA FROM LANDING PAGE =======
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D')
            r = requestHelper.getHTTPResponse()

        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            # Use the same status attribute in every branch; the 200 check
            # already relies on ``r.status``.
            status = r.status
            if status == 200:
                # identify signposting links in header
                header_link_string = r.getheader('Link')
                if header_link_string is not None:
                    self.logger.info(
                        'FsF-F1-02D : Found signposting links in response header of landingpage'
                    )
                    # NOTE(review): splitting on ',' also splits URLs that
                    # contain a comma; kept as in the original.
                    for preparsed_link in header_link_string.split(','):
                        type_match = None
                        rel_match = None
                        parsed_link = preparsed_link.strip().split(';')
                        found_link = parsed_link[0].strip()
                        for link_prop in parsed_link[1:]:
                            # Parameters normally follow '; ' so the leading
                            # blank must go before the prefix tests can match.
                            link_prop = str(link_prop).strip()
                            if link_prop.startswith('rel="'):
                                rel_match = re.search(r'rel="(.*?)"',
                                                      link_prop)
                            elif link_prop.startswith('type="'):
                                type_match = re.search(r'type="(.*?)"',
                                                       link_prop)
                        if found_link:
                            self.fuji.signposting_header_links.append({
                                # drop the enclosing '<' and '>'
                                'url': found_link[1:-1],
                                'type': type_match[1] if type_match else None,
                                'rel': rel_match[1] if rel_match else None,
                            })

                #check if there is a cite-as signposting link
                if self.fuji.pid_scheme is None:
                    signposting_pid_link = self.fuji.get_signposting_links(
                        'cite-as')
                    if signposting_pid_link:
                        signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        # Pass the whole URL; indexing the string with [0]
                        # would hand over only its first character.
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid)
                        # Ignore the generic 'url' scheme without assuming it
                        # is present (list.remove would raise otherwise).
                        pid_candidates = [
                            scheme for scheme in found_ids if scheme != 'url'
                        ]
                        if pid_candidates:
                            found_id = pid_candidates[0]
                            if found_id in Mapper.VALID_PIDS.value:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id

                up = urlparse(self.fuji.landing_url)
                self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                    uri=up)
                self.fuji.landing_html = requestHelper.getResponseContent()

                self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                self.output.resolvable_status = True
                self.logger.info(
                    'FsF-F1-02D : Object identifier active (status code = 200)'
                )
                self.fuji.isMetadataAccessible = True
            else:
                # Any non-200 status (401/402/403 included) is reported the
                # same way.
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=status))
        elif check_url:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from: {}"
                .format(check_url))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D : Resource inaccessible, could not identify an actionable representation for the given identifier: {}"
                .format(self.fuji.id))

        if self.fuji.pid_scheme is not None:
            if signposting_pid is None:
                self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                                   scheme=self.fuji.pid_scheme)
            elif isinstance(signposting_pid, str):
                # The cite-as URL is a plain string; [0] would store only its
                # first character.
                self.fuji.pid_url = signposting_pid
            else:
                self.fuji.pid_url = signposting_pid[0]
            self.output.pid_scheme = self.fuji.pid_scheme
            self.result.test_status = 'pass'
            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
                self.score.earned = self.total_score  # identifier should be based on a persistence scheme and resolvable

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
Exemplo n.º 4
0
    def parse_metadata(self):
        """Request the target URL as XML and map its content to metadata.

        The source name is chosen from ``self.link_type``. If content
        negotiation yields XML, known envelopes (OAI-PMH, METS, OGC CSW
        GetRecords / GetRecordById) are unpacked to the contained metadata
        element. The tokens of all ``xsi:schemaLocation`` attributes and the
        root namespace are stored in ``self.namespaces``. A mapping is then
        selected from the root tag (DDI codeBook, Dublin Core, MODS, EML,
        ISO 19115) or from the root namespace (DataCite) and applied with
        ``self.get_mapped_xml_metadata``. Metadata mapped from a METS envelope
        fills in keys that the inner document did not provide.

        Returns:
            tuple: ``(source_name, xml_metadata)``. ``xml_metadata`` is
            ``None`` when nothing could be mapped.
        """
        xml_metadata = None
        xml_mapping = None
        metatree = None
        envelope_metadata = {}
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        # 'linked' (and any other unlisted link type) maps to TYPED_LINK via
        # the final else branch.
        if self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        else:
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        requestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.getHTTPResponse() is not None:
            self.logger.info(
                'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.
                format(source_name))
            if neg_source != 'xml':
                self.logger.info(
                    'FsF-F2-01M : Expected XML but content negotiation responded -: '
                    + str(neg_source))
            else:
                tree = None
                try:
                    parser = lxml.etree.XMLParser(strip_cdata=False)
                    tree = lxml.etree.XML(xml_response, parser)
                except (lxml.etree.XMLSyntaxError, ValueError) as parse_error:
                    # Malformed XML (or a str carrying an encoding
                    # declaration) must not abort the whole assessment.
                    self.logger.warning(
                        'FsF-F2-01M : Failed to parse XML response -: ' +
                        str(parse_error))
                if tree is not None:
                    root_element = tree.tag
                    if root_element.endswith('}OAI-PMH'):
                        self.logger.info(
                            'FsF-F2-01M : Found OAI-PMH type XML envelope, unpacking \'metadata\' element for further processing'
                        )
                        metatree = tree.find('.//{*}metadata/*')
                    elif root_element.endswith('}mets'):
                        self.logger.info(
                            'FsF-F2-01M : Found METS type XML envelope, unpacking all \'mods\' elements for further processing'
                        )
                        envelope_metadata = self.get_mapped_xml_metadata(
                            tree, Mapper.XML_MAPPING_METS.value)
                        metatree = tree.find(
                            './/{*}dmdSec/{*}mdWrap/{*}xmlData/*')
                    elif root_element.endswith('}GetRecordsResponse'):
                        self.logger.info(
                            'FsF-F2-01M : Found OGC CSW GetRecords type XML envelope, unpacking \'SearchResults\' element for further processing'
                        )
                        metatree = tree.find('.//{*}SearchResults/*')
                    elif root_element.endswith('}GetRecordByIdResponse'):
                        self.logger.info(
                            'FsF-F2-01M : Found OGC CSW GetRecordByIdResponse type XML envelope, unpacking metadata element for further processing'
                        )
                        metatree = tree.find('.//*')
                    else:
                        metatree = tree
                if metatree is not None:
                    root_namespace = None
                    # Default to the tag of the unpacked element so that a
                    # namespace-less payload is not compared using the
                    # envelope's tag.
                    root_element = metatree.tag
                    nsmatch = re.match(r'^\{(.+)\}(.+)$', metatree.tag)
                    schema_locations = set(
                        metatree.xpath("//*/@xsi:schemaLocation",
                                       namespaces={'xsi': XSI}))
                    if schema_locations:
                        # Collect the tokens of ALL schemaLocation attributes;
                        # assigning inside the loop would keep only one
                        # arbitrary attribute. sorted() keeps the order
                        # deterministic, str.split() drops empty tokens.
                        namespaces = []
                        for schema_location in sorted(schema_locations):
                            namespaces.extend(schema_location.split())
                        self.namespaces = namespaces
                    if nsmatch:
                        root_namespace = nsmatch[1]
                        root_element = nsmatch[2]
                        self.namespaces.append(root_namespace)
                    if root_element == 'codeBook':
                        xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value
                        self.logger.info(
                            'FsF-F2-01M : Identified DDI codeBook XML based on root tag'
                        )
                    elif root_element == 'dc':
                        xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value
                        self.logger.info(
                            'FsF-F2-01M : Identified Dublin Core XML based on root tag'
                        )
                    elif root_element == 'mods':
                        xml_mapping = Mapper.XML_MAPPING_MODS.value
                        self.logger.info(
                            'FsF-F2-01M : Identified MODS XML based on root tag'
                        )
                    elif root_element == 'eml':
                        xml_mapping = Mapper.XML_MAPPING_EML.value
                        self.logger.info(
                            'FsF-F2-01M : Identified EML XML based on root tag'
                        )
                    elif root_element == 'MD_Metadata':
                        xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value
                        self.logger.info(
                            'FsF-F2-01M : Identified ISO 19115 XML based on root tag'
                        )
                    elif root_namespace:
                        if 'datacite.org/schema' in root_namespace:
                            xml_mapping = Mapper.XML_MAPPING_DATACITE.value
                            self.logger.info(
                                'FsF-F2-01M : Identified DataCite XML based on namespace'
                            )

        if xml_mapping and metatree is not None:
            xml_metadata = self.get_mapped_xml_metadata(metatree, xml_mapping)

        if envelope_metadata:
            # The inner document may not have been mapped at all; without a
            # dict to merge into, the membership test below would fail.
            if xml_metadata is None:
                xml_metadata = {}
            for envelope_key, envelope_values in envelope_metadata.items():
                if envelope_key not in xml_metadata:
                    xml_metadata[envelope_key] = envelope_values
        return source_name, xml_metadata