Example #1
@classmethod
def load_predata(cls):
    cls.FILES_LIMIT = Preprocessor.data_files_limit
    if not cls.METRICS:
        cls.METRICS = Preprocessor.get_custom_metrics(['metric_name', 'total_score'])
    if not cls.SPDX_LICENSES:
        # cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES, cls.SPDX_LICENSE_URLS = Preprocessor.get_licenses()
        cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES = Preprocessor.get_licenses()
    if not cls.COMMUNITY_METADATA_STANDARDS_URIS:
        cls.COMMUNITY_METADATA_STANDARDS_URIS = Preprocessor.get_metadata_standards_uris()
        cls.COMMUNITY_METADATA_STANDARDS_URIS_LIST = list(cls.COMMUNITY_METADATA_STANDARDS_URIS.keys())
    if not cls.COMMUNITY_STANDARDS:
        cls.COMMUNITY_STANDARDS = Preprocessor.get_metadata_standards()
        cls.COMMUNITY_STANDARDS_NAMES = list(cls.COMMUNITY_STANDARDS.keys())
    if not cls.SCIENCE_FILE_FORMATS:
        cls.SCIENCE_FILE_FORMATS = Preprocessor.get_science_file_formats()
    if not cls.LONG_TERM_FILE_FORMATS:
        cls.LONG_TERM_FILE_FORMATS = Preprocessor.get_long_term_file_formats()
    if not cls.OPEN_FILE_FORMATS:
        cls.OPEN_FILE_FORMATS = Preprocessor.get_open_file_formats()
    if not cls.DEFAULT_NAMESPACES:
        cls.DEFAULT_NAMESPACES = Preprocessor.getDefaultNamespaces()
    if not cls.VOCAB_NAMESPACES:
        cls.VOCAB_NAMESPACES = Preprocessor.getLinkedVocabs()
    if not cls.STANDARD_PROTOCOLS:
        cls.STANDARD_PROTOCOLS = Preprocessor.get_standard_protocols()
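
This is a lazy-initialization pattern: each class-level cache is filled from the Preprocessor on first use and reused by later assessments. A minimal sketch of the same idiom (the class name, loader, and placeholder value are hypothetical, not part of F-UJI):

class PredataCache:
    METRICS = None

    @classmethod
    def load(cls):
        # populate only on the first call; later calls return the cached dict
        if not cls.METRICS:
            cls.METRICS = {'FsF-F1-01D': 'placeholder metric definition'}
        return cls.METRICS

print(PredataCache.load() is PredataCache.load())  # True: same cached object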
Example #2
def get_metrics():  # noqa: E501
    """Return all metrics and their definitions.

    :rtype: Metrics
    """
    response = Preprocessor.get_metrics()
    return response, 200
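
Connexion treats the returned tuple as (body, HTTP status), so this handler serves the metrics catalogue with a 200. Assuming a locally running instance (the URL and port are illustrative; check your server.ini), a client call might look like:

import requests

resp = requests.get('http://localhost:1071/fuji/api/v1/metrics')  # hypothetical local endpoint
print(resp.status_code)  # expected: 200
print(resp.json())       # the metrics and their definitions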
Example #3
    def getNamespacesfromIRIs(self, meta_source):
        extractor = URLExtract()
        namespaces = set()
        if meta_source is not None:
            for url in set(extractor.gen_urls(str(meta_source))):
                namespace_candidate = url.rsplit('/', 1)[0]
                if namespace_candidate != url:
                    namespaces.add(namespace_candidate)
                else:
                    namespace_candidate = url.rsplit('#', 1)[0]
                    if namespace_candidate != url:
                        namespaces.add(namespace_candidate)

            vocabs = Preprocessor.getLinkedVocabs()
            lod_namespaces = [
                d['namespace'] for d in vocabs if 'namespace' in d
            ]
            for ns in namespaces:
                if ns + '/' in lod_namespaces:
                    self.namespaces.append(ns + '/')
                elif ns + '#' in lod_namespaces:
                    self.namespaces.append(ns + '#')
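
The namespace heuristic is a plain string split: cut everything after the last '/' (or, if that changes nothing, after the last '#') and keep the prefix only if something was actually stripped. In isolation:

url = 'http://schema.org/Dataset'
candidate = url.rsplit('/', 1)[0]
print(candidate)         # 'http://schema.org'
print(candidate != url)  # True, so it is kept as a namespace candidate
# candidates are then matched against the LOD list as candidate + '/' or candidate + '#'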
Example #4
class IdentifierHelper:
    IDENTIFIERS_ORG_DATA = Preprocessor.get_identifiers_org_data()
    identifier_schemes = []
    preferred_schema = None  # the preferred schema
    identifier_url = None
    identifier = None
    method = 'idutils'
    is_persistent = False

    def __init__(self, idstring):
        self.identifier = idstring
        self.normalized_id = self.identifier
        if len(self.identifier) > 4 and not self.identifier.isnumeric():
            generic_identifiers_org_pattern = r'^([a-z0-9._]+):(.+)'
            # idutils check
            self.identifier_schemes = idutils.detect_identifier_schemes(self.identifier)
            # identifiers.org check
            if not self.identifier_schemes:
                self.method = 'identifiers.org'
                idmatch = re.search(generic_identifiers_org_pattern, self.identifier)
                if idmatch:
                    found_prefix = idmatch[1]
                    found_suffix = idmatch[2]
                    if found_prefix in self.IDENTIFIERS_ORG_DATA:
                        if re.search(self.IDENTIFIERS_ORG_DATA[found_prefix]['pattern'], found_suffix):
                            self.identifier_schemes = [found_prefix, 'identifiers_org']
                            self.preferred_schema = found_prefix
                        self.identifier_url = str(
                            self.IDENTIFIERS_ORG_DATA[found_prefix]['url_pattern']).replace('{$id}', found_suffix)
                        self.normalized_id = found_prefix.lower() + ':' + found_suffix
            else:
                # preferred schema: prefer a specific scheme over the generic 'url'
                if len(self.identifier_schemes) > 0:
                    if len(self.identifier_schemes) > 1:
                        if 'url' in self.identifier_schemes:  # e.g. ['doi', 'url']
                            self.identifier_schemes.remove('url')
                    self.preferred_schema = self.identifier_schemes[0]
                    self.normalized_id = idutils.normalize_pid(self.identifier, self.preferred_schema)
                self.identifier_url = idutils.to_url(self.identifier, self.preferred_schema)
            if self.preferred_schema in Mapper.VALID_PIDS.value or self.preferred_schema in self.IDENTIFIERS_ORG_DATA:
                self.is_persistent = True

    def get_preferred_schema(self):
        return self.preferred_schema

    def get_identifier_schemes(self):
        return self.identifier_schemes

    def get_identifier_url(self):
        return self.identifier_url

    def get_normalized_id(self):
        return self.normalized_id
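
A usage sketch for the class above (the DOI is illustrative; actual results depend on idutils and the loaded identifiers.org registry data):

helper = IdentifierHelper('10.5281/zenodo.3950680')
print(helper.get_preferred_schema())  # expected 'doi', detected via idutils
print(helper.get_identifier_url())    # e.g. 'https://doi.org/10.5281/zenodo.3950680'
print(helper.is_persistent)           # True for schemes in Mapper.VALID_PIDS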
Example #5
class MetaDataCollectorSchemaOrg(MetaDataCollector):
    source_name = None
    SCHEMA_ORG_CONTEXT = Preprocessor.get_schema_org_context()

    def __init__(self, sourcemetadata, mapping, loggerinst, pidurl):
        self.pid_url = pidurl
        super().__init__(logger=loggerinst, mapping=mapping, sourcemetadata=sourcemetadata)

    def parse_metadata(self, ls=None):
        jsnld_metadata = {}
        ext_meta = None
        if self.source_metadata:
            self.source_name = self.getEnumSourceNames().SCHEMAORG_EMBED.value
            ext_meta = self.source_metadata[0]
        elif self.pid_url:
            self.source_name = self.getEnumSourceNames().SCHEMAORG_NEGOTIATE.value
            # TODO (IMPORTANT) PID agency may support Schema.org in JSON-LD
            # TODO (IMPORTANT) validate schema.org
            # fallback, request (doi) metadata specified in schema.org JSON-LD
            requestHelper = RequestHelper(self.pid_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.schemaorg)
            neg_source, ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
        if ext_meta is not None:
            self.getNamespacesfromIRIs(ext_meta)
            self.logger.info('FsF-F2-01M : Trying to extract schema.org JSON-LD metadata from -: {}'.format(self.source_name))
            # TODO check syntax - not ending with /, type and @type
            # TODO (important) extend mapping to detect other pids (link to related entities)?
            check_context_type = ["Dataset", "Collection"]
            try:
                if str(ext_meta['@context']).find('://schema.org') > -1:
                    if str(ext_meta['@type']).lower() not in self.SCHEMA_ORG_CONTEXT:
                        self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a schema.org object based on the given context type')
                    elif ext_meta['@type'] not in check_context_type:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD but seems not to be a research data object')
                    else:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD which seems to be valid, based on the given context type')
                        self.namespaces.append('http://schema.org/')
                    jsnld_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
                    # TODO all properties with null values extracted through jmespath should be excluded
                    if jsnld_metadata.get('creator') is None:
                        first = jsnld_metadata.get('creator_first')
                        last = jsnld_metadata.get('creator_last')
                        if isinstance(first, list) and isinstance(last, list):
                            if len(first) == len(last):
                                names = [str(i) + ' ' + str(j) for i, j in zip(first, last)]
                                jsnld_metadata['creator'] = names
                        elif first is not None and last is not None:
                            # avoid 'None None' creators when either name part is missing
                            jsnld_metadata['creator'] = [str(first) + ' ' + str(last)]

                    # TODO: instead of a custom check there should be a validator to evaluate the whole schema.org metadata
                    invalid_license = False
                    if jsnld_metadata.get('license'):
                        self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) -: {}'.format(
                            jsnld_metadata.get('license')))

                        if isinstance(jsnld_metadata.get('license'), list):
                            jsnld_metadata['license'] = jsnld_metadata['license'][0]
                        if isinstance(jsnld_metadata.get('license'), dict):
                            ls_type = jsnld_metadata.get('license').get('@type')
                            if ls_type == 'CreativeWork':
                                ls = jsnld_metadata.get('license').get('url')
                                if not ls:
                                    ls = jsnld_metadata.get('license').get('name')
                                if ls:
                                    jsnld_metadata['license'] = ls
                                else:
                                    invalid_license = True
                            else:
                                invalid_license = True
                    if invalid_license:
                        self.logger.warning('FsF-R1.1-01M : Looks like schema.org representation of license is incorrect, skipping the test.')
                        jsnld_metadata['license'] = None

                    # filter out None values of related_resources
                    if jsnld_metadata.get('related_resources'):
                        relateds = [d for d in jsnld_metadata['related_resources'] if d['related_resource'] is not None]
                        if relateds:
                            jsnld_metadata['related_resources'] = relateds
                            self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
                        else:
                            del jsnld_metadata['related_resources']
                            self.logger.info('FsF-I3-01M : No related resource(s) found in Schema.org metadata')

                    # TODO quick-fix, expand mapping expression instead
                    if jsnld_metadata.get('object_size'):
                        jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' ' + jsnld_metadata['object_size'].get('unitText')

                else:
                    self.logger.info('FsF-F2-01M : Found JSON-LD but the @context does not refer to schema.org')

            except Exception as err:
                self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org -: {}'.format(err))
        else:
            self.logger.info('FsF-F2-01M : Could not identify JSON-LD schema.org metadata')

        return self.source_name, jsnld_metadata
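
The creator fallback above pairs the first- and last-name lists element-wise; the zip step on its own, with made-up names:

first = ['Ada', 'Grace']
last = ['Lovelace', 'Hopper']
names = [str(i) + ' ' + str(j) for i, j in zip(first, last)]
print(names)  # ['Ada Lovelace', 'Grace Hopper']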
Example #6
def main():
    logging.getLogger('connexion.operation').setLevel('INFO')
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(ROOT_DIR, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    LOV_API = config['EXTERNAL']['lov_api']
    LOD_CLOUDNET = config['EXTERNAL']['lod_cloudnet']
    #BIOPORTAL_REST = config['EXTERNAL']['bioportal_rest']
    #BIOPORTAL_APIKEY = config['EXTERNAL']['bioportal_apikey']
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']

    #TODO further implementation on authentication needed
    auth_enabled = config.getboolean('USER', 'auth_enabled')
    usr = config['USER']['usr']
    pwd = config['USER']['pwd']
    authen.service_username = usr
    authen.service_password = pwd

    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit,
                                  metric_specification)
    logger.info('Total metrics defined: {}'.format(
        preproc.get_total_metrics()))

    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    #preproc.retrieve_linkedvocabs(lov_api=LOV_API, lodcloud_api=LOD_CLOUDNET, bioportal_api=BIOPORTAL_REST, bioportal_key=BIOPORTAL_APIKEY, isDebugMode=False)
    preproc.retrieve_linkedvocabs(lov_api=LOV_API,
                                  lodcloud_api=LOD_CLOUDNET,
                                  isDebugMode=isDebug)
    preproc.retrieve_default_namespaces()
    preproc.set_remote_log_info(config['SERVICE']['remote_log_host'],
                                config['SERVICE']['remote_log_path'])

    logger.info('Total SPDX licenses : {}'.format(
        preproc.get_total_licenses()))
    logger.info('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    logger.info(
        'Total subjects area of imported metadata standards : {}'.format(
            len(preproc.metadata_standards)))
    logger.info('Total LD vocabs imported : {}'.format(
        len(preproc.getLinkedVocabs())))
    logger.info('Total default namespaces specified : {}'.format(
        len(preproc.getDefaultNamespaces())))

    # you can also use Tornado or gevent as the HTTP server; to do so, set server to 'tornado' or 'gevent'
    app = connexion.FlaskApp(__name__, specification_dir=YAML_DIR)
    API_YAML = os.path.join(ROOT_DIR, YAML_DIR,
                            config['SERVICE']['swagger_yaml'])
    app.app.json_encoder = encoder.JSONEncoder
    api_title = 'F-UJI : FAIRsFAIR Research Data Object Assessment Service'
    if auth_enabled:
        api_args = {'title': api_title, 'security': [{'basicAuth': []}]}
    else:
        api_args = {'title': api_title}

    app.add_api(API_YAML, arguments=api_args, validate_responses=True)
    app.app.wsgi_app = ProxyFix(app.app.wsgi_app, x_for=1, x_host=1)
    if os.getenv('ENABLE_CORS', 'False').lower() == 'true':
        CORS(app.app)
    app.run(host=config['SERVICE']['service_host'],
            port=int(config['SERVICE']['service_port']))
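
main() pulls every setting from an INI file via configparser; a minimal server.ini sketch covering the keys read above (all values are placeholders, not F-UJI defaults):

[SERVICE]
yaml_directory = yaml
metrics_yaml = metrics.yaml
swagger_yaml = swagger.yaml
service_host = localhost
service_port = 1071
debug_mode = True
data_files_limit = 3
metric_specification = https://example.org/metric_spec
remote_log_host = logs.example.org
remote_log_path = /remote/log

[EXTERNAL]
spdx_license_github = https://example.org/spdx/licenses.json
datacite_api_repo = https://example.org/datacite/repositories
re3data_api = https://example.org/re3data/api
metadata_catalog = https://example.org/metadata-catalog
lov_api = https://example.org/lov/api
lod_cloudnet = https://example.org/lod/cloud.json

[USER]
auth_enabled = false
usr = username
pwd = password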
Example #7
def main():
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')

    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))

    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)

    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))
    start = False
    for identifier in testpids:
        print(identifier)
        if identifier == startpid or not startpid:
            start = True
        if start:
            ft = FAIRCheck(uid=identifier, test_debug=debug)
            uid_result, pid_result = ft.check_unique_persistent()
            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            check_searchable_result = ft.check_searchable()
            license_result = ft.check_license()
            relatedresources_result = ft.check_relatedresources()
            access_level_result = ft.check_data_access_level()
            data_file_format_result = ft.check_data_file_format()
            data_provenance_result = ft.check_data_provenance()
            community_standards_result = ft.check_community_metadatastandards()
            data_content_metadata = ft.check_data_content_metadata()
            results = [
                uid_result, pid_result, core_metadata_result,
                content_identifier_included_result, check_searchable_result,
                access_level_result, license_result, data_file_format_result,
                data_provenance_result, community_standards_result,
                data_content_metadata
            ]
            #results=[data_file_format_result]
            print(json.dumps(results, indent=4, sort_keys=True))
Example #8
class RepositoryHelper:

    DATACITE_REPOSITORIES = Preprocessor.getRE3repositories()
    ns = {"r3d": "http://www.re3data.org/schema/2-2"}
    RE3DATA_APITYPES = ['OAI-PMH', 'SOAP', 'SPARQL', 'SWORD', 'OpenDAP']

    def __init__(self, client, pidscheme, logger):
        self.client_id = client
        self.pid_scheme = pidscheme
        self.re3metadata_raw = None
        self.repository_name = None
        self.repository_url = None
        self.repo_apis = {}
        self.repo_standards = []
        self.logger = logging.getLogger(logger)
        #print(__name__)
    def lookup_re3data(self):
        if self.client_id and self.pid_scheme:

            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
                self.client_id)  # {client_id,re3doi}
            #print(self.client_id,'Re3DOI',re3doi, idutils.is_doi(re3doi))
            if re3doi:
                if idutils.is_doi(re3doi):
                    short_re3doi = idutils.normalize_pid(
                        re3doi, scheme='doi')  #https://doi.org/10.17616/R3XS37
                else:
                    re3doi = None

            # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
            if re3doi:
                self.logger.info(
                    'FsF-R1.3-01M : Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                try:
                    if isinstance(xml, bytes):
                        xml = xml.decode().encode()
                    root = etree.fromstring(xml)

                    #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                    re3link = root.xpath('//link')[0].attrib['href']
                    if re3link is not None:
                        self.logger.info(
                            'FsF-R1.3-01M : Found match re3data metadata record -: '
                            + str(re3link))
                        # query repository metadata
                        q2 = RequestHelper(url=re3link)
                        q2.setAcceptType(AcceptTypes.xml)
                        re3_source, re3_response = q2.content_negotiate(
                            metric_id='RE3DATA')
                        self.re3metadata_raw = re3_response
                        self.parseRepositoryMetadata()
                except Exception as e:
                    self.logger.warning(
                        'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: '
                        + str(e))
            else:
                self.logger.warning(
                    'FsF-R1.3-01M : No DOI of client id is available from datacite api'
                )

    def parseRepositoryMetadata(self):
        #http://schema.re3data.org/3-0/re3data-example-V3-0.xml
        root = etree.fromstring(self.re3metadata_raw)
        # ns = {k: v for k, v in root.nsmap.items() if k}
        name = root.xpath('//r3d:repositoryName',
                          namespaces=RepositoryHelper.ns)
        url = root.xpath('//r3d:repositoryURL', namespaces=RepositoryHelper.ns)
        if name:
            self.repository_name = name[0].text
        if url:
            self.repository_url = url[0].text
        apis = root.xpath('//r3d:api', namespaces=RepositoryHelper.ns)
        for a in apis:
            apiType = a.attrib['apiType']
            if apiType in RepositoryHelper.RE3DATA_APITYPES:
                self.repo_apis[a.attrib['apiType']] = a.text
        standards = root.xpath(
            '//r3d:metadataStandard/r3d:metadataStandardName',
            namespaces=RepositoryHelper.ns)
        #we only use the name as the url specified in re3data is dcc-based, e.g., http://www.dcc.ac.uk/resources/metadata-standards/dif-directory-interchange-format
        self.repo_standards = [s.text for s in standards]

    def getRe3MetadataStandards(self):
        return self.repo_standards

    def getRe3MetadataAPIs(self):
        return self.repo_apis

    def getRepoNameURL(self):
        return self.repository_name, self.repository_url
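
A usage sketch for RepositoryHelper (the client id and logger name are made up; a real client id would come from the DataCite API for the assessed PID):

helper = RepositoryHelper(client='some.datacite.client', pidscheme='doi', logger='fuji.repohelper')
helper.lookup_re3data()
name, url = helper.getRepoNameURL()
print(name, url)
print(helper.getRe3MetadataAPIs())       # e.g. {'OAI-PMH': 'https://...'}
print(helper.getRe3MetadataStandards())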
Example #9
def main():
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']

    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit,
                                  metric_specification)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))

    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)

    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))
    start = False
    usedatacite = True
    tracemalloc.start()
    n = 1
    for identifier in testpids:
        print(identifier)
        print(n)
        n += 1
        if identifier == startpid or not startpid:
            start = True
        if start:
            ft = FAIRCheck(uid=identifier,
                           test_debug=debug,
                           metadata_service_url=metadata_service_endpoint,
                           metadata_service_type=metadata_service_type,
                           use_datacite=usedatacite)

            #ft = FAIRCheck(uid=identifier,  test_debug=True, use_datacite=usedatacite)

            uid_result, pid_result = ft.check_unique_persistent()
            ft.retrieve_metadata_embedded(ft.extruct_result)
            include_embedded = True
            if ft.repeat_pid_check:
                uid_result, pid_result = ft.check_unique_persistent()
            ft.retrieve_metadata_external()

            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            access_level_result = ft.check_data_access_level()
            license_result = ft.check_license()
            relatedresources_result = ft.check_relatedresources()
            check_searchable_result = ft.check_searchable()
            data_content_metadata = ft.check_data_content_metadata()
            data_file_format_result = ft.check_data_file_format()
            community_standards_result = ft.check_community_metadatastandards()
            data_provenance_result = ft.check_data_provenance()
            formal_representation_result = ft.check_formal_metadata()
            semantic_vocabulary_result = ft.check_semantic_vocabulary()
            metadata_preserved_result = ft.check_metadata_preservation()
            standard_protocol_metadata_result = ft.check_standardised_protocol_metadata()
            standard_protocol_data_result = ft.check_standardised_protocol_data()

            results = [
                uid_result, pid_result, core_metadata_result,
                content_identifier_included_result, check_searchable_result,
                access_level_result, formal_representation_result,
                semantic_vocabulary_result, license_result,
                data_file_format_result, data_provenance_result,
                relatedresources_result, community_standards_result,
                data_content_metadata, metadata_preserved_result,
                standard_protocol_data_result,
                standard_protocol_metadata_result
            ]
            #results=[core_metadata_result,uid_result, pid_result]
            #print(ft.metadata_merged)
            debug_messages = ft.get_log_messages_dict()
            ft.logger_message_stream.flush()
            ft.get_assessment_summary(results)
            for res_k, res_v in enumerate(results):
                if ft.isDebug:
                    debug_list = debug_messages.get(res_v['metric_identifier'])
                    if debug_list is not None:
                        results[res_k]['test_debug'] = debug_list
                    else:
                        results[res_k]['test_debug'] = ['INFO: No debug messages received']
                else:
                    results[res_k]['test_debug'] = ['INFO: Debugging disabled']
                    debug_messages = {}
            print(json.dumps(results, indent=4, sort_keys=True))
            # remove unused logger handlers and filters to avoid memory leaks
            ft.logger.handlers = [ft.logger.handlers[-1]]
            current, peak = tracemalloc.get_traced_memory()
            print(
                f"Current memory usage is {current / 10 ** 6}MB; Peak was {peak / 10 ** 6}MB"
            )
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('traceback')

            # pick the biggest memory block
            stat = top_stats[0]
            print("%s memory blocks: %.1f KiB" %
                  (stat.count, stat.size / 1024))
            for line in stat.traceback.format():
                print(line)

            for i, stat in enumerate(snapshot.statistics('filename')[:5], 1):
                print(i, str(stat))

            gc.collect()
    tracemalloc.stop()
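
The memory diagnostics in this variant boil down to a standard tracemalloc pattern; distilled, with a placeholder workload:

import tracemalloc

tracemalloc.start()
data = [bytes(1000) for _ in range(1000)]  # placeholder workload
current, peak = tracemalloc.get_traced_memory()
print(f'Current memory usage is {current / 10 ** 6}MB; Peak was {peak / 10 ** 6}MB')
for stat in tracemalloc.take_snapshot().statistics('filename')[:5]:
    print(stat)
tracemalloc.stop()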
Example #10
def main():
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']

    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit, metric_specification)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))

    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)

    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(len(preproc.metadata_standards)))
    start = False
    for identifier in testpids:
        print(identifier)
        if identifier == startpid or not startpid:
            start = True
        if start:
            ft = FAIRCheck(uid=identifier, test_debug=True, use_datacite=False)
            uid_result, pid_result = ft.check_unique_persistent()
            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            access_level_result = ft.check_data_access_level()
            license_result = ft.check_license()
            relatedresources_result = ft.check_relatedresources()
            check_searchable_result = ft.check_searchable()
            data_file_format_result = ft.check_data_file_format()
            community_standards_result = ft.check_community_metadatastandards()
            data_provenance_result = ft.check_data_provenance()
            data_content_metadata = ft.check_data_content_metadata()
            formal_representation_result = ft.check_formal_metadata()
            semantic_vocabulary_result = ft.check_semantic_vocabulary()
            metadata_preserved_result = ft.check_metadata_preservation()
            standard_protocol_data_result = ft.check_standardised_protocol_data()
            standard_protocol_metadata_result = ft.check_standardised_protocol_metadata()
            results = [
                uid_result, pid_result, core_metadata_result,
                content_identifier_included_result, check_searchable_result,
                access_level_result, formal_representation_result,
                semantic_vocabulary_result, license_result,
                data_file_format_result, data_provenance_result,
                relatedresources_result, community_standards_result,
                data_content_metadata, metadata_preserved_result,
                standard_protocol_data_result, standard_protocol_metadata_result
            ]
            #results=[core_metadata_result,uid_result, pid_result]
            #print(ft.metadata_merged)
            for res_k, res_v in enumerate(results):
                if ft.isDebug:
                    debug_list = ft.msg_filter.getMessage(res_v['metric_identifier'])
                    if debug_list is not None:
                        results[res_k]['test_debug'] = debug_list
                    else:
                        results[res_k]['test_debug'] = ['INFO: No debug messages received']
                else:
                    results[res_k]['test_debug'] = ['INFO: Debugging disabled']

            print(json.dumps(results, indent=4, sort_keys=True))
Example #11
class RepositoryHelper:

    DATACITE_REPOSITORIES = Preprocessor.getRE3repositories()
    ns = {"r3d": "http://www.re3data.org/schema/2-2"}
    RE3DATA_APITYPES = ['OAI-PMH', 'SOAP', 'SPARQL', 'SWORD', 'OpenDAP']

    def __init__(self, client, pidscheme):
        self.client_id = client
        self.pid_scheme = pidscheme
        self.re3metadata_raw = None
        self.repository_name = None
        self.repository_url = None
        self.repo_apis = {}
        self.repo_standards = []
        self.logger = logging.getLogger(self.__class__.__name__)

    def lookup_re3data(self):
        if self.client_id and self.pid_scheme:
            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
                self.client_id)  # {client_id: re3doi}
            short_re3doi = None
            if re3doi and idutils.is_doi(re3doi):
                # e.g. https://doi.org/10.17616/R3XS37
                short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')
            else:
                re3doi = None
            # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
            if re3doi:
                self.logger.info('Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                root = etree.fromstring(xml.content)
                #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                re3link = root.xpath('//link')[0].attrib['href']
                if re3link is not None:
                    self.logger.info('Found match re3data metadata record')
                    # query repository metadata
                    q2 = RequestHelper(url=re3link)
                    q2.setAcceptType(AcceptTypes.xml)
                    re3_source, re3_response = q2.content_negotiate(
                        metric_id='RE3DATA')
                    self.re3metadata_raw = re3_response.content
                    self.parseRepositoryMetadata()
            else:
                self.logger.warning(
                    'No DOI of client id is available from datacite api')

    def parseRepositoryMetadata(self):
        #http://schema.re3data.org/3-0/re3data-example-V3-0.xml
        root = etree.fromstring(self.re3metadata_raw)
        # ns = {k: v for k, v in root.nsmap.items() if k}
        name = root.xpath('//r3d:repositoryName',
                          namespaces=RepositoryHelper.ns)
        url = root.xpath('//r3d:repositoryURL', namespaces=RepositoryHelper.ns)
        if name:
            self.repository_name = name[0].text
        if url:
            self.repository_url = url[0].text
        apis = root.xpath('//r3d:api', namespaces=RepositoryHelper.ns)
        for a in apis:
            apiType = a.attrib['apiType']
            if apiType in RepositoryHelper.RE3DATA_APITYPES:
                self.repo_apis[a.attrib['apiType']] = a.text
        standards = root.xpath(
            '//r3d:metadataStandard/r3d:metadataStandardName',
            namespaces=RepositoryHelper.ns)
        self.repo_standards = [s.text for s in standards]

    def getRe3MetadataStandards(self):
        return self.repo_standards

    def getRe3MetadataAPIs(self):
        return self.repo_apis

    def getRepoNameURL(self):
        return self.repository_name, self.repository_url
Example #12
def main():
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')

    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))

    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)

    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))

    ft = FAIRCheck(uid=identifier, oai=oai_pmh, test_debug=debug)
    uid_result, pid_result = ft.check_unique_persistent()
    core_metadata_result = ft.check_minimal_metatadata()
    content_identifier_included_result = ft.check_content_identifier_included()
    check_searchable_result = ft.check_searchable()
    license_result = ft.check_license()
    relatedresources_result = ft.check_relatedresources()
    results = [
        uid_result, pid_result, core_metadata_result,
        content_identifier_included_result, license_result
    ]
    # put the debug messages at the right place...
    for result_index, result in enumerate(results):
        results[result_index]['test_debug'] = ft.msg_filter.getMessage(
            result.get('metric_identifier'))

    print(json.dumps(results, indent=4, sort_keys=True))