def load_predata(cls):
    """Lazily populate the class-level lookup caches from the Preprocessor.

    ``FILES_LIMIT`` is refreshed on every call; each remaining cache is only
    fetched when it is still empty, so repeated invocations are cheap.
    """
    cls.FILES_LIMIT = Preprocessor.data_files_limit
    if not cls.METRICS:
        cls.METRICS = Preprocessor.get_custom_metrics(['metric_name', 'total_score'])
    if not cls.SPDX_LICENSES:
        # cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES, cls.SPDX_LICENSE_URLS = Preprocessor.get_licenses()
        cls.SPDX_LICENSES, cls.SPDX_LICENSE_NAMES = Preprocessor.get_licenses()
    if not cls.COMMUNITY_METADATA_STANDARDS_URIS:
        cls.COMMUNITY_METADATA_STANDARDS_URIS = Preprocessor.get_metadata_standards_uris()
        # keep a plain list of the URI keys alongside the mapping
        cls.COMMUNITY_METADATA_STANDARDS_URIS_LIST = list(cls.COMMUNITY_METADATA_STANDARDS_URIS.keys())
    if not cls.COMMUNITY_STANDARDS:
        cls.COMMUNITY_STANDARDS = Preprocessor.get_metadata_standards()
        cls.COMMUNITY_STANDARDS_NAMES = list(cls.COMMUNITY_STANDARDS.keys())
    # the remaining caches are plain fetch-once lookups with no derived data,
    # so they can be loaded uniformly
    simple_caches = (
        ('SCIENCE_FILE_FORMATS', Preprocessor.get_science_file_formats),
        ('LONG_TERM_FILE_FORMATS', Preprocessor.get_long_term_file_formats),
        ('OPEN_FILE_FORMATS', Preprocessor.get_open_file_formats),
        ('DEFAULT_NAMESPACES', Preprocessor.getDefaultNamespaces),
        ('VOCAB_NAMESPACES', Preprocessor.getLinkedVocabs),
        ('STANDARD_PROTOCOLS', Preprocessor.get_standard_protocols),
    )
    for cache_attr, loader in simple_caches:
        if not getattr(cls, cache_attr):
            setattr(cls, cache_attr, loader())
def get_metrics():  # noqa: E501
    """Return all metrics and their definitions.

    :rtype: Metrics
    """
    # Delegate straight to the preprocessor cache; 200 is the HTTP status.
    return Preprocessor.get_metrics(), 200
def getNamespacesfromIRIs(self, meta_source):
    """Collect linked-open-data namespaces from IRIs found in raw metadata.

    Extracts every URL from the string form of ``meta_source``, derives
    candidate namespaces by stripping the last path segment (``/...``) and
    the fragment (``#...``), and appends candidates that match a known LOD
    vocabulary namespace to ``self.namespaces``.

    :param meta_source: any object whose ``str()`` form may contain IRIs;
        ``None`` is tolerated and yields no candidates.
    """
    extractor = URLExtract()
    namespaces = set()
    if meta_source is not None:
        for url in set(extractor.gen_urls(str(meta_source))):
            # Candidate 1: strip the last path segment.
            path_candidate = url.rsplit('/', 1)[0]
            if path_candidate != url:
                namespaces.add(path_candidate)
            # Candidate 2: strip the fragment. The previous code only tried
            # this when the URL contained no '/', which can never happen for
            # an absolute IRI, so '#'-terminated vocabulary namespaces were
            # never detected even though they are matched below.
            fragment_candidate = url.rsplit('#', 1)[0]
            if fragment_candidate != url:
                namespaces.add(fragment_candidate)
    vocabs = Preprocessor.getLinkedVocabs()
    lod_namespaces = [d['namespace'] for d in vocabs if 'namespace' in d]
    for ns in namespaces:
        # LOD vocab namespaces are stored with their trailing separator
        if ns + '/' in lod_namespaces:
            self.namespaces.append(ns + '/')
        elif ns + '#' in lod_namespaces:
            self.namespaces.append(ns + '#')
class IdentifierHelper:
    """Detect the scheme of a given identifier string and derive its
    normalized form and resolvable URL.

    Detection is attempted first with ``idutils``; if that fails, the
    identifiers.org registry (prefix:suffix form) is consulted.
    """

    IDENTIFIERS_ORG_DATA = Preprocessor.get_identifiers_org_data()
    identifier_schemes = []
    preferred_schema = None  # the preferred schema
    identifier_url = None
    identifier = None
    method = 'idutils'
    is_persistent = False

    def __init__(self, idstring):
        """:param idstring: the raw identifier to classify."""
        self.identifier = idstring
        self.normalized_id = self.identifier
        # very short or purely numeric strings cannot be meaningful PIDs
        if len(self.identifier) > 4 and not self.identifier.isnumeric():
            # raw string: '\.' in a plain literal is an invalid escape
            # sequence (DeprecationWarning today, an error in future Pythons)
            generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'
            # idutils check
            self.identifier_schemes = idutils.detect_identifier_schemes(self.identifier)
            # identifiers.org check
            if not self.identifier_schemes:
                self.method = 'identifiers.org'
                idmatch = re.search(generic_identifiers_org_pattern, self.identifier)
                if idmatch:
                    found_prefix = idmatch[1]
                    found_suffix = idmatch[2]
                    if found_prefix in self.IDENTIFIERS_ORG_DATA:
                        # the suffix must match the registry's per-prefix pattern
                        if re.search(self.IDENTIFIERS_ORG_DATA[found_prefix]['pattern'], found_suffix):
                            self.identifier_schemes = [found_prefix, 'identifiers_org']
                            self.preferred_schema = found_prefix
                        self.identifier_url = str(self.IDENTIFIERS_ORG_DATA[found_prefix]['url_pattern']).replace(
                            '{$id}', found_suffix)
                        self.normalized_id = found_prefix.lower() + ':' + found_suffix
            else:
                # preferred schema
                if len(self.identifier_schemes) > 0:
                    if len(self.identifier_schemes) > 1:
                        # idutils tends to report 'url' alongside the real
                        # scheme, e.g. ['doi', 'url']; drop the generic one
                        if 'url' in self.identifier_schemes:
                            self.identifier_schemes.remove('url')
                    self.preferred_schema = self.identifier_schemes[0]
                    self.normalized_id = idutils.normalize_pid(self.identifier, self.preferred_schema)
                    self.identifier_url = idutils.to_url(self.identifier, self.preferred_schema)
            # persistent if the scheme is a recognised PID scheme or an
            # identifiers.org prefix
            if self.preferred_schema in Mapper.VALID_PIDS.value or self.preferred_schema in self.IDENTIFIERS_ORG_DATA:
                self.is_persistent = True

    def get_preferred_schema(self):
        """Return the single scheme chosen for normalization, or None."""
        return self.preferred_schema

    def get_identifier_schemes(self):
        """Return all detected scheme names (possibly empty)."""
        return self.identifier_schemes

    def get_identifier_url(self):
        """Return the resolvable URL form of the identifier, or None."""
        return self.identifier_url

    def get_normalized_id(self):
        """Return the normalized identifier (falls back to the input)."""
        return self.normalized_id
class MetaDataCollectorSchemaOrg (MetaDataCollector):
    """Collector for schema.org JSON-LD metadata.

    The metadata is taken either from JSON-LD already embedded in the landing
    page (``sourcemetadata``) or, as a fallback, via content negotiation on
    the PID URL.
    """
    # name of the metadata source actually used (embedded vs. negotiated)
    source_name=None
    # known schema.org context type names, preloaded once per process
    SCHEMA_ORG_CONTEXT = Preprocessor.get_schema_org_context()

    def __init__(self, sourcemetadata, mapping, loggerinst, pidurl):
        """Store the PID URL and delegate the rest to the base collector.

        :param sourcemetadata: embedded JSON-LD objects (list) or falsy
        :param mapping: jmespath mapping enum used by parse_metadata
        :param loggerinst: logger instance passed to the base class
        :param pidurl: PID URL used for content negotiation fallback
        """
        #self.is_pid = ispid
        self.pid_url = pidurl
        super().__init__(logger=loggerinst, mapping=mapping, sourcemetadata=sourcemetadata)

    def parse_metadata(self, ls=None):
        """Parse schema.org JSON-LD and map it to the internal metadata dict.

        :return: tuple ``(source_name, metadata_dict)``; the dict is empty
            when no usable schema.org object was found.
        """
        jsnld_metadata = {}
        ext_meta=None
        if self.source_metadata:
            # JSON-LD was already embedded in the landing page
            self.source_name = self.getEnumSourceNames().SCHEMAORG_EMBED.value
            ext_meta = self.source_metadata[0]
        elif self.pid_url:
            self.source_name = self.getEnumSourceNames().SCHEMAORG_NEGOTIATE.value
            # TODO (IMPORTANT) PID agency may support Schema.org in JSON-LD
            # TODO (IMPORTANT) validate schema.org
            # fallback, request (doi) metadata specified in schema.org JSON-LD
            requestHelper: RequestHelper = RequestHelper(self.pid_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.schemaorg)
            neg_source,ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
        if ext_meta is not None:
            # harvest LOD namespaces from any IRIs in the raw metadata
            self.getNamespacesfromIRIs(ext_meta)
            self.logger.info('FsF-F2-01M : Trying to extract schema.org JSON-LD metadata from -: {}'.format(self.source_name))
            # TODO check syntax - not ending with /, type and @type
            # TODO (important) extend mapping to detect other pids (link to related entities)?
            check_context_type = ["Dataset", "Collection"]
            try:
                #if ext_meta['@context'] in check_context_type['@context'] and ext_meta['@type'] in check_context_type["@type"]:
                if str(ext_meta['@context']).find('://schema.org') > -1:
                    if str(ext_meta['@type']).lower() not in self.SCHEMA_ORG_CONTEXT:
                        self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a schema.org object based on the given context type')
                    elif ext_meta['@type'] not in check_context_type:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD but seems not to be a research data object')
                    else:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD which seems to be valid, based on the given context type')
                        self.namespaces.append('http://schema.org/')
                        # NOTE(review): jmespath.search may return None for a
                        # non-matching document; the AttributeError below is
                        # then caught by the generic except — confirm intended
                        jsnld_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
                        # TODO all properties with null values extracted through jmespath should be excluded
                        if jsnld_metadata.get('creator') is None:
                            #TODO: handle None values for first and last name
                            # no single 'creator' mapped: build full names from
                            # separate given/family name fields
                            first = jsnld_metadata.get('creator_first')
                            last = jsnld_metadata.get('creator_last')
                            if isinstance(first, list) and isinstance(last, list):
                                if len(first) == len(last):
                                    names = [str(i) + " " + str(j) for i, j in zip(first, last)]
                                    jsnld_metadata['creator'] = names
                            else:
                                jsnld_metadata['creator'] = [str(first) + " " + str(last)]
                        #TODO instead of custom check there should a valdiator to evaluate the whole schema.org metadata
                        invalid_license = False
                        if jsnld_metadata.get('license'):
                            self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) -: {}'.format(
                                jsnld_metadata.get('license')))
                            if isinstance(jsnld_metadata.get('license'), list):
                                # only the first license entry is evaluated
                                jsnld_metadata['license'] = jsnld_metadata['license'][0]
                            if isinstance(jsnld_metadata.get('license'), dict):
                                # a CreativeWork license object: prefer its
                                # url, fall back to its name
                                ls_type = jsnld_metadata.get('license').get('@type')
                                if ls_type =='CreativeWork':
                                    ls = jsnld_metadata.get('license').get('url')
                                    if not ls:
                                        ls = jsnld_metadata.get('license').get('name')
                                    if ls:
                                        jsnld_metadata['license'] = ls
                                    else:
                                        invalid_license = True
                                else:
                                    invalid_license = True
                        if invalid_license:
                            self.logger.warning('FsF-R1.1-01M : Looks like schema.org representation of license is incorrect, skipping the test.')
                            jsnld_metadata['license'] = None

                        # filter out None values of related_resources
                        if jsnld_metadata.get('related_resources'):
                            relateds = [d for d in jsnld_metadata['related_resources'] if d['related_resource'] is not None]
                            if relateds:
                                jsnld_metadata['related_resources'] = relateds
                                self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
                            else:
                                del jsnld_metadata['related_resources']
                                self.logger.info('FsF-I3-01M : No related resource(s) found in Schema.org metadata')

                        # TODO quick-fix, expand mapping expression instead
                        # flatten {value, unitText} size objects to "N unit"
                        if jsnld_metadata.get('object_size'):
                            jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' '+ jsnld_metadata['object_size'].get('unitText')
                else:
                    self.logger.info('FsF-F2-01M : Found JSON-LD schema.org but record is not of type "Dataset"')
            except Exception as err:
                #print(err.with_traceback())
                self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org -: {}'.format(err))
        else:
            self.logger.info('FsF-F2-01M : Could not identify JSON-LD schema.org metadata')

        return self.source_name, jsnld_metadata
def main():
    """Bootstrap the assessment service.

    Reads configuration, warms up the Preprocessor caches (metrics, SPDX
    licenses, re3data repositories, metadata standards, LOD vocabularies,
    default namespaces) and starts the connexion/Flask application.
    """
    logging.getLogger('connexion.operation').setLevel('INFO')
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(ROOT_DIR, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    LOV_API = config['EXTERNAL']['lov_api']
    LOD_CLOUDNET = config['EXTERNAL']['lod_cloudnet']
    #BIOPORTAL_REST = config['EXTERNAL']['bioportal_rest']
    #BIOPORTAL_APIKEY = config['EXTERNAL']['bioportal_apikey']
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']
    #TODO further implementation on authentication needed
    auth_enabled = config.getboolean('USER', 'auth_enabled')
    usr = config['USER']['usr']
    pwd = config['USER']['pwd']
    authen.service_username = usr
    authen.service_password = pwd
    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit, metric_specification)
    logger.info('Total metrics defined: {}'.format(preproc.get_total_metrics()))
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    # warm up all external lookup caches before serving requests
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    #preproc.retrieve_linkedvocabs(lov_api=LOV_API, lodcloud_api=LOD_CLOUDNET, bioportal_api=BIOPORTAL_REST, bioportal_key=BIOPORTAL_APIKEY, isDebugMode=False)
    preproc.retrieve_linkedvocabs(lov_api=LOV_API, lodcloud_api=LOD_CLOUDNET, isDebugMode=isDebug)
    preproc.retrieve_default_namespaces()
    preproc.set_remote_log_info(config['SERVICE']['remote_log_host'], config['SERVICE']['remote_log_path'])
    logger.info('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    logger.info('Total re3repositories found from datacite api : {}'.format(len(preproc.getRE3repositories())))
    logger.info('Total subjects area of imported metadata standards : {}'.format(len(preproc.metadata_standards)))
    logger.info('Total LD vocabs imported : {}'.format(len(preproc.getLinkedVocabs())))
    logger.info('Total default namespaces specified : {}'.format(len(preproc.getDefaultNamespaces())))
    #you can also use Tornado or gevent as the HTTP server, to do so set server to tornado or gevent
    app = connexion.FlaskApp(__name__, specification_dir=YAML_DIR)
    API_YAML = os.path.join(ROOT_DIR, YAML_DIR, config['SERVICE']['swagger_yaml'])
    app.app.json_encoder = encoder.JSONEncoder
    api_title = 'F-UJI : FAIRsFAIR Research Data Object Assessment Service'
    if auth_enabled:
        # enable HTTP basic auth on the API when configured
        api_args = {'title': api_title, 'security': [{'basicAuth': []}]}
    else:
        api_args = {'title': api_title}
    app.add_api(API_YAML, arguments=api_args, validate_responses=True)
    # honor X-Forwarded-* headers when running behind a reverse proxy
    app.app.wsgi_app = ProxyFix(app.app.wsgi_app, x_for=1, x_host=1)
    if os.getenv('ENABLE_CORS', 'False').lower() == 'true':
        CORS(app.app)
    app.run(host=config['SERVICE']['service_host'], port=int(config['SERVICE']['service_port']))
def main():
    """Ad-hoc test driver: run a subset of FAIR checks for each PID in the
    module-level ``testpids`` list and print the results as JSON.

    ``startpid`` (module-level) allows resuming a partially completed run.
    """
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)
    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))
    start = False
    for identifier in testpids:
        print(identifier)
        # skip PIDs until startpid is reached (resume support)
        if identifier == startpid or not startpid:
            start = True
        if start:
            ft = FAIRCheck(uid=identifier, test_debug=debug)
            uid_result, pid_result = ft.check_unique_persistent()
            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            check_searchable_result = ft.check_searchable()
            license_result = ft.check_license()
            # NOTE(review): relatedresources_result is computed but not
            # included in the results list below — confirm intended
            relatedresources_result = ft.check_relatedresources()
            access_level_result = ft.check_data_access_level()
            data_file_format_result = ft.check_data_file_format()
            data_provenance_result = ft.check_data_provenance()
            community_standards_result = ft.check_community_metadatastandards()
            data_content_metadata = ft.check_data_content_metadata()
            results = [
                uid_result, pid_result, core_metadata_result,
                content_identifier_included_result, check_searchable_result,
                access_level_result, license_result, data_file_format_result,
                data_provenance_result, community_standards_result,
                data_content_metadata
            ]
            #results=[data_file_format_result]
            print(json.dumps(results, indent=4, sort_keys=True))
class RepositoryHelper:
    """Resolve a DataCite client id to its re3data record and extract the
    repository's name, URL, APIs and supported metadata standards."""

    # mapping client_id -> re3data DOI, preloaded once via the Preprocessor
    DATACITE_REPOSITORIES = Preprocessor.getRE3repositories()
    ns = {"r3d": "http://www.re3data.org/schema/2-2"}
    # API types of interest in re3data records
    RE3DATA_APITYPES = ['OAI-PMH', 'SOAP', 'SPARQL', 'SWORD', 'OpenDAP']

    def __init__(self, client, pidscheme, logger):
        """
        :param client: DataCite client id of the repository
        :param pidscheme: PID scheme of the assessed identifier
        :param logger: name of the logger to write messages to
        """
        self.client_id = client
        self.pid_scheme = pidscheme
        self.re3metadata_raw = None
        self.repository_name = None
        self.repository_url = None
        self.repo_apis = {}
        self.repo_standards = []
        self.logger = logging.getLogger(logger)
        #print(__name__)

    def lookup_re3data(self):
        """Query the re3data API for this client id's repository record and,
        on success, parse the returned repository metadata."""
        if self.client_id and self.pid_scheme:
            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
                self.client_id)  # {client_id,re3doi}
            #print(self.client_id,'Re3DOI',re3doi, idutils.is_doi(re3doi))
            if re3doi:
                if idutils.is_doi(re3doi):
                    short_re3doi = idutils.normalize_pid(
                        re3doi, scheme='doi')  #https://doi.org/10.17616/R3XS37
                else:
                    # not a DOI — treat as if no re3data record exists
                    re3doi = None
            # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
            if re3doi:
                self.logger.info(
                    'FsF-R1.3-01M : Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                try:
                    # NOTE(review): decode().encode() round-trip looks like a
                    # no-op for valid UTF-8; presumably it forces a decoding
                    # error on malformed bytes before parsing — confirm
                    if isinstance(xml, bytes):
                        xml = xml.decode().encode()
                    root = etree.fromstring(xml)
                    #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                    re3link = root.xpath('//link')[0].attrib['href']
                    if re3link is not None:
                        self.logger.info(
                            'FsF-R1.3-01M : Found match re3data metadata record -: ' + str(re3link))
                        # query reposiroty metadata
                        q2 = RequestHelper(url=re3link)
                        q2.setAcceptType(AcceptTypes.xml)
                        re3_source, re3_response = q2.content_negotiate(
                            metric_id='RE3DATA')
                        self.re3metadata_raw = re3_response
                        self.parseRepositoryMetadata()
                except Exception as e:
                    self.logger.warning(
                        'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: ' + str(e))
            else:
                self.logger.warning(
                    'FsF-R1.3-01M : No DOI of client id is available from datacite api')

    def parseRepositoryMetadata(self):
        """Extract name, URL, APIs and metadata standards from the raw
        re3data XML stored in ``self.re3metadata_raw``."""
        #http://schema.re3data.org/3-0/re3data-example-V3-0.xml
        root = etree.fromstring(self.re3metadata_raw)
        # ns = {k: v for k, v in root.nsmap.items() if k}
        name = root.xpath('//r3d:repositoryName', namespaces=RepositoryHelper.ns)
        url = root.xpath('//r3d:repositoryURL', namespaces=RepositoryHelper.ns)
        if name:
            self.repository_name = name[0].text
        if url:
            self.repository_url = url[0].text
        apis = root.xpath('//r3d:api', namespaces=RepositoryHelper.ns)
        for a in apis:
            apiType = a.attrib['apiType']
            # keep only the API types relevant for the assessment
            if apiType in RepositoryHelper.RE3DATA_APITYPES:
                self.repo_apis[a.attrib['apiType']] = a.text
        standards = root.xpath(
            '//r3d:metadataStandard/r3d:metadataStandardName',
            namespaces=RepositoryHelper.ns)
        #we only use the name as the url specified in re3data is dcc-based, e.g., http://www.dcc.ac.uk/resources/metadata-standards/dif-directory-interchange-format
        self.repo_standards = [s.text for s in standards]

    def getRe3MetadataStandards(self):
        """Return the metadata standard names declared in re3data."""
        return self.repo_standards

    def getRe3MetadataAPIs(self):
        """Return the {apiType: endpoint} mapping declared in re3data."""
        return self.repo_apis

    def getRepoNameURL(self):
        """Return the (name, URL) pair of the repository."""
        return self.repository_name, self.repository_url
def main():
    """Test driver with memory profiling: run the full FAIR check suite for
    each PID in ``testpids`` while tracking allocations with tracemalloc.

    Uses the module-level ``testpids``, ``startpid``, ``debug``,
    ``metadata_service_endpoint`` and ``metadata_service_type`` globals.
    """
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']
    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit, metric_specification)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)
    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))
    start = False
    usedatacite = True
    # start allocation tracking before the first assessment
    tracemalloc.start()
    n = 1
    for identifier in testpids:
        print(identifier)
        print(n)
        n += 1
        # skip PIDs until startpid is reached (resume support)
        if identifier == startpid or not startpid:
            start = True
        if start:
            ft = FAIRCheck(uid=identifier,
                           test_debug=debug,
                           metadata_service_url=metadata_service_endpoint,
                           metadata_service_type=metadata_service_type,
                           use_datacite=usedatacite)
            #ft = FAIRCheck(uid=identifier, test_debug=True, use_datacite=usedatacite)
            uid_result, pid_result = ft.check_unique_persistent()
            ft.retrieve_metadata_embedded(ft.extruct_result)
            include_embedded = True
            # the PID check may have to be repeated after embedded metadata
            # retrieval (e.g. when a PID was only found in the page)
            if ft.repeat_pid_check:
                uid_result, pid_result = ft.check_unique_persistent()
            ft.retrieve_metadata_external()
            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            access_level_result = ft.check_data_access_level()
            license_result = ft.check_license()
            relatedresources_result = ft.check_relatedresources()
            check_searchable_result = ft.check_searchable()
            data_content_metadata = ft.check_data_content_metadata()
            data_file_format_result = ft.check_data_file_format()
            community_standards_result = ft.check_community_metadatastandards()
            data_provenance_result = ft.check_data_provenance()
            formal_representation_result = ft.check_formal_metadata()
            semantic_vocabulary_result = ft.check_semantic_vocabulary()
            metadata_preserved_result = ft.check_metadata_preservation()
            standard_protocol_metadata_result = ft.check_standardised_protocol_metadata()
            standard_protocol_data_result = ft.check_standardised_protocol_data()
            results = [
                uid_result, pid_result, core_metadata_result,
                content_identifier_included_result, check_searchable_result,
                access_level_result, formal_representation_result,
                semantic_vocabulary_result, license_result,
                data_file_format_result, data_provenance_result,
                relatedresources_result, community_standards_result,
                data_content_metadata, metadata_preserved_result,
                standard_protocol_data_result, standard_protocol_metadata_result
            ]
            #results=[core_metadata_result,uid_result, pid_result]
            #print(ft.metadata_merged)
            debug_messages = ft.get_log_messages_dict()
            ft.logger_message_stream.flush()
            ft.get_assessment_summary(results)
            # attach the collected debug messages to each metric result
            for res_k, res_v in enumerate(results):
                if ft.isDebug:
                    debug_list = debug_messages.get(res_v['metric_identifier'])
                    #debug_list= ft.msg_filter.getMessage(res_v['metric_identifier'])
                    if debug_list is not None:
                        results[res_k]['test_debug'] = debug_messages.get(
                            res_v['metric_identifier'])
                    else:
                        results[res_k]['test_debug'] = [
                            'INFO: No debug messages received'
                        ]
                else:
                    results[res_k]['test_debug'] = ['INFO: Debugging disabled']
            debug_messages = {}
            print(json.dumps(results, indent=4, sort_keys=True))
            #remove unused logger handlers and filters to avoid memory leaks
            ft.logger.handlers = [ft.logger.handlers[-1]]
            #ft.logger.filters = [ft.logger.filters]
            current, peak = tracemalloc.get_traced_memory()
            print(
                f"Current memory usage is {current / 10 ** 6}MB; Peak was {peak / 10 ** 6}MB"
            )
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('traceback')
            # pick the biggest memory block
            stat = top_stats[0]
            print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
            for line in stat.traceback.format():
                print(line)
            for i, stat in enumerate(snapshot.statistics('filename')[:5], 1):
                print(i, str(stat))
            #preproc.logger.
            gc.collect()
    tracemalloc.stop()
def main():
    """Test driver: run the full FAIR check suite for each PID in the
    module-level ``testpids`` list (resumable via ``startpid``) and print
    the results, including per-metric debug messages, as JSON."""
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path,'config','server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR , METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    data_files_limit = int(config['SERVICE']['data_files_limit'])
    metric_specification = config['SERVICE']['metric_specification']
    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH, data_files_limit,metric_specification)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    preproc.retrieve_science_file_formats(isDebug)
    preproc.retrieve_long_term_file_formats(isDebug)
    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(len(preproc.metadata_standards)))
    start=False
    for identifier in testpids:
        print (identifier)
        # skip PIDs until startpid is reached (resume support)
        if identifier==startpid or not startpid:
            start=True
        if start:
            ft = FAIRCheck(uid=identifier, test_debug=True, use_datacite=False)
            uid_result, pid_result = ft.check_unique_persistent()
            core_metadata_result = ft.check_minimal_metatadata()
            content_identifier_included_result = ft.check_content_identifier_included()
            access_level_result=ft.check_data_access_level()
            license_result = ft.check_license()
            relatedresources_result = ft.check_relatedresources()
            check_searchable_result = ft.check_searchable()
            data_file_format_result=ft.check_data_file_format()
            community_standards_result=ft.check_community_metadatastandards()
            data_provenance_result=ft.check_data_provenance()
            data_content_metadata = ft.check_data_content_metadata()
            formal_representation_result=ft.check_formal_metadata()
            semantic_vocabulary_result =ft.check_semantic_vocabulary()
            metadata_preserved_result = ft.check_metadata_preservation()
            standard_protocol_data_result = ft.check_standardised_protocol_data()
            standard_protocol_metadata_result = ft.check_standardised_protocol_metadata()
            results = [uid_result, pid_result, core_metadata_result, content_identifier_included_result, check_searchable_result, access_level_result, formal_representation_result,semantic_vocabulary_result, license_result, data_file_format_result,data_provenance_result,relatedresources_result,community_standards_result,data_content_metadata,metadata_preserved_result, standard_protocol_data_result,standard_protocol_metadata_result]
            #results=[core_metadata_result,uid_result, pid_result]
            #print(ft.metadata_merged)
            # attach the collected debug messages to each metric result
            for res_k, res_v in enumerate(results):
                if ft.isDebug:
                    debug_list= ft.msg_filter.getMessage(res_v['metric_identifier'])
                    if debug_list is not None:
                        results[res_k]['test_debug'] = ft.msg_filter.getMessage(res_v['metric_identifier'])
                    else:
                        results[res_k]['test_debug'] =['INFO: No debug messages received']
                else:
                    results[res_k]['test_debug'] = ['INFO: Debugging disabled']
            print(json.dumps(results, indent=4, sort_keys=True))
class RepositoryHelper:
    """Resolve a DataCite client id to its re3data record and extract the
    repository's name, URL, APIs and supported metadata standards."""

    # mapping client_id -> re3data DOI, preloaded once via the Preprocessor
    DATACITE_REPOSITORIES = Preprocessor.getRE3repositories()
    ns = {"r3d": "http://www.re3data.org/schema/2-2"}
    # API types of interest in re3data records
    RE3DATA_APITYPES = ['OAI-PMH', 'SOAP', 'SPARQL', 'SWORD', 'OpenDAP']

    def __init__(self, client, pidscheme):
        """
        :param client: DataCite client id of the repository
        :param pidscheme: PID scheme of the assessed identifier
        """
        self.client_id = client
        self.pid_scheme = pidscheme
        self.re3metadata_raw = None
        self.repository_name = None
        self.repository_url = None
        self.repo_apis = {}
        self.repo_standards = []
        self.logger = logging.getLogger(self.__class__.__name__)

    def lookup_re3data(self):
        """Query the re3data API for this client id's repository record and,
        on success, parse the returned repository metadata."""
        if self.client_id and self.pid_scheme:
            # {client_id: re3doi}; .get() returns None for unknown client ids
            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(self.client_id)
            # pid -> clientId -> repo doi -> re3id, then query repository
            # metadata from the re3data api.
            # Bug fix: normalize_pid was previously called before the
            # None-check, so an unknown client id crashed instead of logging
            # the warning below.
            if re3doi:
                short_re3doi = idutils.normalize_pid(
                    re3doi, scheme='doi')  #https://doi.org/10.17616/R3XS37
                self.logger.info('Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                root = etree.fromstring(xml.content)
                #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                re3link = root.xpath('//link')[0].attrib['href']
                if re3link is not None:
                    self.logger.info('Found match re3data metadata record')
                    # query reposiroty metadata
                    q2 = RequestHelper(url=re3link)
                    q2.setAcceptType(AcceptTypes.xml)
                    re3_source, re3_response = q2.content_negotiate(
                        metric_id='RE3DATA')
                    self.re3metadata_raw = re3_response.content
                    self.parseRepositoryMetadata()
            else:
                self.logger.warning(
                    'No DOI of client id is available from datacite api')

    def parseRepositoryMetadata(self):
        """Extract name, URL, APIs and metadata standards from the raw
        re3data XML stored in ``self.re3metadata_raw``."""
        #http://schema.re3data.org/3-0/re3data-example-V3-0.xml
        root = etree.fromstring(self.re3metadata_raw)
        # ns = {k: v for k, v in root.nsmap.items() if k}
        name = root.xpath('//r3d:repositoryName', namespaces=RepositoryHelper.ns)
        url = root.xpath('//r3d:repositoryURL', namespaces=RepositoryHelper.ns)
        if name:
            self.repository_name = name[0].text
        if url:
            self.repository_url = url[0].text
        apis = root.xpath('//r3d:api', namespaces=RepositoryHelper.ns)
        for a in apis:
            apiType = a.attrib['apiType']
            # keep only the API types relevant for the assessment
            if apiType in RepositoryHelper.RE3DATA_APITYPES:
                self.repo_apis[a.attrib['apiType']] = a.text
        standards = root.xpath(
            '//r3d:metadataStandard/r3d:metadataStandardName',
            namespaces=RepositoryHelper.ns)
        self.repo_standards = [s.text for s in standards]

    def getRe3MetadataStandards(self):
        """Return the metadata standard names declared in re3data."""
        return self.repo_standards

    def getRe3MetadataAPIs(self):
        """Return the {apiType: endpoint} mapping declared in re3data."""
        return self.repo_apis

    def getRepoNameURL(self):
        """Return the (name, URL) pair of the repository."""
        return self.repository_name, self.repository_url
def main():
    """Early test driver: run a small set of FAIR checks for a single PID
    and print the results (with debug messages attached) as JSON.

    NOTE(review): ``identifier``, ``oai_pmh`` and ``debug`` are not defined
    in this function — presumably module-level globals; verify they exist
    in the enclosing script.
    """
    config = ConfigParser.ConfigParser()
    my_path = Path(__file__).parent.parent
    ini_path = os.path.join(my_path, 'config', 'server.ini')
    config.read(ini_path)
    YAML_DIR = config['SERVICE']['yaml_directory']
    METRIC_YAML = config['SERVICE']['metrics_yaml']
    METRIC_YML_PATH = os.path.join(my_path, YAML_DIR, METRIC_YAML)
    SPDX_URL = config['EXTERNAL']['spdx_license_github']
    DATACITE_API_REPO = config['EXTERNAL']['datacite_api_repo']
    RE3DATA_API = config['EXTERNAL']['re3data_api']
    METADATACATALOG_API = config['EXTERNAL']['metadata_catalog']
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc = Preprocessor()
    preproc.retrieve_metrics_yaml(METRIC_YML_PATH)
    print('Total metrics defined: {}'.format(preproc.get_total_metrics()))
    isDebug = config.getboolean('SERVICE', 'debug_mode')
    preproc.retrieve_licenses(SPDX_URL, isDebug)
    preproc.retrieve_datacite_re3repos(RE3DATA_API, DATACITE_API_REPO, isDebug)
    preproc.retrieve_metadata_standards(METADATACATALOG_API, isDebug)
    print('Total SPDX licenses : {}'.format(preproc.get_total_licenses()))
    print('Total re3repositories found from datacite api : {}'.format(
        len(preproc.getRE3repositories())))
    print('Total subjects area of imported metadata standards : {}'.format(
        len(preproc.metadata_standards)))
    ft = FAIRCheck(uid=identifier, oai=oai_pmh, test_debug=debug)
    uid_result, pid_result = ft.check_unique_persistent()
    core_metadata_result = ft.check_minimal_metatadata()
    content_identifier_included_result = ft.check_content_identifier_included()
    # NOTE(review): check_searchable_result, license_result and
    # relatedresources_result are computed but not all included in the
    # results list below — confirm intended
    check_searchable_result = ft.check_searchable()
    license_result = ft.check_license()
    relatedresources_result = ft.check_relatedresources()
    results = [
        uid_result, pid_result, core_metadata_result,
        content_identifier_included_result, license_result
    ]
    # put the debug messages at the right place...
    for result_index, result in enumerate(results):
        results[result_index]['test_debug'] = ft.msg_filter.getMessage(
            result.get('metric_identifier'))
    print(json.dumps(results, indent=4, sort_keys=True))