def _dump_common_links(self, obj):
    """Build the links shared by deposits and records.

    Reads ``obj['metadata']`` and, when present, adds badge and resolver
    links for ``doi`` and ``conceptdoi``, plus a ``thumb250`` link for the
    first file whose ``type`` is listed in ``thumbnail_exts``.

    :param obj: Mapping that may carry a ``metadata`` mapping.
    :returns: Dictionary of link names to URLs (possibly empty).
    """
    metadata = obj.get('metadata', {})
    links = {}

    # Both identifiers are handled the same way: a badge plus a DOI URL.
    for field, badge_key in (('doi', 'badge'), ('conceptdoi', 'conceptbadge')):
        value = metadata.get(field)
        if value:
            links[badge_key] = ui_link_for('badge', doi=quote(value))
            links[field] = idutils.to_url(value, 'doi')

    # First previewable image is used for preview; later files are ignored
    # even when generating the thumbnail URL for the first one fails.
    first_image = next(
        (entry for entry in metadata.get('_files', [])
         if entry.get('type') in thumbnail_exts),
        None,
    )
    if first_image is not None:
        try:
            links['thumb250'] = self._thumbnail_url(first_image, 250)
        except RuntimeError:
            pass

    return links
def _dump_common_links(self, obj):
    """Build the links shared by deposits and records (HTTPS DOI URLs).

    Reads ``obj['metadata']`` and, when present, adds badge and ``https``
    resolver links for ``doi`` and ``conceptdoi``, plus a ``thumb250`` link
    for the first file whose ``type`` is listed in ``thumbnail_exts``.

    :param obj: Mapping that may carry a ``metadata`` mapping.
    :returns: Dictionary of link names to URLs (possibly empty).
    """
    metadata = obj.get('metadata', {})
    links = {}

    def add_doi_links(field, badge_key):
        # Adds nothing when the metadata field is missing or empty.
        value = metadata.get(field)
        if not value:
            return
        links[badge_key] = ui_link_for('badge', doi=quote(value))
        links[field] = idutils.to_url(value, 'doi', 'https')

    add_doi_links('doi', 'badge')
    add_doi_links('conceptdoi', 'conceptbadge')

    for entry in metadata.get('_files', []):
        if entry.get('type') not in thumbnail_exts:
            continue
        # First previewable image is used for preview.
        try:
            links['thumb250'] = self._thumbnail_url(entry, 250)
        except RuntimeError:
            pass
        break

    return links
def get_id(self, obj):
    """Return an HTTPS URL for the person's identifier.

    The ORCID is preferred; the GND is used only when no ORCID is set.
    ``missing`` is returned when neither identifier is available.
    """
    for scheme in ('orcid', 'gnd'):
        value = obj.get(scheme)
        if value:
            return idutils.to_url(value, scheme, 'https')
    return missing
def test_to_url():
    """Check URL generation with the default and with the HTTPS URL scheme."""
    for raw, schemes, _normalized, plain_url in identifiers:
        scheme = schemes[0]

        assert idutils.to_url(
            idutils.normalize_pid(raw, scheme), scheme) == plain_url

        if scheme in ['purl', 'url']:
            # If the value is already a URL its scheme is preserved
            secure_url = plain_url
        else:
            secure_url = plain_url.replace('http://', 'https://')

        assert idutils.to_url(
            idutils.normalize_pid(raw, scheme), scheme, url_scheme='https',
        ) == secure_url
def pid_url(identifier, scheme=None):
    """Turn a persistent identifier into a link.

    When ``scheme`` is not given, the first scheme detected by ``idutils``
    is used. An empty string is returned when no scheme can be determined
    or the identifier is empty.
    """
    if scheme is None:
        try:
            scheme = idutils.detect_identifier_schemes(identifier)[0]
        except IndexError:
            # Nothing detected: scheme stays None and no link is produced.
            pass

    if not (scheme and identifier):
        return ""
    return idutils.to_url(identifier, scheme)
def build_id_info(id_):
    """Describe an identifier as a dictionary.

    The result always carries ``ID`` and ``IDScheme``. ``IDURL`` is added
    only when a non-empty URL could be generated; any failure while
    generating it is deliberately ignored (best effort).
    """
    info = {'ID': id_.value, 'IDScheme': id_.scheme}

    try:
        resolved = idutils.to_url(id_.value, id_.scheme)
    except Exception:
        resolved = None

    if resolved:
        info['IDURL'] = resolved
    return info
def dump_links(self, obj):
    """Populate and return the links of a record or deposit.

    Starts from ``obj['links']`` (the same dictionary is updated in place
    when it exists) and adds:

    * ``badge`` and ``doi`` when an application is available and the
      metadata has a DOI;
    * ``bucket`` plus the record API/HTML links when running inside a
      request context.

    URL build errors are ignored so that the remaining links are still
    returned.
    """
    links = obj.get('links', {})
    metadata = obj.get('metadata', {})

    doi = metadata.get('doi')
    if current_app and doi:
        links['badge'] = "{base}/badge/doi/{value}.svg".format(
            base=current_app.config.get('THEME_SITEURL'),
            value=quote(doi),
        )
        links['doi'] = idutils.to_url(doi, 'doi')

    # Everything below needs request information (host, scheme, routing).
    if not has_request_context():
        return links

    deposit = is_deposit(metadata)
    bucket_id = metadata.get('_buckets', {}).get(
        'deposit' if deposit else 'record')
    if deposit:
        # A deposit only links to a record when the '_deposit' data has a pid.
        published = metadata.get('_deposit', {}).get('pid')
        recid = metadata.get('recid') if published else None
        api_key, html_key = 'record', 'record_html'
    else:
        recid = metadata.get('recid')
        api_key, html_key = None, 'html'

    if bucket_id:
        try:
            links['bucket'] = url_for(
                'invenio_files_rest.bucket_api',
                bucket_id=bucket_id,
                _external=True,
            )
        except BuildError:
            pass

    if recid:
        try:
            if api_key:
                links[api_key] = url_for(
                    'invenio_records_rest.recid_item',
                    pid_value=recid,
                    _external=True,
                )
            if html_key:
                endpoint = current_app.config['RECORDS_UI_ENDPOINT']
                links[html_key] = endpoint.format(
                    host=request.host,
                    scheme=request.scheme,
                    pid_value=recid,
                )
        except BuildError:
            pass

    return links
def _dump_common_links(self, obj):
    """Build the DOI links shared by deposits and records.

    For each of ``doi`` and ``conceptdoi`` found in ``obj['metadata']`` a
    badge URL (based on the ``THEME_SITEURL`` setting) and a DOI resolver
    URL are added.

    :param obj: Mapping that may carry a ``metadata`` mapping.
    :returns: Dictionary of link names to URLs (possibly empty).
    """
    metadata = obj.get('metadata', {})
    links = {}

    for field, badge_key in (('doi', 'badge'), ('conceptdoi', 'conceptbadge')):
        value = metadata.get(field)
        if not value:
            continue
        links[badge_key] = "{base}/badge/doi/{value}.svg".format(
            base=current_app.config.get('THEME_SITEURL'),
            value=quote(value))
        links[field] = idutils.to_url(value, 'doi')

    return links
def __init__(self, idstring):
    """Analyse ``idstring`` and record which identifier scheme it matches.

    ``identifier`` and ``normalized_id`` are always set to ``idstring``.
    Further analysis only happens for non-empty strings longer than four
    characters that are not purely numeric:

    * ``idutils`` is asked for matching schemes first; on a match
      ``identifier_schemes``, ``preferred_schema``, ``normalized_id`` and
      ``identifier_url`` are set from it.
    * otherwise ``method`` is set to ``'identifiers.org'`` and the
      ``prefix:suffix`` form is checked against ``IDENTIFIERS_ORG_DATA``.

    ``is_persistent`` is set to ``True`` when the preferred schema is one
    of ``Mapper.VALID_PIDS.value`` or a known identifiers.org prefix.

    :param idstring: The identifier to analyse.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier

    if not (self.identifier and isinstance(self.identifier, str)):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # workaround to recognize https purls
    if 'https://purl.' in self.identifier:
        self.identifier = self.identifier.replace('https:', 'http:')

    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)

    if self.identifier_schemes:
        # preferred schema: drop the generic 'url' match when a more
        # specific scheme was detected as well, e.g. ['doi', 'url']
        if (len(self.identifier_schemes) > 1
                and 'url' in self.identifier_schemes):
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(
            self.identifier, self.preferred_schema)
        self.identifier_url = idutils.to_url(
            self.identifier, self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        idmatch = re.search(generic_identifiers_org_pattern,
                            self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [
                        found_prefix, 'identifiers_org'
                    ]
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (
                        found_prefix.lower() + ':' + found_suffix)

    # NOTE(review): relies on 'preferred_schema' having a default defined
    # elsewhere when no scheme matched -- confirm against the class body.
    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
def _serialize_identifiers(ids, relations=None):
    """Serialize related and alternate identifiers to URLs.

    :param ids: List of related_identifier or alternate_identifier objects.
    :param relations: If given and non-empty, only identifiers whose
        ``relation`` is in this collection are kept.
    :returns: List of ``{'@type': 'CreativeWork', '@id': <url>}``
        dictionaries; entries without a ``scheme`` or for which no URL
        could be generated are left out.
    :rtype: list
    """
    wanted = relations or []
    serialized = []
    for entry in ids:
        if wanted and entry['relation'] not in wanted:
            continue
        if 'scheme' not in entry:
            continue
        url = idutils.to_url(entry['identifier'], entry['scheme'], 'https')
        if url:
            serialized.append({'@type': 'CreativeWork', '@id': url})
    return serialized
def pid_url(identifier, scheme=None, url_scheme="https"):
    """Turn a persistent identifier into a link.

    When ``scheme`` is not given, the first scheme detected by ``idutils``
    is used. Failures during URL generation are logged as warnings on the
    application logger and result in an empty string, as does a missing
    scheme or an empty identifier.
    """
    if scheme is None:
        try:
            scheme = idutils.detect_identifier_schemes(identifier)[0]
        except IndexError:
            # Nothing detected: scheme stays None and no link is produced.
            pass

    link = ""
    try:
        if scheme and identifier:
            link = idutils.to_url(identifier, scheme, url_scheme=url_scheme)
    except Exception:
        current_app.logger.warning(
            "URL generation for identifier {0} failed.".format(identifier),
            exc_info=True)
    return link
def pid_url(identifier, scheme=None, url_scheme='https'):
    """Turn a persistent identifier into a link.

    The scheme is auto-detected through ``idutils`` when not supplied. An
    empty string is returned when there is nothing to link to, or when URL
    generation raises (the failure is logged as a warning).
    """
    if scheme is None:
        try:
            scheme = idutils.detect_identifier_schemes(identifier)[0]
        except IndexError:
            # No scheme detected; fall through with scheme left as None.
            pass

    result = ''
    try:
        if scheme and identifier:
            result = idutils.to_url(
                identifier, scheme, url_scheme=url_scheme)
    except Exception:
        message = 'URL generation for identifier {0} failed.'.format(
            identifier)
        current_app.logger.warning(message, exc_info=True)
    return result
def get_related_identifiers_url(record: Record, doi_prefix: str) -> List[Dict]:
    """Create related identifiers URL.

    Each related identifier is deep-copied and extended with a ``url``
    key, resolved as follows:

    * identifiers that already are URLs are used as-is;
    * DOIs whose prefix equals ``doi_prefix`` point to the local
      ``/records/<suffix>`` page;
    * anything else is resolved through ``idutils.to_url`` over HTTPS;
    * if resolution raises, the raw identifier is used as the URL.

    Args:
        record (Record): Record API Object from where the related
            identifiers will be extracted.

        doi_prefix (str): GEO Knowledge Hub DOI Prefix.

    Returns:
        List[Dict]: List of record related identifiers (with URL resolved)

    Raises:
        KeyError: If an entry lacks ``scheme`` or ``identifier``; both are
            read before the guarded section.

    Note:
        The `doi_prefix` is used to check if the items are managed by the
        GEO Knowledge Hub.
    """
    # extracting related identifiers
    related_identifiers = py_.get(record, "metadata.related_identifiers", [])

    new_related_identifiers = []
    for related_identifier in related_identifiers:
        scheme = related_identifier["scheme"]
        identifier = related_identifier["identifier"]

        related_identifier_obj = py_.set_(
            py_.clone_deep(related_identifier), "url", "")

        try:
            if idutils.is_url(identifier):
                related_identifier_obj["url"] = identifier
            else:
                # checking if the doi is internal
                if idutils.is_doi(identifier):
                    identifier_split = identifier.split("/")
                    if doi_prefix and identifier_split[0] == doi_prefix:
                        related_identifier_obj["url"] = posixpath.join(
                            "/records", identifier_split[1])

                if not related_identifier_obj["url"]:
                    related_identifier_obj["url"] = idutils.to_url(
                        identifier, scheme, "https")
        except Exception:
            # Best effort: fall back to the raw identifier. Only regular
            # errors are handled so that KeyboardInterrupt/SystemExit
            # still propagate.
            related_identifier_obj["url"] = identifier

        new_related_identifiers.append(related_identifier_obj)
    return new_related_identifiers
def check_identifiers(self):
    """Assess ``self.uid`` for metrics FsF-F1-01D and FsF-F1-02D.

    Two result dictionaries are built and appended to ``self.results``:
    one for the identifier-scheme check (FsF-F1-01D) and one for the
    persistent-identifier check (FsF-F1-02D). When a scheme is detected,
    an HTTP GET is issued against the derived URL and, on status 200, the
    landing URL and HTML are stored on the instance. Any exception is
    caught and recorded in ``self.error``.
    """
    uuidresult = {'id': 1, 'metric_id': 'FsF-F1-01D', 'passed': False}
    pidresult = {'id': 2, 'metric_id': 'FsF-F1-02D', 'passed': False}
    try:
        # try to find an identifier schema for the given string
        # NOTE(review): 'id' is used as a module here (it shadows the
        # builtin) -- presumably an alias for idutils; confirm the import.
        foundpids = id.detect_identifier_schemes(self.uid)
        if len(foundpids) > 0:
            # if schema found we have an id which can be found by idutils
            uuidresult['passed'] = True
            uuidresult['output'] = {
                'uuid': self.uid,
                'uuid_schema': foundpids
            }
            # now we check if the schema is listed in our valid pid list
            # in this case it is also a pid
            realpids = [
                value for value in foundpids if value in self.validpids
            ]
            if len(realpids) > 0:
                pidresult['passed'] = True
            if foundpids[0] == 'url':
                self.pid_url = self.uid
            else:
                # we try to find an actionable representation of the pid (URL)
                # NOTE(review): 'pid' is not assigned anywhere in this
                # function -- presumably 'self.uid' was intended; verify.
                # realpids[0] also raises IndexError when no detected
                # scheme is in self.validpids.
                self.pid_url = id.to_url(pid, scheme=realpids[0])
                # we should log here if this fails..
            # Now we try to perform a HTTP GET request
            r = requests.get(self.pid_url)
            if r.status_code == 200:
                if len(realpids) > 0:
                    self.pid = id.normalize_pid(pid, scheme=realpids[0])
                self.landing_url = r.url
                self.landing_html = r.text
                pidresult['output'] = {
                    'pid': self.pid,
                    'resolved_url': self.landing_url,
                    'pid_schema': realpids
                }
            else:
                self.error.append('FsF-F1: HTTP Error: ' + str(r.status_code))
    except BaseException as err:
        # Catches everything, including KeyboardInterrupt and SystemExit.
        self.error.append('FsF-F1: Failed to check the given identifier' +
                          str(err))
    self.results.append(uuidresult)
    self.results.append(pidresult)
def __init__(self, idstring):
    """Analyse ``idstring`` and record which identifier scheme it matches.

    ``identifier`` and ``normalized_id`` are always set to ``idstring``.
    Further analysis only happens for strings longer than four characters
    that are not purely numeric; other input (including ``None`` and
    non-string values) is stored unchanged instead of raising.

    * ``idutils`` is asked for matching schemes first; on a match
      ``identifier_schemes``, ``preferred_schema``, ``normalized_id`` and
      ``identifier_url`` are set from it.
    * otherwise ``method`` is set to ``'identifiers.org'`` and the
      ``prefix:suffix`` form is checked against ``IDENTIFIERS_ORG_DATA``.

    ``is_persistent`` is set to ``True`` when the preferred schema is one
    of ``Mapper.VALID_PIDS.value`` or a known identifiers.org prefix.

    :param idstring: The identifier to analyse.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier

    # len() and isnumeric() below are only meaningful for strings.
    if not isinstance(self.identifier, str):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)

    if self.identifier_schemes:
        # preferred schema: drop the generic 'url' match when a more
        # specific scheme was detected as well, e.g. ['doi', 'url']
        if (len(self.identifier_schemes) > 1
                and 'url' in self.identifier_schemes):
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(
            self.identifier, self.preferred_schema)
        self.identifier_url = idutils.to_url(
            self.identifier, self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        idmatch = re.search(generic_identifiers_org_pattern,
                            self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [
                        found_prefix, 'identifiers_org'
                    ]
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (
                        found_prefix.lower() + ':' + found_suffix)

    # NOTE(review): relies on 'preferred_schema' having a default defined
    # elsewhere when no scheme matched -- confirm against the class body.
    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
class DCATSerializer(object):
    """Serialize records as DCAT by transforming their DataCite XML."""

    def __init__(self, datacite_serializer):
        """Keep the DataCite serializer whose output feeds the XSLT.

        :param datacite_serializer: Object providing ``transform_record``,
            ``transform_search_hit`` and a ``schema`` attribute.
        """
        self.datacite_serializer = datacite_serializer

    @cached_property
    def xslt_transform_func(self):
        """Load the bundled DataCite-to-DCAT stylesheet as an XSLT callable."""
        with resource_stream('zenodo.modules.records',
                             'data/datacite-to-dcat-ap.xsl') as stylesheet:
            return ET.XSLT(ET.XML(stylesheet.read()))

    # Tag template -> callable(file, record) giving the element text.
    # A falsy result means the element is not emitted.
    FILES_FIELDS = {
        '{{{dcat}}}downloadURL': lambda file_, rec: ui_link_for(
            'record_file', id=rec['recid'], filename=file_['key']),
        '{{{dcat}}}mediaType': lambda file_, rec: mimetypes.guess_type(
            file_['key'])[0],
        '{{{dcat}}}byteSize': lambda file_, rec: str(file_['size']),
        '{{{dcat}}}accessURL': lambda file_, rec: idutils.to_url(
            rec['doi'], 'doi', url_scheme='https'),
        # TODO: there's also "spdx:checksum", but it's not in the W3C spec yet
    }

    def _add_files(self, root, files, record):
        """Append one ``dcat:distribution`` element per file to ``root[0]``."""
        nsmap = root.nsmap
        for file_info in files:
            wrapper = ET.SubElement(
                root[0], '{{{dcat}}}distribution'.format(**nsmap))
            distribution = ET.SubElement(
                wrapper, '{{{dcat}}}Distribution'.format(**nsmap))

            for tag_template, getter in self.FILES_FIELDS.items():
                text = getter(file_info, record)
                if not text:
                    continue
                field = ET.SubElement(
                    distribution, tag_template.format(**nsmap))
                field.text = text

    def _etree_tostring(self, root):
        """Render an element tree as a pretty-printed UTF-8 XML string."""
        raw = ET.tostring(
            root,
            pretty_print=True,
            xml_declaration=True,
            encoding='utf-8',
        )
        return raw.decode('utf-8')

    def transform_with_xslt(self, pid, record, search_hit=False, **kwargs):
        """Transform a record (or search hit) into a DCAT element tree.

        :param pid: Persistent identifier instance.
        :param record: Record instance, or a search hit when ``search_hit``
            is true.
        :param search_hit: Whether ``record`` is a search hit carrying its
            data under ``_source``.
        :returns: Root element of the transformed document.
        """
        files_data = None
        if not search_hit:
            dc_record = self.datacite_serializer.transform_record(
                pid, record, **kwargs)
            # for single-record serialization check file read permissions
            if (isinstance(record, Record) and '_files' in record
                    and (not has_request_context()
                         or has_read_files_permission(current_user, record))):
                files_data = record['_files']
        else:
            dc_record = self.datacite_serializer.transform_search_hit(
                pid, record, **kwargs)
            if '_files' in record['_source']:
                files_data = record['_source']['_files']
            elif '_files' in record:
                files_data = record['_files']

        dc_etree = self.datacite_serializer.schema.dump_etree(dc_record)
        default_namespace = self.datacite_serializer.schema.ns[None]
        dc_etree.tag = '{{{0}}}resource'.format(default_namespace)
        dcat_etree = self.xslt_transform_func(dc_etree).getroot()

        # Inject files in results (since the XSLT can't do that by default)
        if files_data:
            source = record['_source'] if search_hit else record
            self._add_files(root=dcat_etree, files=files_data, record=source)

        return dcat_etree

    def serialize(self, pid, record, **kwargs):
        """Serialize a single record.

        :param pid: Persistent identifier instance.
        :param record: Record instance.
        :returns: The DCAT document as a string.
        """
        tree = self.transform_with_xslt(pid, record, **kwargs)
        return self._etree_tostring(tree)

    def serialize_search(self, pid_fetcher, search_result, **kwargs):
        """Serialize a search result.

        :param pid_fetcher: Persistent identifier fetcher.
        :param search_result: Elasticsearch search result.
        :returns: The serialized hits joined by newlines.
        """
        hits = search_result['hits']['hits']
        return '\n'.join(
            self._etree_tostring(
                self.transform_with_xslt(
                    pid_fetcher(hit['_id'], hit['_source']),
                    hit, search_hit=True, **kwargs))
            for hit in hits
        )

    def serialize_oaipmh(self, pid, record):
        """Return the DCAT element tree of a single record for OAI-PMH."""
        return self.transform_with_xslt(pid, record, search_hit=True)
def pid_url(related_identifier):
    """Return the URL of a related identifier, or "" if it cannot be built.

    Both the ``scheme`` and ``identifier`` keys must be present and
    non-empty for a URL to be generated.
    """
    scheme = related_identifier.get('scheme')
    identifier = related_identifier.get('identifier')
    if not (scheme and identifier):
        return ""
    return idutils.to_url(identifier, scheme)
def _serialize_subjects(ids):
    """Serialize subjects to URLs.

    Subjects without a ``scheme`` are skipped; every other subject becomes
    a ``CreativeWork`` entry whose ``@id`` is its HTTPS URL.
    """
    subjects = []
    for subject in ids:
        if 'scheme' not in subject:
            continue
        subjects.append({
            '@type': 'CreativeWork',
            '@id': idutils.to_url(
                subject['identifier'], subject['scheme'], 'https'),
        })
    return subjects
def apply_rule(item, rule):
    """Return a deep copy of ``rule`` with a ``link`` built from ``item``.

    The link is the HTTPS URL of the item's identifier; ``rule`` itself is
    left untouched.
    """
    resolved = copy.deepcopy(rule)
    target = idutils.to_url(item['identifier'], item['scheme'], 'https')
    resolved['link'] = target
    return resolved
def _serialize_subjects(ids):
    """Serialize subjects to URLs.

    Only subjects that carry a ``scheme`` are serialized; each one is
    emitted as a ``CreativeWork`` whose ``@id`` is its HTTPS URL.
    """
    result = []
    for entry in ids:
        if 'scheme' in entry:
            url = idutils.to_url(entry['identifier'], entry['scheme'], 'https')
            result.append({'@type': 'CreativeWork', '@id': url})
    return result
def test_tourl():
    """Check that each normalized test identifier maps to its expected URL."""
    for raw, schemes, _normalized, expected_url in identifiers:
        scheme = schemes[0]
        normalized = idutils.normalize_pid(raw, scheme)
        assert idutils.to_url(normalized, scheme) == expected_url
def access_url(_, record):
    """Return ``(None, attributes)`` pointing at the record's DOI URL.

    The first argument is ignored. The attributes dictionary maps the
    ``rdf:resource`` attribute name (expanded with ``ns``) to the HTTPS
    URL of ``record['doi']``.
    """
    doi_url = idutils.to_url(record['doi'], 'doi', url_scheme='https')
    attributes = {'{{{rdf}}}resource'.format(**ns): doi_url}
    return None, attributes
def evaluate(self):
    """Evaluate metric FsF-F1-02D (identifier persistence).

    Builds a URL from the object identifier, requests it through
    ``RequestHelper`` and, on HTTP 200, records the landing page, parses
    signposting ``Link`` headers and may adopt a ``cite-as`` identifier as
    PID scheme. The test passes when ``self.fuji.pid_scheme`` is set; the
    outcome is stored in ``self.result``, ``self.output`` and
    ``self.score``.
    """
    self.result = Persistence(id=self.fuji.count,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    # NOTE(review): check_url stays unassigned when pid_scheme is None and
    # id_scheme is not 'url'; the RequestHelper call below would then fail.
    if self.fuji.pid_scheme is not None:
        check_url = idutils.to_url(self.fuji.id,
                                   scheme=self.fuji.pid_scheme)
    elif self.fuji.id_scheme == 'url':
        check_url = self.fuji.id

    # ======= RETRIEVE METADATA FROM LANDING PAGE =======
    requestHelper = RequestHelper(check_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.html)  # request
    neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
        'FsF-F1-02D')
    r = requestHelper.getHTTPResponse()
    signposting_pid = None
    if r:
        self.fuji.landing_url = requestHelper.redirect_url
        if r.status == 200:
            # identify signposting links in header
            header_link_string = requestHelper.getHTTPResponse().getheader(
                'Link')
            if header_link_string is not None:
                self.logger.info(
                    'FsF-F1-02D : Found signposting links in response header of landingpage'
                )

                # Each comma-separated entry is split into its target
                # (first part) and ';'-separated rel/type properties.
                for preparsed_link in header_link_string.split(','):
                    found_link = None
                    found_type, type_match = None, None
                    found_rel, rel_match = None, None
                    parsed_link = preparsed_link.strip().split(';')
                    found_link = parsed_link[0].strip()
                    for link_prop in parsed_link[1:]:
                        if str(link_prop).startswith('rel="'):
                            rel_match = re.search('rel=\"(.*?)\"', link_prop)
                        elif str(link_prop).startswith('type="'):
                            type_match = re.search('type=\"(.*?)\"',
                                                   link_prop)
                    if type_match:
                        found_type = type_match[1]
                    if rel_match:
                        found_rel = rel_match[1]
                    # [1:-1] drops the first and last character of the
                    # target (presumably the enclosing '<' and '>').
                    signposting_link_dict = {
                        'url': found_link[1:-1],
                        'type': found_type,
                        'rel': found_rel
                    }
                    if found_link:
                        self.fuji.signposting_header_links.append(
                            signposting_link_dict)
                    '''
                    if found_rel:
                        if self.fuji.signposting_header_links.get(found_rel[1]):
                            self.fuji.signposting_header_links[found_rel[1]].append(found_link[1:-1])
                        else:
                            self.fuji.signposting_header_links[found_rel[1]]=[found_link[1:-1]]
                    '''

            # check if there is a cite-as signposting link
            if self.fuji.pid_scheme is None:
                signposting_pid_link = self.fuji.get_signposting_links(
                    'cite-as')
                if signposting_pid_link:
                    signposting_pid = signposting_pid_link[0].get('url')
                if signposting_pid:
                    # NOTE(review): if 'url' holds a string, [0] selects
                    # only its first character -- confirm what
                    # get_signposting_links returns.
                    found_ids = idutils.detect_identifier_schemes(
                        signposting_pid[0])
                    if len(found_ids) > 1:
                        # NOTE(review): remove() raises ValueError when
                        # 'url' is not among the detected schemes.
                        found_ids.remove('url')
                        found_id = found_ids[0]
                        if found_id in Mapper.VALID_PIDS.value:
                            self.logger.info(
                                'FsF-F1-02D : Found object identifier in signposting header links'
                            )
                            self.fuji.pid_scheme = found_id

            up = urlparse(self.fuji.landing_url)
            self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                uri=up)
            self.fuji.landing_html = requestHelper.getResponseContent()

            self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
            self.output.resolvable_status = True
            self.logger.info(
                'FsF-F1-02D : Object identifier active (status code = 200)'
            )
            self.fuji.isMetadataAccessible = True
        # NOTE(review): the branch above reads r.status while the branches
        # below read r.status_code -- confirm the response type exposes
        # both attributes.
        elif r.status_code in [401, 402, 403]:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "Resource inaccessible, identifier returned http status code: {code}"
                .format(code=r.status_code))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "Resource inaccessible, identifier returned http status code: {code}"
                .format(code=r.status_code))
    else:
        self.fuji.isMetadataAccessible = False
        self.logger.warning(
            "FsF-F1-02D :Resource inaccessible, no response received from: {}"
            .format(check_url))

    if self.fuji.pid_scheme is not None:
        # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
        if signposting_pid is None:
            self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                               scheme=self.fuji.pid_scheme)
        else:
            self.fuji.pid_url = signposting_pid[0]
        self.output.pid_scheme = self.fuji.pid_scheme
        self.result.test_status = 'pass'
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
        if self.fuji.isMetadataAccessible:
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
        self.score.earned = self.total_score  # identifier should be based on a persistence scheme and resolvable
        #print(self.metric_tests)
        self.logger.log(
            self.fuji.LOG_SUCCESS,
            'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                self.fuji.pid_scheme))
        #self.logger.info('FsF-F1-02D : Persistence identifier scheme - {}'.format(self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                self.fuji.id_scheme))

    self.result.score = self.score
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def apply_rule(item, rule):
    """Return a deep copy of ``rule`` with a ``link`` built from ``item``.

    The link is the URL generated for the item's identifier and scheme;
    ``rule`` itself is left untouched.
    """
    resolved = copy.deepcopy(rule)
    target = idutils.to_url(item['identifier'], item['scheme'])
    resolved['link'] = target
    return resolved
def get_doi(self, obj):
    """Return the record's DOI as an HTTPS URL, or ``missing`` without one."""
    metadata = obj['metadata']
    if not metadata.get('doi'):
        return missing
    return idutils.to_url(metadata['doi'], 'doi', 'https')
def test_tourl():
    """Check URL generation for every entry of the identifier fixtures."""
    for value, schemes, _normalized, wanted in identifiers:
        primary = schemes[0]
        generated = idutils.to_url(
            idutils.normalize_pid(value, primary), primary)
        assert generated == wanted
def _serialize(self, value, attr, obj):
    """Serialize a DOI as its URL; ``None`` is passed through unchanged."""
    return None if value is None else idutils.to_url(value, 'doi')
def apply_rule(item, rule):
    """Return a deep copy of ``rule`` extended with the item's URL.

    The URL generated from the item's identifier and scheme is stored
    under ``link``; the ``rule`` passed in is not modified.
    """
    rule_copy = copy.deepcopy(rule)
    url = idutils.to_url(item["identifier"], item["scheme"])
    rule_copy["link"] = url
    return rule_copy