def institution(self):
    """
    The name and identifier of the latest institution associated with
    this researcher.

    Employment affiliations are considered before education ones; the
    first affiliation that has both a name and a country wins.

    :returns: a dict with ``identifier``, ``name`` and ``country`` keys
        (``identifier`` may be None), or None if no affiliation has both
        a name and a country.
    """
    lst = jpath('activities-summary/employments/employment-summary',
                self.json, default=[])
    lst += jpath('activities-summary/educations/education-summary',
                 self.json, default=[])
    for affiliation in lst:
        disamb = jpath('organization/disambiguated-organization',
                       affiliation, default={})
        source = disamb.get('disambiguation-source')
        inst_id = disamb.get('disambiguated-organization-identifier')
        name = jpath('organization/name', affiliation)
        country = jpath('organization/address/country', affiliation)
        identifier = None
        # we skip ringgold identifiers, because they suck:
        # https://github.com/ORCID/ORCID-Source/issues/3297
        if source and inst_id and source.lower() != 'ringgold':
            # str() replaces the Python-2-only unicode() builtin; the
            # rest of this file is Python 3.
            identifier = str(source).lower() + '-' + str(inst_id)
        if name and country:
            return {
                'identifier': identifier,
                'name': name,
                'country': country,
            }
    return None
def homepage(self):
    """
    Extract an URL for that researcher (if any)
    """
    urls = jpath('orcid-profile/orcid-bio/researcher-urls/researcher-url',
                 self.json, default=[])
    # Prefer a URL whose label mentions a home or personal page.
    for entry in urls:
        label = jpath('url-name/value', entry)
        if label is None:
            continue
        lowered = label.lower()
        if 'home' in lowered or 'personal' in lowered:
            return urlize(jpath('url/value', entry))
    # Otherwise fall back to the first listed URL, if there is one.
    if urls:
        return urlize(jpath('url/value', urls[0])) or None
def name(self):
    """
    Returns a parsed version of the "credit name" in the ORCID profile.
    If there is no such name, returns the given and family names on the
    profile (they should exist)
    """
    details = jpath('orcid-profile/orcid-bio/personal-details', self.json)
    credit = jpath('credit-name/value', details)
    # An explicit credit name takes precedence over given/family names.
    if credit is not None:
        return parse_comma_name(credit)
    given = jpath('given-names/value', details, '')
    family = jpath('family-name/value', details, '')
    return (normalize_name_words(given), normalize_name_words(family))
def name(self):
    """
    Returns a parsed version of the "credit name" in the ORCID profile.
    If there is no such name, returns the given and family names on the
    profile (they should exist)
    """
    item = jpath('person/name', self.json)
    credit = jpath('credit-name/value', item)
    # A non-empty credit name takes precedence over given/family names.
    if credit:
        return parse_comma_name(credit)
    return (
        normalize_name_words(jpath('given-names/value', item, '')),
        normalize_name_words(jpath('family-name/value', item, '')),
    )
def homepage(self):
    """
    Extract an URL for that researcher (if any)
    """
    records = jpath('person/researcher-urls/researcher-url',
                    self.json, default=[])
    # Prefer a URL labelled as a home or personal page.
    for record in records:
        label = jpath('url-name', record)
        if label is None:
            continue
        lowered = label.lower()
        if 'home' in lowered or 'personal' in lowered:
            return urlize(jpath('url/value', record))
    # Fall back to the first listed URL, if any.
    if records:
        return urlize(jpath('url/value', records[0])) or None
def fetch_all_records(self, filters=None, cursor="*"):
    """
    Fetches all Crossref records from their API, starting at a given date.

    :param filters: filters as specified by the REST API (as a dictionary)
    :param cursor: the initial cursor where to start the fetching
        (useful to resume failed ingestions)
    :raises MetadataSourceException: on an invalid response or a
        network/HTTP error.
    """
    if filters is None:
        filters = {}
    params = {}
    if filters:
        params['filter'] = ','.join(
            k + ":" + v for k, v in filters.items())
    rows = 100
    next_cursor = cursor
    while next_cursor:
        params['rows'] = rows
        params['cursor'] = next_cursor
        params['mailto'] = settings.CROSSREF_MAILTO
        try:
            r = make_crossref_call('/works', params=params)
            r.raise_for_status()
            js = r.json()
            if js['status'] == 'failed':
                # Fixed typo in the error message ('Crossrsef').
                raise MetadataSourceException(
                    'Querying Crossref with {} failed.'.format(r.url))
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
            next_cursor = jpath('message/next-cursor', js)
            logger.info(
                "Next cursor: {}".format(next_cursor))  # to ease recovery
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'Parameters were: %s\nJSON parser error was: %s' % (
                    urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nError was: ' + str(e))
def other_names(self):
    """
    Returns the list of other names listed on the ORCiD profile.
    This includes the (given,family) name if a credit name was defined.

    :returns: a list of (first, last) name pairs.
    """
    name_item = jpath('orcid-profile/orcid-bio/personal-details', self.json)
    names = []
    credit_name = jpath('credit-name/value', name_item)
    if credit_name is not None:
        # Default to '' so normalize_name_words never receives None when
        # given-names or family-name is missing from the profile.
        names.append(
            (normalize_name_words(jpath('given-names/value', name_item, '')),
             normalize_name_words(jpath('family-name/value', name_item, ''))))
    other_names = jpath('other-names/other-name', name_item, default=[])
    for name in other_names:
        val = name.get('value')
        if val is not None:
            names.append(parse_comma_name(val))
    return names
def other_names(self):
    """
    Returns the list of other names listed on the ORCiD profile.
    This includes the (given,family) name if a credit name was defined.
    """
    person = jpath('person', self.json)
    collected = []
    # When a credit name exists, the given/family pair is an "other" name.
    if jpath('name/credit-name/value', person) is not None:
        given = jpath('name/given-names/value', person, '')
        family = jpath('name/family-name/value', person, '')
        collected.append(
            (normalize_name_words(given), normalize_name_words(family)))
    for entry in jpath('other-names/other-name', person, default=[]):
        content = entry.get('content')
        if content is not None:
            collected.append(parse_comma_name(content))
    return collected
def fetch_all_records(self, filters=None, cursor="*"):
    """
    Fetches all Crossref records from their API, starting at a given date.

    :param filters: filters as specified by the REST API (as a dictionary)
    :param cursor: the initial cursor where to start the fetching
        (useful to resume failed ingestions)
    :raises MetadataSourceException: on an invalid response or a
        network/HTTP error.
    """
    if filters is None:
        filters = {}
    params = {}
    if filters:
        params['filter'] = ','.join(
            k + ":" + v for k, v in filters.items())
    rows = 100
    next_cursor = cursor
    while next_cursor:
        params['rows'] = rows
        params['cursor'] = next_cursor
        params['mailto'] = settings.CROSSREF_MAILTO
        try:
            r = make_crossref_call('/works', params=params)
            r.raise_for_status()
            js = r.json()
            if js['status'] == 'failed':
                # Fixed typo in the error message ('Crossrsef').
                raise MetadataSourceException(
                    'Querying Crossref with {} failed.'.format(r.url))
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
            next_cursor = jpath('message/next-cursor', js)
            logger.info("Next cursor: {}".format(next_cursor))  # to ease recovery
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'Parameters were: %s\nJSON parser error was: %s' % (
                    urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nError was: ' + str(e))
def dois(self):
    """
    Collect the DOIs listed among this work's external identifiers.
    """
    found = []
    ext_ids = self.j(
        'work-external-identifiers/work-external-identifier', [])
    for ext_id in ext_ids:
        if ext_id.get('work-external-identifier-type') != 'DOI':
            continue
        doi = to_doi(jpath('work-external-identifier-id/value', ext_id))
        # If a DOI is available, the paper will be created using metadata
        # from CrossRef. We don't do it yet, we only store the DOI, so
        # that we can fetch them by batch later.
        if doi:
            found.append(doi)
    return found
def other_names(self):
    """
    Returns the list of other names listed on the ORCiD profile.
    This includes the (given,family) name if a credit name was defined.
    """
    details = jpath('orcid-profile/orcid-bio/personal-details', self.json)
    results = []
    # When a credit name exists, record the given/family pair as well.
    if jpath('credit-name/value', details) is not None:
        results.append((
            normalize_name_words(jpath('given-names/value', details, '')),
            normalize_name_words(jpath('family-name/value', details, '')),
        ))
    for entry in jpath('other-names/other-name', details, default=[]):
        value = entry.get('value')
        if value is not None:
            results.append(parse_comma_name(value))
    return results
def doi(self):
    """
    Returns the DOI of this publication, if any.
    """
    for ext_id in jpath('external-ids/external-id', self.json, []):
        is_self_doi = (
            ext_id.get('external-id-type') == 'doi' and
            ext_id.get('external-id-relationship') == 'SELF' and
            ext_id.get('external-id-value'))
        if not is_self_doi:
            continue
        doi = to_doi(ext_id.get('external-id-value'))
        if doi:
            return doi
    return None
def fetch_batch(cls, dois): """ Given a list of DOIs, return for each DOI a paper :params dois: List of DOIS :returns: list with Paper (or None) and DOI as key. Note that the key is lowered! """ # CrossRef allows only certain characters in doi, we just remove them to get better matching dois = list(map(cls.remove_unapproved_characters, dois)) # We create a dict and populate with `None`s and then override with paper objects papers = dict() for doi in dois: papers[doi.lower()] = None # We filter DOIs with comma, we do not batch them, but return them as `None` dois_to_fetch = cls._filter_dois_by_comma(dois) headers = {'User-Agent': settings.CROSSREF_USER_AGENT} url = 'https://api.crossref.org/works' s = requests.Session() while len(dois_to_fetch): dois_batch = dois_to_fetch[:cls.batch_length] dois_to_fetch = dois_to_fetch[cls.batch_length:] params = { 'filter': ','.join(['doi:{}'.format(doi) for doi in dois_batch]), 'mailto': settings.CROSSREF_MAILTO, 'rows': cls.batch_length, } try: r = request_retry( url, params=params, headers=headers, session=s, retries=0, # There is probably a user waiting ) except requests.exceptions.RequestException as e: # We skip the DOIs since we could not reach logger.info(e) continue items = jpath('message/items', r.json(), []) for item in items: try: p = cls.to_paper(item) except CiteprocError: logger.debug(item) else: papers[p.get_doi()] = p p = [papers.get(doi.lower(), None) for doi in dois] return p
def search_for_dois_incrementally(
        self, query, filters=None,
        max_batches=max_crossref_batches_per_researcher):
    """
    Searches for DOIs for the given query and yields their metadata as
    it finds them.

    :param query: the search query to pass to CrossRef
    :param filters: filters as specified by the REST API (dictionary)
    :param max_batches: maximum number of queries to send to CrossRef
    :raises MetadataSourceException: on an invalid response or a
        network error.
    """
    # Avoid a shared mutable default argument (was ``filters={}``);
    # None behaves identically for callers.
    if filters is None:
        filters = {}
    params = {}
    if query:
        params['query'] = query
    if filters:
        # Python 3: tuple-unpacking lambda parameters are a SyntaxError;
        # a generator expression does the same join.
        params['filter'] = ','.join(
            k + ":" + v for k, v in filters.items())
    count = 0
    rows = 20
    offset = 0
    while not max_batches or count < max_batches:
        url = 'http://api.crossref.org/works'
        params['rows'] = rows
        params['offset'] = offset
        try:
            r = requests.get(url, params=params)
            # Python 3 print function (was a Python 2 print statement).
            print("CROSSREF: " + r.url)
            js = r.json()
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'URL was: %s\nParameters were: %s\nJSON parser error was: %s' % (
                    url, urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            # 'request' was an undefined name here (NameError); report
            # the URL that was being fetched instead.
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nUnable to open the URL: ' +
                url + '\nError was: ' + str(e))
        offset += rows
        count += 1
def search_for_dois_incrementally(
        self, query, filters=None,
        max_batches=max_crossref_batches_per_researcher):
    """
    Searches for DOIs for the given query and yields their metadata as
    it finds them.

    :param query: the search query to pass to CrossRef
    :param filters: filters as specified by the REST API (dictionary)
    :param max_batches: maximum number of queries to send to CrossRef
    :raises MetadataSourceException: on an invalid response or a
        network error.
    """
    # Avoid a shared mutable default argument (was ``filters={}``);
    # None behaves identically for callers.
    if filters is None:
        filters = {}
    params = {}
    if query:
        params['query'] = query
    if filters:
        # Python 3: tuple-unpacking lambda parameters are a SyntaxError;
        # a generator expression does the same join.
        params['filter'] = ','.join(
            k + ":" + v for k, v in filters.items())
    count = 0
    rows = 20
    offset = 0
    while not max_batches or count < max_batches:
        url = 'http://api.crossref.org/works'
        params['rows'] = rows
        params['offset'] = offset
        try:
            r = requests.get(url, params=params)
            # Python 3 print function (was a Python 2 print statement).
            print("CROSSREF: " + r.url)
            js = r.json()
            found = False
            for item in jpath('message/items', js, default=[]):
                found = True
                yield item
            if not found:
                break
        except ValueError as e:
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nInvalid response.\n' +
                'URL was: %s\nParameters were: %s\nJSON parser error was: %s' % (
                    url, urlencode(params), str(e)))
        except requests.exceptions.RequestException as e:
            # 'request' was an undefined name here (NameError); report
            # the URL that was being fetched instead.
            raise MetadataSourceException(
                'Error while fetching CrossRef results:\nUnable to open the URL: ' +
                url + '\nError was: ' + str(e))
        offset += rows
        count += 1
def title(self):
    """
    Returns the title of this publication (always provided)
    """
    title_value = jpath('title/title/value', self.json)
    return title_value
def _fetch_day(cls, day):
    """
    Fetches a whole day from CrossRef

    :param day: a date; both 'from-update-date' and 'until-update-date'
        filters are set to this day.
    """
    filters = {
        'from-update-date': day.isoformat(),
        'until-update-date': day.isoformat(),
    }
    params = {
        'filter': ','.join(
            '{}:{}'.format(key, value) for key, value in filters.items()),
        'rows': cls.rows,
        'mailto': settings.CROSSREF_MAILTO,
    }
    url = 'https://api.crossref.org/works'
    headers = {
        'User-Agent': settings.CROSSREF_USER_AGENT,
    }
    s = requests.Session()
    cursor = '*'
    total_results = 0
    loop_runs = 0
    new_papers = 0
    while cursor:
        params['cursor'] = cursor
        r = request_retry(
            url,
            params=params,
            headers=headers,
            session=s,
        )
        # Parse the response body once instead of calling r.json() three
        # times per iteration.
        payload = r.json()
        if cursor == '*':
            total_results = jpath('message/total-results', payload, 0)
            logger.info('Fetch for day: {}, number results: {}'.format(
                day.isoformat(), total_results))
        cursor = jpath('message/next-cursor', payload)
        items = jpath('message/items', payload, [])
        if len(items) == 0:
            # An empty page means we are done, even if a cursor came back.
            cursor = False
        else:
            for item in items:
                try:
                    cls.to_paper(item)
                except CiteprocError:
                    logger.debug(item)
                except ValueError as e:
                    logger.exception(e)
                    logger.info(item)
                else:
                    new_papers += 1
        # Emit a progress message every cls.emit_status_every iterations.
        loop_runs += 1
        if loop_runs % cls.emit_status_every == 0:
            logger.info('Parsed another {} papers. {} more to go'.format(
                cls.rows * cls.emit_status_every,
                total_results - loop_runs * cls.rows))
    logger.info(
        'For day {} have {} paper been added or updated out of {}.'.format(
            day.isoformat(), new_papers, total_results))
def fetch_orcid_records(self, orcid_identifier, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given
    ORCiD. It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded.
        (some of them could be in free form, hence not imported)
    :raises MetadataSourceException: if the ORCiD identifier is invalid.
    """
    cr_api = CrossRefAPI()

    # Cleanup iD:
    orcid_id = validate_orcid(orcid_identifier)
    if orcid_id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(orcid_id=orcid_id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        # Python 3 print function (was a Python 2 `print e` statement,
        # which is a SyntaxError in Python 3).
        print(e)
        return

    # As we have fetched the profile, let's update the Researcher
    self.researcher = Researcher.get_or_create_by_orcid(
        orcid_identifier, profile.json, update=True)
    if not self.researcher:
        return

    # Reference name
    ref_name = profile.name

    # curl -H "Accept: application/orcid+json"
    # 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    ignored_papers = []  # list of ignored papers due to incomplete metadata

    # Fetch publications (1st attempt with ORCiD data)
    # NOTE(review): `profile` is an OrcidProfile instance here, not raw
    # JSON — confirm jpath accepts it (or whether profile.json was meant).
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:
        data_paper = ORCIDDataPaper.from_orcid_metadata(
            ref_name, orcid_id, pub, stop_if_dois_exists=use_doi)
        if not data_paper:
            continue

        if data_paper.dois and use_doi:
            # We want to batch it rather than manually do it.
            dois.extend(data_paper.dois)
            continue

        # If the paper is skipped due to invalid metadata.
        # We first try to reconcile it with local researcher author name.
        # Then, we consider it missed.
        if data_paper.skipped:
            data_paper = self.reconcile_paper(
                ref_name, orcid_id, pub, overrides={
                    'authors': [(self.researcher.name.first,
                                 self.researcher.name.last)]
                })
            if data_paper.skipped:
                print('%s is skipped due to incorrect metadata (%s)' % (
                    data_paper, data_paper.skip_reason))
                ignored_papers.append(data_paper.as_dict())
                continue

        yield self.create_paper(data_paper)

    # 2nd attempt with DOIs and CrossRef
    if use_doi:
        # Let's grab papers from CrossRef
        # for success, paper_or_metadata in self.fetch_crossref_incrementally(cr_api, orcid_id):
        #     if success:
        #         yield paper_or_metadata
        #     else:
        #         ignored_papers.append(paper_or_metadata)
        #         print('This metadata (%s) yields no paper.' %
        #               (paper_or_metadata))

        # Let's grab papers with DOIs found in our ORCiD profile.
        # FIXME(RaitoBezarius): if we fail here, we should get back the pub
        # and yield it.
        for success, paper_or_metadata in self.fetch_metadata_from_dois(
                cr_api, ref_name, orcid_id, dois):
            if success:
                yield paper_or_metadata
            else:
                ignored_papers.append(paper_or_metadata)
                print('This metadata (%s) yields no paper.' %
                      (paper_or_metadata))

    self.warn_user_of_ignored_papers(ignored_papers)
    if ignored_papers:
        print('Warning: Total ignored papers: %d' % (len(ignored_papers)))
def j(self, path, default=None):
    """Shorthand for jpath lookups on this publication's JSON."""
    result = jpath(path, self._pub, default)
    return result
def get_contrib(js):
    """Extract the ORCID iD and credit name from a contributor record."""
    orcid = jpath('contributor-orcid', js)
    credit_name = jpath('credit-name/value', js)
    return {'orcid': orcid, 'name': credit_name}
def test_jpath(self):
    """jpath: defaults for missing keys, nested paths, subtree returns."""
    # Missing key without an explicit default yields None.
    self.assertEqual(jpath('awesome', {}), None)
    # Missing key with a default yields that default.
    self.assertEqual(jpath('awesome', {}, 41), 41)
    # Direct hit on a flat key.
    self.assertEqual(jpath('a', {'a': 'b'}, 41), 'b')
    # '/'-separated paths descend into nested dicts.
    self.assertEqual(jpath('a/b', {'a': {'b': 7}, 'c': None}, 41), 7)
    # A partial path returns the whole subtree.
    self.assertEqual(jpath('a', {'a': {'b': 7}, 'c': None}, 41), {'b': 7})
def j(self, path, default=None):
    """Shorthand for jpath lookups on this record's JSON."""
    result = jpath(path, self.json, default)
    return result