def get_object(self):
    """Resolve the Paper for this view from a ``pk`` or ``doi`` URL kwarg.

    Falls back to creating the paper from CrossRef metadata when the DOI
    is not known locally.

    :raises Http404: when no (non-orphan) paper can be found or created
    """
    queryset = self.get_queryset()
    pk = self.kwargs.get('pk', None)
    doi = self.kwargs.get('doi', None)
    if doi:
        doi = to_doi(doi)
    paper = None
    try:
        if pk is not None:
            paper = queryset.get(pk=pk)
        elif doi is not None:
            paper = Paper.get_by_doi(doi)
        else:
            raise AttributeError("Paper view expects a DOI or a pk")
    except ObjectDoesNotExist:
        pass
    # Bug fix: only attempt on-the-fly creation when a DOI is available.
    # Previously, a failed pk lookup (doi is None) called create_by_doi(None).
    if not paper and doi:
        paper = Paper.create_by_doi(doi)
    if paper is None or paper.is_orphan():
        raise Http404(
            _("No %(verbose_name)s found matching the query") %
            {'verbose_name': Paper._meta.verbose_name})
    return paper
def _post_filter(self, urls):
    """Post-process the extracted URL dict, inferring a pdf URL when possible."""
    # An '1' open-access flag means the splash page is freely readable.
    if '1' in self.metadata.get('oa', []):
        urls['pdf'] = urls.get('splash')
    # Special case for PMC as their metadata includes other urls
    if pmc_id_re.match(self.header.identifier()):
        # rationale : PMC urls are prioritary
        # but PMID urls can be used when no PMC url is provided
        # (because we know they link to PMC eventually, from the
        # identifier)
        best_url = None
        for candidate in self.metadata.get('identifier', []):
            if pmc_url_re.match(candidate):
                best_url = candidate
            elif best_url is None and pmid_url_re.match(candidate):
                best_url = candidate
        urls['splash'] = best_url
        urls['pdf'] = best_url
    # Special case for DOIs whose prefix is known to be free to read
    splash = urls.get('splash')
    if splash:
        doi = to_doi(splash)
        if doi and doi.split('/')[0] in free_doi_prefixes:
            urls['pdf'] = urls['splash']
    return urls
def get_object(self, queryset=None):
    """Fetch the visible Paper designated by the 'pk' or 'doi' URL kwarg.

    :raises Http404: when the paper is missing, orphaned or deleted
    :returns: the paper, with its OAI records prefetched
    """
    if queryset is None:
        queryset = self.get_queryset()
    pk = self.kwargs.get('pk', None)
    doi = self.kwargs.get('doi', None)
    if doi:
        doi = to_doi(unquote(doi))
    paper = None
    try:
        if pk is not None:
            paper = queryset.get(pk=pk)
        elif doi is not None:
            paper = Paper.get_by_doi(doi)
        else:
            raise Http404(_("Paper view expects a DOI or a pk"))
    except ObjectDoesNotExist:
        pass
    if paper is None or paper.is_orphan():
        raise Http404(
            _("No %(verbose_name)s found matching the query") %
            {'verbose_name': Paper._meta.verbose_name})
    if not paper.visible:
        raise Http404(_("This paper has been deleted."))
    # Re-fetch with the related OAI records to avoid per-record queries later.
    return queryset.prefetch_related('oairecord_set').get(pk=paper.pk)
def get_object(self, queryset=None):
    """Resolve the Paper for this view from a ``pk`` or ``doi`` URL kwarg.

    Falls back to creating the paper from CrossRef metadata when the DOI
    is not known locally.

    :raises Http404: when no visible, non-orphan paper can be found or created
    """
    if queryset is None:
        queryset = self.get_queryset()
    pk = self.kwargs.get('pk', None)
    doi = self.kwargs.get('doi', None)
    if doi:
        doi = unquote(doi)
        doi = to_doi(doi)
    paper = None
    try:
        if pk is not None:
            paper = queryset.get(pk=pk)
        elif doi is not None:
            paper = Paper.get_by_doi(doi)
        else:
            raise Http404(_("Paper view expects a DOI or a pk"))
    except ObjectDoesNotExist:
        pass
    # Bug fix: only attempt on-the-fly creation when a DOI is available.
    # Previously, a failed pk lookup (doi is None) called create_by_doi(None).
    if not paper and doi:
        paper = Paper.create_by_doi(doi)
    if paper is None or paper.is_orphan():
        raise Http404(
            _("No %(verbose_name)s found matching the query") %
            {'verbose_name': Paper._meta.verbose_name})
    if not paper.visible:
        raise Http404(_("This paper has been deleted."))
    return paper
def _get_doi(data):
    """
    Extract and normalize the DOI from citeproc metadata.

    :param data: citeproc metadata (dict)
    :returns: the normalized DOI
    :raises CiteprocDOIError: if the 'DOI' field is missing or invalid

    Note: the previous docstring claimed this could return None; it never
    does — an invalid DOI always raises.
    """
    doi = to_doi(data.get('DOI', ''))
    if doi is None:
        raise CiteprocDOIError('Invalid DOI in metadata')
    return doi
def dois(self):
    """Return the normalized DOIs found among this work's external identifiers."""
    found = []
    for ext_id in self.j('work-external-identifiers/work-external-identifier', []):
        if ext_id.get('work-external-identifier-type') != 'DOI':
            continue
        normalized = to_doi(jpath('work-external-identifier-id/value', ext_id))
        # If a DOI is available, the paper will be created using metadata
        # from CrossRef. We don't do it yet, we only store the DOI, so
        # that we can fetch them by batch later.
        if normalized:
            found.append(normalized)
    return found
def doi(self):
    """
    Returns the DOI of this publication, if any.
    """
    # NOTE(review): ORCID's v2 API commonly reports the relationship in
    # lowercase ('self') — confirm that 'SELF' matches this data source.
    for ext in jpath('external-ids/external-id', self.json, []):
        if ext.get('external-id-type') != 'doi':
            continue
        if ext.get('external-id-relationship') != 'SELF':
            continue
        raw_value = ext.get('external-id-value')
        if not raw_value:
            continue
        normalized = to_doi(raw_value)
        if normalized:
            return normalized
    return None
def redirect_by_doi(request, doi):
    """
    This view is inherited from doai.io, migrated to this code base to
    preserve the existing behaviour. We could instead redirect to unpaywall,
    but that would not include ResearchGate urls.
    """
    doi = to_doi(unquote(doi))
    if not doi:
        raise Http404(_("Invalid DOI."))
    # Prefer the full text we already know about; otherwise fall back
    # on the standard DOI resolver.
    paper = Paper.get_by_doi(doi)
    if paper and paper.pdf_url:
        return HttpResponsePermanentRedirect(paper.pdf_url)
    return HttpResponsePermanentRedirect(doi_to_url(doi))
def create_oairecord(self, record):
    """
    Given one line of the dump (represented as a dict),
    add it to the corresponding paper (if it exists)
    """
    doi = to_doi(record['doi'])
    if not doi:
        return
    # DOIs with a free prefix are already covered by the CrossRef source.
    if doi.split('/')[0] in free_doi_prefixes:
        return
    paper = Paper.get_by_doi(doi)
    if not paper:
        try:
            paper = Paper.create_by_doi(doi)
        except (MetadataSourceException, ValueError):
            return
    if not paper:
        print('no such paper for doi {doi}'.format(doi=doi))
        return
    url = record['url']
    # just to speed things up a bit...
    if paper.pdf_url == url:
        return
    identifier = 'oadoi:' + url
    source = self.oadoi_source
    if record['host_type'] == 'publisher':
        # Publisher-hosted copies are recorded as CrossRef records instead.
        url = doi_to_url(doi)
        identifier = doi_to_crossref_identifier(doi)
        source = self.crossref_source
    bare_record = BareOaiRecord(
        paper=paper, doi=doi, pubtype=paper.doctype, source=source,
        identifier=identifier, splash_url=url, pdf_url=record['url'])
    try:
        paper.add_oairecord(bare_record)
        paper.update_availability()
        # TODO re-enable this
        # paper.update_index()
    except (DataError, ValueError):
        print('Record does not fit in the DB')
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper
    """
    identifier = header.identifier()
    # description in oai_dc means abstract: keep the longest one
    longest_desc = ""
    for candidate in metadata['description']:
        if len(candidate) > len(longest_desc):
            longest_desc = candidate
    longest_desc = sanitize_html(longest_desc)
    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(
        header, metadata, self.oaisource.identifier)
    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]
    # Translate the record's publication type, falling back on the
    # source's default when nothing matches.
    typenorms = ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
    pubtype = None
    for raw_pubtype in metadata.get('type', []) + typenorms:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype
    # Find the DOI, if any
    doi = None
    for url in metadata['identifier'] + metadata['relation'] + metadata['source']:
        if not doi:
            doi = to_doi(url)
    record = BareOaiRecord(
        source=self.oaisource,
        identifier=identifier,
        description=longest_desc,
        keywords=keywords,
        contributors=contributors,
        pubtype=pubtype,
        pdf_url=pdf_url,
        splash_url=splash_url,
        doi=doi)
    paper.add_oairecord(record)
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper
    """
    identifier = header.identifier()
    # description in oai_dc means abstract; retain the longest one
    abstract = ""
    for desc in metadata['description']:
        if len(desc) > len(abstract):
            abstract = desc
    abstract = sanitize_html(abstract)
    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(header, metadata,
                                            self.oaisource.identifier)
    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]
    # Map the raw publication types (raw + normalized) onto ours,
    # defaulting to the source's own pubtype.
    typenorms = ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
    pubtype = None
    for candidate in metadata.get('type', []) + typenorms:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(candidate)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype
    # Find the DOI, if any
    doi = None
    for link in (metadata['identifier'] + metadata['relation']
                 + metadata['source']):
        if not doi:
            doi = to_doi(link)
    paper.add_oairecord(BareOaiRecord(
        source=self.oaisource, identifier=identifier,
        description=abstract, keywords=keywords,
        contributors=contributors, pubtype=pubtype,
        pdf_url=pdf_url, splash_url=splash_url, doi=doi))
def test_to_doi(self):
    """Check that to_doi normalizes various DOI representations."""
    expected = '10.1145/1721837.1721839'
    # NOTE(review): the first two inputs are identical; one of them was
    # probably meant to exercise a different URL scheme (e.g. http:// or
    # dx.doi.org) — confirm and diversify.
    for raw in [
        'https://doi.org/10.1145/1721837.1721839',
        'https://doi.org/10.1145/1721837.1721839',
        '10.1145/1721837.1721839',
        'DOI: 10.1145/1721837.1721839',
        'info:eu-repo/semantics/altIdentifier/doi/10.1145/1721837.1721839',
    ]:
        self.assertEqual(to_doi(raw), expected)
    # DOIs are case-insensitive and get normalized to lowercase
    self.assertEqual(to_doi('10.1093/jhmas/XXXI.4.480'),
                     '10.1093/jhmas/xxxi.4.480')
def get_object(self):
    """Resolve the Paper from a 'pk' or 'doi' URL kwarg, creating it
    from its DOI when the publication is unknown locally.

    :raises Http404: when no matching publication can be found or created
    """
    queryset = self.get_queryset()
    pk = self.kwargs.get('pk', None)
    doi = self.kwargs.get('doi', None)
    if doi:
        doi = to_doi(doi)
    paper = None
    try:
        if pk is not None:
            paper = queryset.filter(pk=pk).get()
        elif doi is not None:
            publi = Publication.objects.get(doi=doi)
            paper = publi.paper
        else:
            raise AttributeError("Paper view expects a DOI or a pk")
    except ObjectDoesNotExist:
        # Bug fix: only fall back to DOI-based creation when a DOI is
        # available; previously a failed pk lookup called
        # create_by_doi(None).
        if doi is not None:
            paper = Paper.create_by_doi(doi)
    if paper is None:
        raise Http404(
            _("No %(verbose_name)s found matching the query") %
            {'verbose_name': Publication._meta.verbose_name})
    return paper
def save_doi_metadata(self, metadata, extra_orcids=None):
    """
    Given the metadata as Citeproc+JSON or from CrossRef, create the
    associated paper and publication

    :param extra_orcids: an optional orcids list, which will be unified
        with the orcids extracted from the metadata. This is useful for
        the ORCID interface.
    :returns: the paper, created if needed
    :raises ValueError: when the metadata is invalid or incomplete
        (no author, no title, no DOI, no publication date)
    """
    # Normalize metadata
    if metadata is None or not isinstance(metadata, dict):
        raise ValueError('Invalid metadata format, expecting a dict')
    if not metadata.get('author'):
        raise ValueError('No author provided')
    if not metadata.get('title'):
        raise ValueError('No title')
    # the upstream function ensures that there is a non-empty title
    if not to_doi(metadata.get('DOI')):
        raise ValueError("No DOI, skipping")
    pubdate = get_publication_date(metadata)
    if pubdate is None:
        raise ValueError('No pubdate')
    title = metadata['title']
    # CrossRef metadata stores titles in lists
    if isinstance(title, list):
        title = title[0]
    subtitle = metadata.get('subtitle')
    if subtitle:
        if isinstance(subtitle, list):
            subtitle = subtitle[0]
        title += ': '+subtitle
    # Every author must be convertible into a (first, last) name pair
    name_pairs = list(map(convert_to_name_pair, metadata['author']))
    if None in name_pairs:
        raise ValueError('Invalid author')
    authors = [BareName.create_bare(first, last)
               for first, last in name_pairs]

    def get_affiliation(author_elem):
        # Return the first named affiliation of this citeproc author, if any
        for dct in author_elem.get('affiliation', []):
            if 'name' in dct:
                return dct['name']

    def get_orcid(author_elem):
        # Return the author's validated ORCID id, or None
        orcid = validate_orcid(author_elem.get('ORCID'))
        if orcid:
            return orcid

    new_orcids = list(map(get_orcid, metadata['author']))
    if extra_orcids:
        # remove the extra_orcids if they already exist on different authors
        set_of_extra_orcids = set(x for x in extra_orcids if x != None)
        new_orcids = [(x if x not in set_of_extra_orcids else None)
                      for x in new_orcids]
        # now do the union
        orcids = [new or old for (old, new) in zip(
            extra_orcids, new_orcids)]
    else:
        orcids = new_orcids
    affiliations = list(map(get_affiliation, metadata['author']))
    paper = BarePaper.create(title, authors, pubdate,
                             visible=True, affiliations=affiliations,
                             orcids=orcids)
    result = create_publication(paper, metadata)
    if result is None:
        # Creating the publication failed!
        # Make sure the paper only appears if it is still associated
        # with another source.
        paper.update_visible()
    else:
        paper = result[0]
    return paper
def _create_publication(paper, metadata):
    """
    Create a CrossRef OAI record for this paper from citeproc metadata,
    looking up the journal and publisher along the way.

    :returns: the (paper, record) pair, or None when the metadata does
        not describe a publication (no container-title)
    """
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))
    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]
    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        # for instance it outputs dates like 2014-2-3
        pubdate = date_from_dateparts(dateparts)
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]
    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)
    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    # Bug fix: to_doi may return None, in which case doi.split('/')
    # raised AttributeError here.
    doi_prefix = doi.split('/')[0] if doi else None
    if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)):
        pdf_url = splash_url
    # Lookup journal
    journal = Journal.find(issn=issn, title=title)
    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = Publisher.find(publisher_name)
    barepub = BareOaiRecord(
        paper=paper, journal_title=title, issue=issue, volume=volume,
        pubdate=pubdate, pages=pages, doi=doi, pubtype=pubtype,
        publisher_name=publisher_name, journal=journal,
        publisher=publisher, pdf_url=pdf_url, splash_url=splash_url,
        source=OaiSource.objects.get(identifier='crossref'),
        identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
def process_records(self, listRecords):
    """
    Import a batch of OAI-PMH records (fetched through the proxy) as papers.

    :param listRecords: iterable of (header, metadata) OAI records
    :returns: a generator yielding the BarePaper created for each record
        that passes the filters
    """
    for record in listRecords:
        metadata = record[1]._map
        authors = get_oai_authors(metadata)

        # Filter the record: skip it unless at least one author is known
        if all(not elem.is_known for elem in authors):
            print "No relevant author, continue"
            continue
        if not 'title' in metadata or metadata['title'] == []:
            continue

        # Find the source: it is encoded in a setSpec carrying a
        # reserved prefix added by the proxy
        sets = record[0].setSpec()
        source_identifier = None
        for s in sets:
            if s.startswith(PROXY_SOURCE_PREFIX):
                source_identifier = s[len(PROXY_SOURCE_PREFIX):]
                break
        source = None
        if source_identifier:
            try:
                source = OaiSource.objects.get(
                    identifier=source_identifier)
            except OaiSource.DoesNotExist:
                pass
        if not source:
            print "Invalid source '" + str(
                source_identifier) + "' from the proxy, skipping"
            continue

        # Find the DOI, if any (first identifier that parses as a DOI wins)
        doi = None
        for identifier in metadata['identifier'] + metadata['relation']:
            if not doi:
                doi = to_doi(identifier)

        # A publication date is necessary
        pubdate = find_earliest_oai_date(record)
        if not pubdate:
            print "No publication date, skipping"
            continue

        print 'Saving record %s' % record[0].identifier()
        paper = BarePaper.create(metadata['title'][0], authors, pubdate)
        if doi:
            # Enrich the paper with CrossRef metadata when a DOI is known
            try:
                metadata = crossref.fetch_metadata_by_DOI(doi)
                crossref.create_publication(paper, metadata)
            except MetadataSourceException as e:
                print(
                    "Warning, metadata source exception while fetching DOI "
                    + doi + ":\n" + unicode(e))
                pass
        if paper is None:
            print "Paper creation failed, skipping"
            continue

        # Save the record
        # TODO: we should check record validity *BEFORE* creating the paper
        try:
            add_oai_record(record, source, paper)
            yield paper
        except ValueError as e:
            print "Warning, OAI record " + record[0].identifier() + \
                " skipped:\n" + unicode(e)
        paper.update_availability()
def process_records(self, listRecords):
    """
    Import a batch of OAI-PMH records (fetched through the proxy) as papers.

    :param listRecords: iterable of (header, metadata) OAI records
    :returns: a generator yielding the BarePaper created for each record
        that passes the filters
    """
    for record in listRecords:
        metadata = record[1]._map
        authors = get_oai_authors(metadata)

        # Filter the record: skip it unless at least one author is known
        if all(not elem.is_known for elem in authors):
            print "No relevant author, continue"
            continue
        if not 'title' in metadata or metadata['title'] == []:
            continue

        # Find the source: encoded in a setSpec with a reserved prefix
        sets = record[0].setSpec()
        source_identifier = None
        for s in sets:
            if s.startswith(PROXY_SOURCE_PREFIX):
                source_identifier = s[len(PROXY_SOURCE_PREFIX):]
                break
        source = None
        if source_identifier:
            try:
                source = OaiSource.objects.get(identifier=source_identifier)
            except OaiSource.DoesNotExist:
                pass
        if not source:
            print "Invalid source '"+str(source_identifier)+"' from the proxy, skipping"
            continue

        # Find the DOI, if any (first identifier that parses as a DOI wins)
        doi = None
        for identifier in metadata['identifier']+metadata['relation']:
            if not doi:
                doi = to_doi(identifier)

        # A publication date is necessary
        pubdate = find_earliest_oai_date(record)
        if not pubdate:
            print "No publication date, skipping"
            continue

        print 'Saving record %s' % record[0].identifier()
        paper = BarePaper.create(metadata['title'][0], authors, pubdate)
        if doi:
            # Enrich the paper with CrossRef metadata when a DOI is known
            try:
                metadata = crossref.fetch_metadata_by_DOI(doi)
                crossref.create_publication(paper, metadata)
            except MetadataSourceException as e:
                print("Warning, metadata source exception while fetching DOI "+doi+":\n"+unicode(e))
                pass
        if paper is None:
            print "Paper creation failed, skipping"
            continue

        # Save the record
        # TODO: we should check record validity *BEFORE* creating the paper
        try:
            add_oai_record(record, source, paper)
            yield paper
        except ValueError as e:
            print "Warning, OAI record "+record[0].identifier()+" skipped:\n"+unicode(e)
        paper.update_availability()
def _create_publication(paper, metadata):
    """
    Create a BarePublication for this paper out of citeproc/CrossRef
    metadata, looking up the journal and publisher along the way.

    :returns: the (paper, publication) pair, or None when the metadata
        does not describe a publication (no container-title)
    """
    if not metadata:
        return
    if not 'container-title' in metadata or not metadata['container-title']:
        return
    # NOTE(review): to_doi may return None here; the concatenation
    # 'http://dx.doi.org/'+doi below would then raise for OA-licensed
    # records without a valid DOI — confirm upstream guarantees a DOI.
    doi = to_doi(metadata.get('DOI',None))
    title = metadata['container-title']
    if type(title) == type([]):
        title = title[0]
    title = title[:512]
    issn = metadata.get('ISSN',None)
    if issn and type(issn) == type([]):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume',None)
    pages = metadata.get('page',None)
    issue = metadata.get('issue',None)
    date_dict = metadata.get('issued',dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        # for instance it outputs dates like 2014-2-3
        pubdate = date_from_dateparts(dateparts)
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]
    pubtype = metadata.get('type','unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    # PDF availability: assume the DOI resolves to the full text when one
    # of the licenses is recognized as open access
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    if any(map(is_oa_license, licenses)):
        pdf_url = 'http://dx.doi.org/'+doi

    # Lookup journal
    search_terms = {'jtitle':title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)
    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BarePublication(title=title, issue=issue, volume=volume,
                              pubdate=pubdate, paper=paper, pages=pages,
                              doi=doi, pubtype=pubtype,
                              publisher_name=publisher_name,
                              journal=journal, publisher=publisher,
                              pdf_url=pdf_url)
    pub = paper.add_publication(barepub)
    # Push the paper's publication date to the later of the two,
    # normalizing datetime vs date before comparing
    cur_pubdate = paper.pubdate
    if type(cur_pubdate) != type(pubdate):
        cur_pubdate = cur_pubdate.date()
    if pubdate and pubdate > cur_pubdate:
        paper.pubdate = pubdate
    paper.update_availability()
    return paper, pub
def save_doi_metadata(self, metadata, extra_orcids=None): """ Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication :param extra_orcids: an optional orcids list, which will be unified with the orcids extracted from the metadata. This is useful for the ORCID interface. :returns: the paper, created if needed """ # Normalize metadata if metadata is None or not isinstance(metadata, dict): raise ValueError('Invalid metadata format, expecting a dict') if not metadata.get('author'): raise ValueError('No author provided') if not metadata.get('title'): raise ValueError('No title') # the upstream function ensures that there is a non-empty title if not to_doi(metadata.get('DOI')): raise ValueError("No DOI, skipping") pubdate = get_publication_date(metadata) if pubdate is None: raise ValueError('No pubdate') title = metadata['title'] # CrossRef metadata stores titles in lists if isinstance(title, list): title = title[0] subtitle = metadata.get('subtitle') if subtitle: if isinstance(subtitle, list): subtitle = subtitle[0] title += ': '+subtitle name_pairs = map(convert_to_name_pair, metadata['author']) if None in name_pairs: raise ValueError('Invalid author') authors = [BareName.create_bare(first, last) for first, last in name_pairs] def get_affiliation(author_elem): for dct in author_elem.get('affiliation', []): if 'name' in dct: return dct['name'] def get_orcid(author_elem): orcid = validate_orcid(author_elem.get('ORCID')) if orcid: return orcid new_orcids = map(get_orcid, metadata['author']) if extra_orcids: orcids = [new or old for (old, new) in zip( extra_orcids, new_orcids)] else: orcids = new_orcids affiliations = map(get_affiliation, metadata['author']) paper = BarePaper.create(title, authors, pubdate, visible=True, affiliations=affiliations, orcids=orcids) result = create_publication(paper, metadata) if result is None: # Creating the publication failed! 
# Make sure the paper only appears if it is still associated # with another source. paper.update_visible() else: paper = result[0] return paper
def create_oairecord(self, record, update_index=True, create_missing_dois=True):
    """
    Given one line of the dump (represented as a dict),
    add it to the corresponding paper (if it exists)

    :param update_index: reindex the paper after updating it
    :param create_missing_dois: create papers via CrossRef when the DOI
        is not known locally
    """
    doi = to_doi(record['doi'])
    if not doi:
        return
    # DOIs with a free prefix are already covered by the CrossRef source.
    if doi.split('/')[0] in free_doi_prefixes:
        return
    locations = record.get('oa_locations')
    if not locations:
        return
    paper = Paper.get_by_doi(doi)
    if not paper:
        if not create_missing_dois:
            return
        try:
            paper = Paper.create_by_doi(doi)
        except (MetadataSourceException, ValueError):
            return
    if not paper:
        logger.info('no such paper for doi {doi}'.format(doi=doi))
        return
    logger.info(doi)
    paper.cache_oairecords()
    for oa_location in locations:
        url = oa_location['url']
        # just to speed things up a bit...
        if paper.pdf_url == url:
            return
        identifier = 'oadoi:' + url
        source = self.oadoi_source
        if oa_location['host_type'] == 'publisher':
            # Publisher-hosted copies are recorded as CrossRef records.
            url = doi_to_url(doi)
            identifier = doi_to_crossref_identifier(doi)
            source = self.crossref_source
        bare_record = BareOaiRecord(
            paper=paper, doi=doi, pubtype=paper.doctype, source=source,
            identifier=identifier, splash_url=url,
            pdf_url=oa_location['url'])
        try:
            # We disable checks by DOI since we know the paper has been
            # looked up by DOI already.
            old_pdf_url = paper.pdf_url
            paper.add_oairecord(bare_record, check_by_doi=False)
            super(Paper, paper).update_availability()
            if old_pdf_url != paper.pdf_url:
                paper.save()
            if update_index:
                paper.update_index()
        except (DataError, ValueError):
            logger.warning('Record does not fit in the DB')
def _create_publication(paper, metadata):
    """
    Create a BarePublication for this paper out of citeproc/CrossRef
    metadata, looking up the journal and publisher along the way.

    :returns: the (paper, publication) pair, or None when the metadata
        does not describe a publication (no container-title)
    """
    if not metadata:
        return
    if not 'container-title' in metadata or not metadata['container-title']:
        return
    # NOTE(review): to_doi may return None; the concatenation
    # 'http://dx.doi.org/' + doi below would then raise for OA-licensed
    # records without a valid DOI — confirm upstream guarantees a DOI.
    doi = to_doi(metadata.get('DOI', None))
    title = metadata['container-title']
    if type(title) == type([]):
        title = title[0]
    title = title[:512]
    issn = metadata.get('ISSN', None)
    if issn and type(issn) == type([]):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        # for instance it outputs dates like 2014-2-3
        pubdate = date_from_dateparts(dateparts)
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]
    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    # PDF availability: assume the DOI resolves to the full text when
    # one of the licenses is recognized as open access
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    if any(map(is_oa_license, licenses)):
        pdf_url = 'http://dx.doi.org/' + doi

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)
    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BarePublication(title=title, issue=issue, volume=volume,
                              pubdate=pubdate, paper=paper, pages=pages,
                              doi=doi, pubtype=pubtype,
                              publisher_name=publisher_name,
                              journal=journal, publisher=publisher,
                              pdf_url=pdf_url)
    pub = paper.add_publication(barepub)
    # Push the paper's publication date to the later of the two,
    # normalizing datetime vs date before comparing
    cur_pubdate = paper.pubdate
    if type(cur_pubdate) != type(pubdate):
        cur_pubdate = cur_pubdate.date()
    if pubdate and pubdate > cur_pubdate:
        paper.pubdate = pubdate
    paper.update_availability()
    return paper, pub
def fetch_orcid_records(self, id, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some
        of them could be in free form, hence not imported)
    """
    crps = CrossRefPaperSource(self.ccf)
    # Cleanup iD:
    id = validate_orcid(id)
    if id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')
    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(id=id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print e
        return
    # Reference name
    ref_name = profile.name
    # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    papers = []  # list of papers created
    records_found = 0  # how many records did we successfully import from the profile?

    # Fetch publications
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:
        def j(path, default=None):
            # Shorthand for a jpath lookup inside the current publication
            return jpath(path, pub, default)

        # DOI
        doi = None
        for extid in j('work-external-identifiers/work-external-identifier', []):
            if extid.get('work-external-identifier-type') == 'DOI':
                doi = to_doi(jpath('work-external-identifier-id/value', extid))
                if doi:
                    # If a DOI is available, create the paper using metadata from CrossRef.
                    # We don't do it yet, we only store the DOI, so that we can fetch them
                    # by batch later.
                    dois.append(doi)
        if doi and use_doi:
            continue

        # Extract information from ORCiD
        # Title
        title = j('work-title/title/value')
        if title is None:
            print "Warning: Skipping ORCID publication: no title"
        # Type
        doctype = orcid_to_doctype(j('work-type', 'other'))

        # Contributors (ignored for now as they are very often not present)
        def get_contrib(js):
            return {
                'orcid':jpath('contributor-orcid', js),
                'name': jpath('credit-name/value', js),
            }
        contributors = map(get_contrib, j('work-contributors/contributor',[]))
        author_names = filter(lambda x: x is not None, map(
            lambda x: x['name'], contributors))
        authors = map(parse_comma_name, author_names)
        pubdate = None
        # ORCiD internal id
        identifier = j('put-code')
        affiliations = map(lambda x: x['orcid'], contributors)
        # Pubdate
        year = parse_int(j('publication-date/year/value'), 1970)
        month = parse_int(j('publication-date/month/value'), 01)
        day = parse_int(j('publication-date/day/value'), 01)
        pubdate = None
        try:
            # Refine the date progressively: keep the most precise
            # variant (year, then year-month, then full date) that is valid
            pubdate = date(year=year, month=01, day=01)
            pubdate = date(year=year, month=month, day=01)
            pubdate = date(year=year, month=month, day=day)
        except ValueError:
            if pubdate is None:
                print "Invalid publication date in ORCID publication, skipping"
                continue

        # Citation type: metadata format
        citation_format = j('work-citation/work-citation-type')
        print citation_format
        bibtex = j('work-citation/citation')

        if bibtex is not None:
            try:
                entry = parse_bibtex(bibtex)
                if entry.get('author', []) == []:
                    print "Warning: Skipping ORCID publication: no authors."
                    print j('work-citation/citation')
                if not authors:
                    authors = entry['author']
            except ValueError:
                pass

        affiliations = affiliate_author_with_orcid(
            ref_name, id, authors, initial_affiliations=affiliations)
        authors = map(name_lookup_cache.lookup, authors)

        if not authors:
            print "No authors found, skipping"
            continue

        # Create paper:
        paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                 affiliations)
        record = BareOaiRecord(
            source=orcid_oai_source,
            identifier=identifier,
            splash_url='http://orcid.org/'+id,
            pubtype=doctype)
        paper.add_oairecord(record)
        yield paper

    if use_doi:
        # Also fetch papers matching this ORCID id through CrossRef search
        for metadata in crps.search_for_dois_incrementally('', {'orcid':id}):
            try:
                paper = crps.save_doi_metadata(metadata)
                if paper:
                    yield paper
            except ValueError as e:
                print "Saving CrossRef record from ORCID failed: %s" % unicode(e)

        # Now we add the DOIs found in the ORCID profile.
        doi_metadata = fetch_dois(dois)
        for metadata in doi_metadata:
            try:
                authors = map(convert_to_name_pair, metadata['author'])
                affiliations = affiliate_author_with_orcid(ref_name, id,
                                                           authors)
                paper = crps.save_doi_metadata(metadata, affiliations)
                if not paper:
                    continue
                record = BareOaiRecord(
                    source=orcid_oai_source,
                    identifier='orcid:'+id+':'+metadata['DOI'],
                    splash_url='http://orcid.org/'+id,
                    pubtype=paper.doctype)
                paper.add_oairecord(record)
                yield paper
            except (KeyError, ValueError, TypeError):
                pass
def _create_publication(paper, metadata):
    """
    Create a CrossRef OAI record for this paper from citeproc metadata,
    looking up the journal and publisher along the way.

    :returns: the (paper, record) pair, or None when the metadata does
        not describe a publication (no container-title)
    """
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))
    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]
    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface
    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)
    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        # for instance it outputs dates like 2014-2-3
        pubdate = date_from_dateparts(dateparts)
    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]
    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)
    splash_url = doi_to_url(doi)
    # PDF availability
    pdf_url = None
    licenses = set([(license or {}).get('URL')
                    for license in metadata.get('license', [])])
    # Bug fix: to_doi may return None, in which case doi.split('/')
    # raised AttributeError here.
    doi_prefix = doi.split('/')[0] if doi else None
    if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)):
        pdf_url = splash_url
    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)
    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)
    barepub = BareOaiRecord(
        paper=paper, journal_title=title, issue=issue, volume=volume,
        pubdate=pubdate, pages=pages, doi=doi, pubtype=pubtype,
        publisher_name=publisher_name, journal=journal,
        publisher=publisher, pdf_url=pdf_url, splash_url=splash_url,
        source=OaiSource.objects.get(identifier='crossref'),
        identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
def fetch_orcid_records(self, id, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched
        before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one
        (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some
        of them could be in free form, hence not imported)
    """
    crps = CrossRefPaperSource(self.ccf)
    # Cleanup iD:
    id = validate_orcid(id)
    if id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')
    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(id=id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print e
        return
    # Reference name
    ref_name = profile.name
    # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    papers = []  # list of papers created
    records_found = 0  # how many records did we successfully import from the profile?

    # Fetch publications
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:
        def j(path, default=None):
            # Shorthand for a jpath lookup inside the current publication
            return jpath(path, pub, default)

        # DOI
        doi = None
        for extid in j(
                'work-external-identifiers/work-external-identifier', []):
            if extid.get('work-external-identifier-type') == 'DOI':
                doi = to_doi(
                    jpath('work-external-identifier-id/value', extid))
                if doi:
                    # If a DOI is available, create the paper using metadata from CrossRef.
                    # We don't do it yet, we only store the DOI, so that we can fetch them
                    # by batch later.
                    dois.append(doi)
        if doi and use_doi:
            continue

        # Extract information from ORCiD
        # Title
        title = j('work-title/title/value')
        if title is None:
            print "Warning: Skipping ORCID publication: no title"
        # Type
        doctype = orcid_to_doctype(j('work-type', 'other'))

        # Contributors (ignored for now as they are very often not present)
        def get_contrib(js):
            return {
                'orcid': jpath('contributor-orcid', js),
                'name': jpath('credit-name/value', js),
            }
        contributors = map(get_contrib,
                           j('work-contributors/contributor', []))
        author_names = filter(lambda x: x is not None,
                              map(lambda x: x['name'], contributors))
        authors = map(parse_comma_name, author_names)
        pubdate = None
        # ORCiD internal id
        identifier = j('put-code')
        affiliations = map(lambda x: x['orcid'], contributors)
        # Pubdate
        year = parse_int(j('publication-date/year/value'), 1970)
        month = parse_int(j('publication-date/month/value'), 01)
        day = parse_int(j('publication-date/day/value'), 01)
        pubdate = None
        try:
            # Refine the date progressively: keep the most precise
            # variant (year, then year-month, then full date) that is valid
            pubdate = date(year=year, month=01, day=01)
            pubdate = date(year=year, month=month, day=01)
            pubdate = date(year=year, month=month, day=day)
        except ValueError:
            if pubdate is None:
                print "Invalid publication date in ORCID publication, skipping"
                continue

        # Citation type: metadata format
        citation_format = j('work-citation/work-citation-type')
        print citation_format
        bibtex = j('work-citation/citation')

        if bibtex is not None:
            try:
                entry = parse_bibtex(bibtex)
                if entry.get('author', []) == []:
                    print "Warning: Skipping ORCID publication: no authors."
                    print j('work-citation/citation')
                if not authors:
                    authors = entry['author']
            except ValueError:
                pass

        affiliations = affiliate_author_with_orcid(
            ref_name, id, authors, initial_affiliations=affiliations)
        authors = map(name_lookup_cache.lookup, authors)

        if not authors:
            print "No authors found, skipping"
            continue

        # Create paper:
        paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                 affiliations)
        record = BareOaiRecord(source=orcid_oai_source,
                               identifier=identifier,
                               splash_url='http://orcid.org/' + id,
                               pubtype=doctype)
        paper.add_oairecord(record)
        yield paper

    if use_doi:
        # Also fetch papers matching this ORCID id through CrossRef search
        for metadata in crps.search_for_dois_incrementally(
                '', {'orcid': id}):
            try:
                paper = crps.save_doi_metadata(metadata)
                if paper:
                    yield paper
            except ValueError as e:
                print "Saving CrossRef record from ORCID failed: %s" % unicode(
                    e)

        # Now we add the DOIs found in the ORCID profile.
        doi_metadata = fetch_dois(dois)
        for metadata in doi_metadata:
            try:
                authors = map(convert_to_name_pair, metadata['author'])
                affiliations = affiliate_author_with_orcid(
                    ref_name, id, authors)
                paper = crps.save_doi_metadata(metadata, affiliations)
                if not paper:
                    continue
                record = BareOaiRecord(source=orcid_oai_source,
                                       identifier='orcid:' + id + ':' +
                                       metadata['DOI'],
                                       splash_url='http://orcid.org/' + id,
                                       pubtype=paper.doctype)
                paper.add_oairecord(record)
                yield paper
            except (KeyError, ValueError, TypeError):
                pass
def save_doi_metadata(self, metadata, extra_affiliations=None,
                      allow_unknown_authors=False):
    """
    Given the metadata as Citeproc+JSON or from CrossRef, create the
    associated paper and publication.

    Validation happens first: the metadata must be a dict carrying at
    least 'author', a non-empty 'title', a non-empty 'DOI', and a
    parseable publication date; any missing piece raises ValueError so
    the caller can skip the record.

    :param extra_affiliations: an optional affiliations list, which
        will be unified with the affiliations extracted from the
        metadata. This is useful for the ORCID interface.
    :param allow_unknown_authors: create the paper even if no author
        matches our researchers
    :returns: the paper, created if needed
    :raises ValueError: when the metadata is malformed or no known
        author is found
    """
    # Normalize metadata
    if metadata is None or type(metadata) != dict:
        if metadata is not None:
            # Non-dict, non-None payload: log what we actually got
            # before rejecting it (Python 2 print statements).
            print "WARNING: Invalid metadata: type is " + str(
                type(metadata))
            print "The doi proxy is doing something nasty!"
        raise ValueError('Invalid metadata format, expecting a dict')
    if not 'author' in metadata:
        raise ValueError('No author provided')
    if not 'title' in metadata or not metadata['title']:
        raise ValueError('No title')
    # the upstream function ensures that there is a non-empty title
    if not 'DOI' in metadata or not metadata['DOI']:
        raise ValueError("No DOI, skipping")
    # NOTE(review): `doi` is assigned but not read again in this block —
    # presumably kept for its normalization side effect or debugging;
    # confirm before removing.
    doi = to_doi(metadata['DOI'])
    pubdate = get_publication_date(metadata)
    if pubdate is None:
        raise ValueError('No pubdate')

    title = metadata['title']
    # CrossRef metadata stores titles in lists
    if type(title) == list:
        title = title[0]
    # Fold an optional subtitle (also possibly list-wrapped) into the
    # title as "title: subtitle".
    subtitle = metadata.get('subtitle')
    if subtitle:
        if type(subtitle) == list:
            subtitle = subtitle[0]
        title += ': ' + subtitle

    # Resolve each metadata author to a researcher via the lookup
    # cache, then drop authors the cache could not resolve.
    # NOTE: Python 2 semantics — map/filter return lists here, so
    # len(), == [] and repeated iteration below are valid.
    authors = map(name_lookup_cache.lookup,
                  map(convert_to_name_pair, metadata['author']))
    authors = filter(lambda x: x != None, authors)
    # Reject the record when nothing matched our researchers (unless
    # explicitly allowed), or when no author survived the filter at all.
    if (not allow_unknown_authors and
            all(not elem.is_known for elem in authors)) or authors == []:
        raise ValueError('No known author')

    def get_affiliation(author_elem):
        # Returns an ORCID id, a plain affiliation name, or (implicitly)
        # None when the author element carries neither.
        # First, look for an ORCID id
        orcid = validate_orcid(author_elem.get('ORCID'))
        if orcid:
            return orcid
        # Otherwise return the plain affiliation, if any
        for dct in author_elem.get('affiliation', []):
            if 'name' in dct:
                return dct['name']

    affiliations = map(get_affiliation,
                       metadata['author'])
    # Merge in caller-supplied affiliations (e.g. from ORCID), keeping
    # whichever value affiliation_is_greater prefers, position by
    # position. Lengths must match for the merge to apply.
    if extra_affiliations and len(affiliations) == len(extra_affiliations):
        for i in range(len(affiliations)):
            if affiliation_is_greater(extra_affiliations[i],
                                      affiliations[i]):
                affiliations[i] = extra_affiliations[i]

    paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                             affiliations)
    result = create_publication(paper, metadata)

    if result is None:
        # Creating the publication failed!
        paper.update_visibility()
        # Make sure the paper only appears if it is still associated
        # with another source.
        # TODO add unit test for this
    else:
        paper = result[0]

    return paper
def save_doi_metadata(self, metadata, extra_affiliations=None, allow_unknown_authors=False): """ Given the metadata as Citeproc+JSON or from CrossRef, create the associated paper and publication :param extra_affiliations: an optional affiliations list, which will be unified with the affiliations extracted from the metadata. This is useful for the ORCID interface. :param allow_unknown_authors: create the paper even if no author matches our researchers :returns: the paper, created if needed """ # Normalize metadata if metadata is None or type(metadata) != dict: if metadata is not None: print "WARNING: Invalid metadata: type is "+str(type(metadata)) print "The doi proxy is doing something nasty!" raise ValueError('Invalid metadata format, expecting a dict') if not 'author' in metadata: raise ValueError('No author provided') if not 'title' in metadata or not metadata['title']: raise ValueError('No title') # the upstream function ensures that there is a non-empty title if not 'DOI' in metadata or not metadata['DOI']: raise ValueError("No DOI, skipping") doi = to_doi(metadata['DOI']) pubdate = get_publication_date(metadata) if pubdate is None: raise ValueError('No pubdate') title = metadata['title'] # CrossRef metadata stores titles in lists if type(title) == list: title = title[0] subtitle = metadata.get('subtitle') if subtitle: if type(subtitle) == list: subtitle = subtitle[0] title += ': '+subtitle authors = map(name_lookup_cache.lookup, map(convert_to_name_pair, metadata['author'])) authors = filter(lambda x: x != None, authors) if (not allow_unknown_authors and all(not elem.is_known for elem in authors)) or authors == []: raise ValueError('No known author') def get_affiliation(author_elem): # First, look for an ORCID id orcid = validate_orcid(author_elem.get('ORCID')) if orcid: return orcid # Otherwise return the plain affiliation, if any for dct in author_elem.get('affiliation', []): if 'name' in dct: return dct['name'] affiliations = map(get_affiliation, 
metadata['author']) if extra_affiliations and len(affiliations) == len(extra_affiliations): for i in range(len(affiliations)): if affiliation_is_greater(extra_affiliations[i],affiliations[i]): affiliations[i] = extra_affiliations[i] paper = BarePaper.create(title, authors, pubdate, 'VISIBLE', affiliations) result = create_publication(paper, metadata) if result is None: # Creating the publication failed! paper.update_visibility() # Make sure the paper only appears if it is still associated # with another source. # TODO add unit test for this else: paper = result[0] return paper