def test_cleanup_desc(self):
    """cleanup_description strips boilerplate prefixes and leaves None alone."""
    r = BareOaiRecord()
    cases = [
        ("International audience ; While price and data…", "While price and data…"),
        (" Abstract: While price and data…", "While price and data…"),
        (None, None),
    ]
    for raw, expected in cases:
        r.description = raw
        r.cleanup_description()
        self.assertEqual(r.description, expected)
def create_paper(self, work):
    """
    Build a BarePaper from *work*, attach its OAI record, then try to
    persist and index it.

    :returns: the saved Paper, or None if conversion failed.
    """
    assert not work.skipped
    # Create paper
    authors, orcids = work.authors_and_orcids
    bare = BarePaper.create(
        work.title,
        authors,
        work.pubdate,
        visible=True,
        affiliations=None,
        orcids=orcids,
    )
    bare.add_oairecord(BareOaiRecord(
        source=self.oai_source,
        identifier=work.api_uri,
        splash_url=work.splash_url,
        pubtype=work.pubtype,
    ))
    try:
        saved = Paper.from_bare(bare)
        saved = self.associate_researchers(saved)
        saved.save()
        saved.update_index()
    except ValueError:
        saved = None
    return saved
def submit_deposit_wrapper(self, *args, **kwargs):
    """
    Wrapper of the submit_deposit method (that should not need to be
    reimplemented). It catches DepositErrors raised in the deposit
    process and adds the logs to its return value.

    :returns: a DepositResult; on failure its status is 'failed' and
        its message explains the reason.
    """
    # Build the notification payload BEFORE the try block, so that the
    # DepositError handler below can never hit an unbound local when it
    # mutates 'paperurl'.
    # Small hack to get notifications
    name = getattr(self.user, 'name', None)
    first_name = getattr(self.user, 'first_name', None)
    last_name = getattr(self.user, 'last_name', None)
    if first_name and last_name:
        name = '%s %s' % (first_name, last_name)
    notification_payload = {
        'name': str(name),
        'repo': self.repository.name,
        'paperurl': self.paper.url,
    }
    try:
        result = self.submit_deposit(*args, **kwargs)
        result.logs = self._logs

        # Create the corresponding OAI record
        if result.splash_url:
            rec = BareOaiRecord(
                source=self.repository.oaisource,
                identifier=('deposition:%d:%s' %
                            (self.repository.id, str(result.identifier))),
                splash_url=result.splash_url,
                pdf_url=result.pdf_url)
            result.oairecord = self.paper.add_oairecord(rec)

        settings.DEPOSIT_NOTIFICATION_CALLBACK(notification_payload)

        # In case that the paper is on user todo list, remove it
        # If it's not on the list, nothing happens here, since m2m field
        self.paper.todolist.remove(self.user)

        return result
    except DepositError as e:
        self.log('Message: ' + e.args[0])
        notification_payload['paperurl'] += ' ' + e.args[0]
        settings.DEPOSIT_NOTIFICATION_CALLBACK(notification_payload)
        return DepositResult(logs=self._logs, status='failed',
                             message=e.args[0])
    except Exception as e:
        # Anything unexpected: log the traceback and report a generic
        # connection failure to the user.
        self.log("Caught exception:")
        self.log('%s: %s' % (type(e), e))  # dropped dead "+ ''" suffix
        self.log(traceback.format_exc())
        return DepositResult(
            logs=self._logs, status='failed',
            message=_('Failed to connect to the repository. '
                      'Please try again later.'))
def create_oairecord(self, record):
    """
    Given one line of the dump (represented as a dict), add it to the
    corresponding paper (if it exists)

    :param record: dict with at least 'doi', 'url' and 'host_type' keys.
    """
    doi = to_doi(record['doi'])
    if not doi:
        return
    prefix = doi.split('/')[0]
    if prefix in free_doi_prefixes:
        return
    paper = Paper.get_by_doi(doi)
    if not paper:
        try:
            paper = Paper.create_by_doi(doi)
        except (MetadataSourceException, ValueError):
            return
    if not paper:
        print('no such paper for doi {doi}'.format(doi=doi))
        return

    url = record['url']
    # just to speed things up a bit...
    if paper.pdf_url == url:
        return

    identifier = 'oadoi:' + url
    source = self.oadoi_source
    if record['host_type'] == 'publisher':
        # Publisher-hosted copies are recorded as CrossRef records.
        url = doi_to_url(doi)
        identifier = doi_to_crossref_identifier(doi)
        source = self.crossref_source

    # BUGFIX(idiom): use a distinct name instead of rebinding the
    # 'record' parameter (the input dict) with the BareOaiRecord.
    bare_record = BareOaiRecord(paper=paper, doi=doi, pubtype=paper.doctype,
                                source=source, identifier=identifier,
                                splash_url=url, pdf_url=record['url'])
    try:
        paper.add_oairecord(bare_record)
        paper.update_availability()
        # TODO re-enable this
        #paper.update_index()
    except (DataError, ValueError):
        print('Record does not fit in the DB')
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper
    """
    identifier = header.identifier()

    # description in oai_dc means abstract: keep the longest candidate.
    description = ""
    for candidate in metadata['description']:
        if len(candidate) > len(description):
            description = candidate
    description = sanitize_html(description)

    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(
        header, metadata, self.oaisource.identifier)

    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]

    # Candidate publication types: raw 'type' values first, then the
    # normalised ones prefixed with 'typenorm:'.
    candidates = list(metadata.get('type', []))
    candidates += ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
    pubtype = next(
        (pt for pt in (OAI_PUBTYPE_TRANSLATIONS.get(c) for c in candidates)
         if pt is not None),
        self.oaisource.default_pubtype)

    # Find the DOI, if any (first translatable URL wins)
    doi = None
    for candidate_url in (metadata['identifier'] + metadata['relation']
                          + metadata['source']):
        if not doi:
            doi = to_doi(candidate_url)

    paper.add_oairecord(BareOaiRecord(
        source=self.oaisource,
        identifier=identifier,
        description=description,
        keywords=keywords,
        contributors=contributors,
        pubtype=pubtype,
        pdf_url=pdf_url,
        splash_url=splash_url,
        doi=doi))
def create_paper(self, data_paper):
    """Create a visible BarePaper from *data_paper* with its ORCID OAI record."""
    assert not data_paper.skipped
    # Create paper
    bare = BarePaper.create(
        data_paper.title,
        data_paper.authors,
        data_paper.pubdate,
        visible=True,
        affiliations=None,
        orcids=data_paper.orcids,
    )
    bare.add_oairecord(BareOaiRecord(
        source=orcid_oai_source(),
        identifier=data_paper.identifier,
        splash_url=data_paper.splash_url,
        pubtype=data_paper.doctype,
    ))
    return bare
def add_oai_record(record, source, paper): """ Add a record (from OAI-PMH) to the given paper """ header = record[0] identifier = header.identifier() # A description is useful curdesc = "" for desc in record[1]._map['description']: if len(desc) > len(curdesc): curdesc = desc curdesc = sanitize_html(curdesc) # Run extractor to find the URLs pdf_url = None splash_url = None if source.identifier: try: extractor = REGISTERED_EXTRACTORS[source.identifier] urls = extractor.extract(record) pdf_url = urls.get('pdf') splash_url = urls.get('splash') except KeyError: print "Warning, invalid extractor for source " + source.name keywords = ' '.join(record[1]._map['subject']) contributors = ' '.join(record[1]._map['contributor'])[:4096] pubtype_list = record[1]._map.get('type') pubtype = None if len(pubtype_list) > 0: pubtype = pubtype_list[0] #pubtype = source.default_pubtype pubtype = PUBTYPE_TRANSLATIONS.get(pubtype, source.default_pubtype) record = BareOaiRecord(source=source, identifier=identifier, description=curdesc, keywords=keywords, contributors=contributors, pubtype=pubtype, pdf_url=pdf_url, splash_url=splash_url) paper.add_oairecord(record)
def create_paper(self, work):
    """Create a BarePaper for *work* and attach its ORCID OAI record."""
    assert not work.skipped
    # Create paper
    authors, orcids = work.authors_and_orcids
    new_paper = BarePaper.create(
        work.title,
        authors,
        work.pubdate,
        visible=True,
        affiliations=None,
        orcids=orcids,
    )
    new_paper.add_oairecord(BareOaiRecord(
        source=orcid_oai_source(),
        identifier=work.api_uri,
        splash_url=work.splash_url,
        pubtype=work.pubtype,
    ))
    return new_paper
def fetch_metadata_from_dois(self, crps, ref_name, orcid_id, dois):
    """
    Resolve each DOI through CrossRef and yield (success, payload) pairs:
    (True, paper) when a paper was saved, (False, metadata) otherwise.
    """
    for metadata in fetch_dois(dois):
        # The whole unit of work stays inside one try: any malformed
        # metadata is reported as a failure rather than aborting the run.
        try:
            authors = map(convert_to_name_pair, metadata['author'])
            affiliations = affiliate_author_with_orcid(
                ref_name, orcid_id, authors)
            paper = crps.save_doi_metadata(metadata, affiliations)
            if not paper:
                yield False, metadata
                continue
            oai_record = BareOaiRecord(
                source=orcid_oai_source,
                identifier='orcid:%s:%s' % (orcid_id, metadata['DOI']),
                splash_url='http://%s/%s' % (settings.ORCID_BASE_DOMAIN,
                                             orcid_id),
                pubtype=paper.doctype)
            paper.add_oairecord(oai_record)
            yield True, paper
        except (KeyError, ValueError, TypeError):
            yield False, metadata
def create_paper(self, data_paper):
    """Create a 'VISIBLE' BarePaper from *data_paper* and attach its ORCID record."""
    assert not data_paper.skipped
    # Create paper
    bare = BarePaper.create(
        data_paper.title,
        data_paper.authors,
        data_paper.pubdate,
        'VISIBLE',
        data_paper.affiliations,
    )
    bare.add_oairecord(BareOaiRecord(
        source=orcid_oai_source,
        identifier=data_paper.identifier,
        splash_url=data_paper.splash_url,
        pubtype=data_paper.doctype,
    ))
    return bare
def to_paper(cls, data):
    """
    Call this function to convert citeproc metadata into a paper object

    Our strategy is as follows: We collect all necessary data first; if we
    miss something, we raise CiteprocError. Once everything is collected,
    we pass it to the corresponding baremodels.
    :param data: citeproc metadata. Note that CrossRef does put its citeproc into a message block
    :returns: Paper object
    :raises: CiteprocError
    """
    if not isinstance(data, dict):
        # BUGFIX: error message said 'Invalid metadaformat'.
        raise CiteprocError('Invalid metadata format, expecting dict')

    bare_paper_data = cls._get_paper_data(data)
    bare_oairecord_data = cls._get_oairecord_data(data)

    bare_paper = BarePaper.create(**bare_paper_data)
    bare_oairecord = BareOaiRecord(paper=bare_paper, **bare_oairecord_data)
    bare_paper.add_oairecord(bare_oairecord)
    bare_paper.update_availability()

    paper = Paper.from_bare(bare_paper)
    paper.update_index()
    return paper
def _create_publication(paper, metadata):
    """
    Build a journal BareOaiRecord for *paper* out of citeproc-style
    *metadata* and attach it.

    :returns: (paper, record) on success, or None when the metadata is
        unusable (missing or without a container-title).
    """
    if not metadata:
        return
    if not metadata.get('container-title'):
        return
    doi = to_doi(metadata.get('DOI', None))

    title = metadata['container-title']
    if isinstance(title, list):
        title = title[0]
    title = title[:512]

    issn = metadata.get('ISSN', None)
    if issn and isinstance(issn, list):
        issn = issn[0]  # TODO pass all the ISSN to the RoMEO interface

    volume = metadata.get('volume', None)
    pages = metadata.get('page', None)
    issue = metadata.get('issue', None)

    date_dict = metadata.get('issued', dict())
    pubdate = None
    if 'date-parts' in date_dict:
        dateparts = date_dict.get('date-parts')[0]
        pubdate = date_from_dateparts(dateparts)
        # for instance it outputs dates like 2014-2-3

    publisher_name = metadata.get('publisher', None)
    if publisher_name:
        publisher_name = publisher_name[:512]

    pubtype = metadata.get('type', 'unknown')
    pubtype = CROSSREF_PUBTYPE_ALIASES.get(pubtype, pubtype)

    splash_url = doi_to_url(doi)

    # PDF availability
    pdf_url = None
    # 'lic' rather than 'license' to avoid shadowing the builtin.
    licenses = set([(lic or {}).get('URL')
                    for lic in metadata.get('license', [])])
    # BUGFIX: to_doi may return None, in which case doi.split('/') crashed.
    doi_prefix = doi.split('/')[0] if doi else None
    if doi_prefix in free_doi_prefixes or any(map(is_oa_license, licenses)):
        pdf_url = splash_url

    # Lookup journal
    search_terms = {'jtitle': title}
    if issn:
        search_terms['issn'] = issn
    journal = fetch_journal(search_terms)

    publisher = None
    if journal:
        publisher = journal.publisher
        AliasPublisher.increment(publisher_name, journal.publisher)
    else:
        publisher = fetch_publisher(publisher_name)

    barepub = BareOaiRecord(
        paper=paper,
        journal_title=title,
        issue=issue,
        volume=volume,
        pubdate=pubdate,
        pages=pages,
        doi=doi,
        pubtype=pubtype,
        publisher_name=publisher_name,
        journal=journal,
        publisher=publisher,
        pdf_url=pdf_url,
        splash_url=splash_url,
        source=OaiSource.objects.get(identifier='crossref'),
        identifier=doi_to_crossref_identifier(doi))
    rec = paper.add_oairecord(barepub)
    paper.update_availability()
    return paper, rec
def fetch_orcid_records(self, id, profile=None, use_doi=True):
    """
    Queries ORCiD to retrieve the publications associated with a given ORCiD.
    It also fetches such papers from the CrossRef search interface.

    :param profile: The ORCID profile if it has already been fetched before (format: parsed JSON).
    :param use_doi: Fetch the publications by DOI when we find one (recommended, but slow)
    :returns: a generator, where all the papers found are yielded. (some of them could be in free form, hence not imported)
    """
    crps = CrossRefPaperSource(self.ccf)

    # Cleanup iD:
    id = validate_orcid(id)
    if id is None:
        raise MetadataSourceException('Invalid ORCiD identifier')

    # Get ORCiD profile
    try:
        if profile is None:
            profile = OrcidProfile(id=id)
        else:
            profile = OrcidProfile(json=profile)
    except MetadataSourceException as e:
        print e
        return

    # Reference name
    ref_name = profile.name
    # curl -H "Accept: application/orcid+json" 'http://pub.orcid.org/v1.2/0000-0002-8612-8827/orcid-works' -L -i
    dois = []  # list of DOIs to fetch
    papers = []  # list of papers created
    records_found = 0  # how many records did we successfully import from the profile?

    # Fetch publications
    pubs = jpath('orcid-profile/orcid-activities/orcid-works/orcid-work',
                 profile, [])
    for pub in pubs:

        def j(path, default=None):
            # Shorthand: JSON-path lookup scoped to the current publication.
            return jpath(path, pub, default)

        # DOI
        doi = None
        for extid in j(
                'work-external-identifiers/work-external-identifier', []):
            if extid.get('work-external-identifier-type') == 'DOI':
                doi = to_doi(
                    jpath('work-external-identifier-id/value', extid))
        if doi:
            # If a DOI is available, create the paper using metadata from CrossRef.
            # We don't do it yet, we only store the DOI, so that we can fetch them
            # by batch later.
            dois.append(doi)
        if doi and use_doi:
            continue

        # Extract information from ORCiD
        # Title
        title = j('work-title/title/value')
        if title is None:
            # NOTE(review): despite the "Skipping" wording there is no
            # 'continue' here, so a None title still reaches
            # BarePaper.create below — confirm this is intended.
            print "Warning: Skipping ORCID publication: no title"

        # Type
        doctype = orcid_to_doctype(j('work-type', 'other'))

        # Contributors (ignored for now as they are very often not present)
        def get_contrib(js):
            return {
                'orcid': jpath('contributor-orcid', js),
                'name': jpath('credit-name/value', js),
            }

        contributors = map(get_contrib,
                           j('work-contributors/contributor', []))
        author_names = filter(lambda x: x is not None,
                              map(lambda x: x['name'], contributors))
        authors = map(parse_comma_name, author_names)
        pubdate = None
        # ORCiD internal id
        identifier = j('put-code')
        affiliations = map(lambda x: x['orcid'], contributors)

        # Pubdate
        year = parse_int(j('publication-date/year/value'), 1970)
        month = parse_int(j('publication-date/month/value'), 01)
        day = parse_int(j('publication-date/day/value'), 01)
        pubdate = None
        try:
            # Build the date in three passes so an invalid month/day falls
            # back to the most precise prefix that validated (year, then
            # year-month) — the except below keeps the last good value.
            pubdate = date(year=year, month=01, day=01)
            pubdate = date(year=year, month=month, day=01)
            pubdate = date(year=year, month=month, day=day)
        except ValueError:
            if pubdate is None:
                print "Invalid publication date in ORCID publication, skipping"
                continue

        # Citation type: metadata format
        citation_format = j('work-citation/work-citation-type')
        print citation_format
        bibtex = j('work-citation/citation')

        if bibtex is not None:
            try:
                entry = parse_bibtex(bibtex)

                if entry.get('author', []) == []:
                    print "Warning: Skipping ORCID publication: no authors."
                    print j('work-citation/citation')
                if not authors:
                    authors = entry['author']
            except ValueError:
                pass

        affiliations = affiliate_author_with_orcid(
            ref_name, id, authors, initial_affiliations=affiliations)
        authors = map(name_lookup_cache.lookup, authors)
        if not authors:
            print "No authors found, skipping"
            continue

        # Create paper:
        paper = BarePaper.create(title, authors, pubdate, 'VISIBLE',
                                 affiliations)
        record = BareOaiRecord(source=orcid_oai_source,
                               identifier=identifier,
                               splash_url='http://orcid.org/' + id,
                               pubtype=doctype)
        paper.add_oairecord(record)
        yield paper

    if use_doi:
        # Papers found through CrossRef's search interface for this ORCID.
        for metadata in crps.search_for_dois_incrementally(
                '', {'orcid': id}):
            try:
                paper = crps.save_doi_metadata(metadata)
                if paper:
                    yield paper
            except ValueError as e:
                print "Saving CrossRef record from ORCID failed: %s" % unicode(
                    e)

        # Now we add the DOIs found in the ORCID profile.
        doi_metadata = fetch_dois(dois)
        for metadata in doi_metadata:
            try:
                authors = map(convert_to_name_pair, metadata['author'])
                affiliations = affiliate_author_with_orcid(
                    ref_name, id, authors)
                paper = crps.save_doi_metadata(metadata, affiliations)
                if not paper:
                    continue
                record = BareOaiRecord(source=orcid_oai_source,
                                       identifier='orcid:' + id + ':' +
                                       metadata['DOI'],
                                       splash_url='http://orcid.org/' + id,
                                       pubtype=paper.doctype)
                paper.add_oairecord(record)
                yield paper
            except (KeyError, ValueError, TypeError):
                pass
def create_oairecord(self, record, update_index=True, create_missing_dois=True):
    """
    Given one line of the dump (represented as a dict), add it to the
    corresponding paper (if it exists)

    :param record: dict parsed from one line of the dump, with 'doi' and
        'oa_locations' keys.
    :param update_index: reindex the paper after updating availability.
    :param create_missing_dois: create the paper via CrossRef if we do
        not know it yet.
    """
    doi = to_doi(record['doi'])
    if not doi:
        return
    prefix = doi.split('/')[0]
    if prefix in free_doi_prefixes:
        return
    if not record.get('oa_locations'):
        return
    paper = Paper.get_by_doi(doi)
    if not paper:
        if not create_missing_dois:
            return
        try:
            paper = Paper.create_by_doi(doi)
        except (MetadataSourceException, ValueError):
            return
    if not paper:
        logger.info('no such paper for doi {doi}'.format(doi=doi))
        return

    logger.info(doi)
    paper.cache_oairecords()
    for oa_location in record.get('oa_locations') or []:
        url = oa_location['url']
        # just to speed things up a bit...
        if paper.pdf_url == url:
            return
        identifier = 'oadoi:' + url
        source = self.oadoi_source
        if oa_location['host_type'] == 'publisher':
            url = doi_to_url(doi)
            identifier = doi_to_crossref_identifier(doi)
            source = self.crossref_source
        # BUGFIX(fragility): do not rebind the 'record' parameter inside
        # the loop — it shadowed the input dict after the first iteration.
        bare_record = BareOaiRecord(
            paper=paper, doi=doi, pubtype=paper.doctype, source=source,
            identifier=identifier, splash_url=url,
            pdf_url=oa_location['url'])
        try:
            # We disable checks by DOI since we know the paper has been looked up by DOI already.
            old_pdf_url = paper.pdf_url
            paper.add_oairecord(bare_record, check_by_doi=False)
            super(Paper, paper).update_availability()
            if old_pdf_url != paper.pdf_url:
                paper.save()
            if update_index:
                paper.update_index()
        except (DataError, ValueError):
            logger.warning('Record does not fit in the DB')