def changeAuthor(request):
    """
    Ajax-style handler updating the first and last name of an Author.

    Returns a response dict, optionally paired with an HTTP status code,
    including details of any paper merge triggered by the fingerprint change.
    """
    response = dict()
    try:
        author = Author.objects.get(pk=request.POST.get('pk'))

        # Sanitize the submitted name parts, when present
        first = request.POST.get('value[first]')
        last = request.POST.get('value[last]')
        if first:
            first = sanitize_html(first)
        if last:
            last = sanitize_html(last)
        if not (first and last):
            return {'message': 'First and last names are required.'}, 403

        name_changed = (author.name.first != first
                        or author.name.last != last)
        if name_changed:
            # Point the author at a (possibly new) Name record
            new_name = Name.lookup_name((first, last))
            new_name.save()
            author.name_id = new_name.pk
            author.save()
            author.paper.invalidate_cache()

        response['status'] = 'OK'
        # Falsy researcher ids are reported as False in the payload
        researcher_id = author.researcher_id or False
        response['value'] = {
            'first': first,
            'last': last,
            'researcher_id': researcher_id,
        }

        # The fingerprint might have changed and might collide with
        # another paper
        merged = author.paper.recompute_fingerprint_and_merge_if_needed()
        response['merged'] = ''
        if merged:
            response['merged'] = merged.pk
            response['merged_title'] = merged.title
        return response
    except ObjectDoesNotExist:
        return response, 404
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is not a parameter of this function --
        # presumably a module-level variable holding the query URL; confirm.
        raise MetadataSourceException(
                'RoMEO did not provide a publisher id.\n' +
                'URL was: ' + request)

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.\n' +
                'URL was: ' + request)

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <alias> element raises IndexError, which escaped. Catch both types
    # with a tuple, as done for the <name> handler above.
    except (KeyError, IndexError):
        pass
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is not a parameter of this function --
        # presumably a module-level variable holding the query URL; confirm.
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <alias> element raises IndexError, which escaped. Catch both types
    # with a tuple, as done for the <name> handler above.
    except (KeyError, IndexError):
        pass
def create(cls, first, last):
    """
    Builds a Name instance without saving it to the database.
    Useful for name lookups where we are not sure we want to keep
    the name in the model.
    """
    name = cls()
    # Truncate, trim and sanitize both parts before storing them
    name.first = sanitize_html(first[:MAX_NAME_LENGTH].strip())
    name.last = sanitize_html(last[:MAX_NAME_LENGTH].strip())
    # The 'full' field is the unaccented concatenation of both parts
    name.full = iunaccent(' '.join([name.first, name.last]))
    return name
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the publisher id or name is missing
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(romeo_id=romeo_id,
                                           name__iexact=name,
                                           alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    # BUG FIX: "except KeyError, IndexError:" is legacy Python 2 syntax that
    # catches only KeyError (binding it to the name IndexError); a missing
    # <homeurl> element raises IndexError, which escaped. Use the tuple
    # form, as the other handlers in this function already do.
    except (KeyError, IndexError):
        pass
def process_ajax_change(request, model, allowedFields):
    """
    Updates one whitelisted field of a model instance from an ajax
    POST request, sanitizing string values before saving.

    Returns a response dict, optionally paired with an HTTP status code.
    """
    response = dict()
    try:
        instance = model.objects.get(pk=request.POST.get('pk'))
        field = request.POST.get('name')
        if field not in allowedFields:
            # Disallowed fields are handled like a missing object
            raise ObjectDoesNotExist
        val = request.POST.get('value')
        # TODO check that 'value' is actually present
        if type(val) == type(''):
            val = sanitize_html(val)
        setattr(instance, field, val)
        instance.save(update_fields=[field])
        if hasattr(instance, "invalidate_cache"):
            instance.invalidate_cache()
        if model == Paper:
            # The edit may have changed the fingerprint, possibly
            # merging this paper into another one
            merged = instance.recompute_fingerprint_and_merge_if_needed()
            response['merged'] = merged.pk if merged else ''
            if merged:
                response['merged_title'] = merged.title
        response['status'] = 'OK'
        response['value'] = val
        return response
    except ObjectDoesNotExist:
        return response, 404
def _get_abstract(data):
    """
    Returns the sanitized abstract from the record,
    or an empty string when no abstract is present.
    """
    return sanitize_html(data.get('abstract', ''))
def process_ajax_change(request, model, allowedFields):
    """
    General function used to change a CharField in a model with ajax

    :param request: the incoming request; 'pk', 'name' and 'value' are
                    read from request.POST
    :param model: the model class the instance belongs to
    :param allowedFields: whitelist of field names that may be edited
    :returns: a response dict, or (dict, 404) when the object is missing
              or the field is not allowed
    """
    response = dict()
    try:
        instance = model.objects.get(pk=request.POST.get('pk'))
        field = request.POST.get('name')
        if field in allowedFields:
            val = request.POST.get('value')
            # TODO check that 'value' is actually present
            if type(val) == type(''):
                # NOTE(review): only plain-str values are sanitized here;
                # under Python 2 a unicode value would bypass sanitizing --
                # confirm whether that is intended.
                val = sanitize_html(val)
            setattr(instance, field, val)
            instance.save(update_fields=[field])
            if hasattr(instance, "invalidate_cache"):
                instance.invalidate_cache()
            if model == Paper:
                # The edit may have changed the fingerprint, possibly
                # merging this paper into another one
                merged = instance.recompute_fingerprint_and_merge_if_needed()
                response['merged'] = ''
                if merged:
                    response['merged'] = merged.pk
                    response['merged_title'] = merged.title
            response['status'] = 'OK'
            response['value'] = val
            return response
        else:
            # Disallowed fields are handled like a missing object
            raise ObjectDoesNotExist
    except ObjectDoesNotExist:
        return response, 404
def cleanup_abstracts():
    """
    Run HTML sanitizing on the abstracts
    (this is normally done on creation of the papers,
    but not for old dumps of the database)
    """
    def resanitize(instance, field):
        # Re-sanitize one text field, saving only when it actually changed
        old_text = getattr(instance, field)
        if old_text:
            new_text = sanitize_html(old_text)
            if new_text != old_text:
                setattr(instance, field, new_text)
                instance.save()

    for publication in Publication.objects.all():
        resanitize(publication, 'abstract')
    for record in OaiRecord.objects.all():
        resanitize(record, 'description')
def cleanup_titles():
    """
    Run HTML sanitizing on all the titles of the papers
    (this is normally done on creation of the papers,
    but not for old dumps of the database)
    """
    for p in Paper.objects.all():
        new_title = sanitize_html(p.title)
        # Only hit the database when sanitizing actually changed the
        # title, avoiding one write per paper on already-clean dumps.
        if new_title != p.title:
            p.title = new_title
            p.save(update_fields=['title'])
def create(cls, title, author_names, pubdate, visible=True, affiliations=None, orcids=None):
    """
    Creates a (bare) paper. To save it to the database, we
    need to run the clustering algorithm to resolve Researchers for the authors,
    using `from_bare` from the (non-bare) :class:`Paper` subclass..

    :param title: The title of the paper (as a string). If it is too long for the database,
                  ValueError is raised.
    :param author_names: The ordered list of author names, as Name objects.
    :param pubdate: The publication date, as a python date object
    :param visible: The visibility of the paper if it is created. If another paper
                    exists, the visibility will be set to the maximum of the two possible
                    visibilities.
    :param affiliations: A list of (possibly None) affiliations for the authors. It has
                         to have the same length as the list of author names.
    :param orcids: same as affiliations, but for ORCID ids.
    """
    # Validate all arguments before building anything
    if not title or not author_names or not pubdate:
        raise ValueError(
            "A title, pubdate and authors have to be provided to create a paper.")

    if affiliations is not None and len(author_names) != len(affiliations):
        raise ValueError(
            "The number of affiliations and authors have to be equal.")
    if orcids is not None and len(author_names) != len(orcids):
        raise ValueError(
            "The number of ORCIDs (or Nones) and authors have to be equal.")
    if not isinstance(visible, bool):
        raise ValueError("Invalid paper visibility: %s" % str(visible))

    # Clean up the title before storing it
    title = sanitize_html(title)
    title = maybe_recapitalize_title(title)

    p = cls()
    p.title = title
    p.pubdate = pubdate  # pubdate will be checked in fingerprint computation
    p.visible = visible
    for idx, n in enumerate(author_names):
        a = BareAuthor()
        a.name = n
        if affiliations is not None:
            a.affiliation = affiliations[idx]
        if orcids is not None:
            # Invalid ORCIDs are silently dropped
            orcid = validate_orcid(orcids[idx])
            if orcid:
                a.orcid = orcid
        p.add_author(a, position=idx)

    # The fingerprint is computed only once all authors are attached
    p.fingerprint = p.new_fingerprint()

    return p
def changeAuthor(request):
    """
    Ajax-style handler updating the first and last name of an Author.
    Returns a response dict, optionally paired with an HTTP status code.
    """
    response = dict()
    try:
        author = Author.objects.get(pk=request.POST.get('pk'))
        # Sanitize the submitted name parts before using them
        first = request.POST.get('value[first]')
        if first:
            first = sanitize_html(first)
        last = request.POST.get('value[last]')
        if last:
            last = sanitize_html(last)
        if not first or not last:
            return {'message': 'First and last names are required.'}, 403
        if author.name.first != first or author.name.last != last:
            # Point the author at a (possibly new) Name record
            new_name = Name.lookup_name((first, last))
            new_name.save()
            author.name_id = new_name.pk
            author.save()
            author.paper.invalidate_cache()
        response['status'] = 'OK'
        researcher_id = author.researcher_id
        if not researcher_id:
            # Falsy researcher ids are reported as False in the payload
            researcher_id = False
        response['value'] = {
            'first': first,
            'last': last,
            'researcher_id': researcher_id
        }
        # The fingerprint might have changed and might collide with another paper
        merged = author.paper.recompute_fingerprint_and_merge_if_needed()
        response['merged'] = ''
        if merged:
            response['merged'] = merged.pk
            response['merged_title'] = merged.title
        return response
    except ObjectDoesNotExist:
        return response, 404
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper

    :param header: the OAI-PMH record header (provides the identifier)
    :param metadata: mapping of OAI metadata fields to lists of values
    :param paper: the paper the resulting BareOaiRecord is attached to
    """
    identifier = header.identifier()
    # description in oai_dc means abstract: keep the longest one
    curdesc = ""
    for desc in metadata['description']:
        if len(desc) > len(curdesc):
            curdesc = desc
    curdesc = sanitize_html(curdesc)
    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(header, metadata, self.oaisource.identifier)
    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]
    typenorms = ['typenorm:' + tn for tn in metadata.get('typenorm', [])]
    # Plain 'type' values take precedence over 'typenorm:'-prefixed ones;
    # the first value with a known translation wins.
    pubtype_list = metadata.get('type', []) + typenorms
    pubtype = None
    for raw_pubtype in pubtype_list:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype
    # Find the DOI, if any (the first one found is kept)
    doi = None
    for url in metadata['identifier'] + metadata['relation'] + metadata[
            'source']:
        if not doi:
            doi = to_doi(url)
    record = BareOaiRecord(source=self.oaisource,
                           identifier=identifier,
                           description=curdesc,
                           keywords=keywords,
                           contributors=contributors,
                           pubtype=pubtype,
                           pdf_url=pdf_url,
                           splash_url=splash_url,
                           doi=doi)
    paper.add_oairecord(record)
def add_oai_record(self, header, metadata, paper):
    """
    Add a record (from OAI-PMH) to the given paper
    """
    identifier = header.identifier()

    # In oai_dc, 'description' holds the abstract: keep the longest one
    descriptions = metadata['description']
    curdesc = max(descriptions, key=len) if descriptions else ""
    curdesc = sanitize_html(curdesc)

    # Run extractor to find the URLs
    splash_url, pdf_url = self.extract_urls(
        header, metadata, self.oaisource.identifier)

    keywords = ' | '.join(metadata['subject'])
    contributors = ' '.join(metadata['contributor'])[:4096]

    # Translate the publication type: plain 'type' values take precedence
    # over 'typenorm:'-prefixed ones; first known translation wins,
    # falling back on the source's default.
    typenorms = ['typenorm:'+tn for tn in metadata.get('typenorm', [])]
    pubtype = None
    for raw_pubtype in metadata.get('type', []) + typenorms:
        pubtype = OAI_PUBTYPE_TRANSLATIONS.get(raw_pubtype)
        if pubtype is not None:
            break
    if pubtype is None:
        pubtype = self.oaisource.default_pubtype

    # Find the DOI, if any (the first one found is kept)
    doi = None
    for url in metadata['identifier']+metadata['relation']+metadata['source']:
        doi = doi or to_doi(url)

    paper.add_oairecord(BareOaiRecord(
        source=self.oaisource,
        identifier=identifier,
        description=curdesc,
        keywords=keywords,
        contributors=contributors,
        pubtype=pubtype,
        pdf_url=pdf_url,
        splash_url=splash_url,
        doi=doi))
def consolidate_publication(publi):
    """
    Fetches the abstract from Zotero and adds it to the publication
    if it succeeds.
    """
    items = fetch_zotero_by_DOI(publi.doi)
    if items is None:
        return publi
    for item in items:
        # Store the abstract when Zotero provides one
        try:
            publi.description = sanitize_html(item['abstractNote'])
        except KeyError:
            pass
        else:
            publi.save(update_fields=['description'])
        # Pick up any PDF attachment URL
        for attachment in item.get('attachments', []):
            if attachment.get('mimeType') == 'application/pdf':
                publi.pdf_url = attachment.get('url')
                publi.save(update_fields=['pdf_url'])
                publi.about.update_availability()
    return publi
def consolidate_publication(publi):
    """
    Fetches the abstract from Zotero and adds it to the publication
    if it succeeds.
    """
    items = fetch_zotero_by_DOI(publi.doi)
    if items is None:
        return publi
    for item in items:
        # Store the abstract when Zotero provides one
        if 'abstractNote' in item:
            publi.abstract = sanitize_html(item['abstractNote'])
            publi.save(update_fields=['abstract'])
        # Pick up any PDF attachment URL
        attachments = item.get('attachments', [])
        pdf_attachments = [a for a in attachments
                           if a.get('mimeType') == 'application/pdf']
        for attachment in pdf_attachments:
            publi.pdf_url = attachment.get('url')
            publi.save(update_fields=['pdf_url'])
            publi.paper.update_availability()
    return publi
def add_oai_record(record, source, paper):
    """
    Add a record (from OAI-PMH) to the given paper

    :param record: pair where record[0] is the OAI header and record[1]
                   exposes the metadata through its _map attribute
    :param source: the source the record comes from (provides identifier,
                   name and default_pubtype)
    :param paper: the paper the resulting BareOaiRecord is attached to
    """
    header = record[0]
    identifier = header.identifier()
    # A description is useful: keep the longest one available
    curdesc = ""
    for desc in record[1]._map['description']:
        if len(desc) > len(curdesc):
            curdesc = desc
    curdesc = sanitize_html(curdesc)
    # Run extractor to find the URLs
    pdf_url = None
    splash_url = None
    if source.identifier:
        try:
            extractor = REGISTERED_EXTRACTORS[source.identifier]
            urls = extractor.extract(record)
            pdf_url = urls.get('pdf')
            splash_url = urls.get('splash')
        except KeyError:
            # No extractor registered for this source: warn and continue
            # without URLs
            print "Warning, invalid extractor for source "+source.name
    keywords = ' '.join(record[1]._map['subject'])
    contributors = ' '.join(record[1]._map['contributor'])[:4096]
    # Translate the first declared type, falling back on the source default
    pubtype_list = record[1]._map.get('type')
    pubtype = None
    if len(pubtype_list) > 0:
        pubtype = pubtype_list[0]
    #pubtype = source.default_pubtype
    pubtype = PUBTYPE_TRANSLATIONS.get(pubtype, source.default_pubtype)
    record = BareOaiRecord(
            source=source,
            identifier=identifier,
            description=curdesc,
            keywords=keywords,
            contributors=contributors,
            pubtype=pubtype,
            pdf_url=pdf_url,
            splash_url=splash_url)
    paper.add_oairecord(record)
def add_oai_record(record, source, paper): """ Add a record (from OAI-PMH) to the given paper """ header = record[0] identifier = header.identifier() # A description is useful curdesc = "" for desc in record[1]._map['description']: if len(desc) > len(curdesc): curdesc = desc curdesc = sanitize_html(curdesc) # Run extractor to find the URLs pdf_url = None splash_url = None if source.identifier: try: extractor = REGISTERED_EXTRACTORS[source.identifier] urls = extractor.extract(record) pdf_url = urls.get('pdf') splash_url = urls.get('splash') except KeyError: print "Warning, invalid extractor for source " + source.name keywords = ' '.join(record[1]._map['subject']) contributors = ' '.join(record[1]._map['contributor'])[:4096] pubtype_list = record[1]._map.get('type') pubtype = None if len(pubtype_list) > 0: pubtype = pubtype_list[0] #pubtype = source.default_pubtype pubtype = PUBTYPE_TRANSLATIONS.get(pubtype, source.default_pubtype) record = BareOaiRecord(source=source, identifier=identifier, description=curdesc, keywords=keywords, contributors=contributors, pubtype=pubtype, pdf_url=pdf_url, splash_url=splash_url) paper.add_oairecord(record)
def test_sanitize_html(self):
    # (input, expected output) pairs, checked in order
    cases = [
        ('My title<sub>is</sub><a href="http://dissem.in"><sup>nice</sup></a>',
         'My title<sub>is</sub><sup>nice</sup>'),
        ('$\\alpha$-conversion', '$\u03b1$-conversion'),
        ('$$\\eta + \\omega$$', '$\u03b7 + \u03c9$'),
        ('abc & def', 'abc & def'),
        ('Universitat Aut\\uFFFDnoma de Barcelona',
         'Universitat Aut�noma de Barcelona'),
    ]
    for raw, expected in cases:
        self.assertEqual(sanitize_html(raw), expected)
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description from RoMEO.
    If the data from RoMEO is more fresh than what we have in cache,
    we update our model.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :returns: the up-to-date Publisher instance
    :raises MetadataSourceException: if the publisher id, name or one of
            the archiving policies is missing from the RoMEO data
    """
    xml = romeo_xml_description
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')
    # The parent id is optional
    romeo_parent_id = None
    try:
        romeo_parent_id = xml.attrib['parentid']
    except KeyError:
        pass
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')
    # The alias is optional too
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (as publishers imported from doaj
    # all get the same id, so we have to use the name too).
    matches = None
    if re.match(r'\d+', romeo_id):  # numeric ids are unambiguous
        matches = Publisher.objects.filter(romeo_id=romeo_id)
    elif alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        first_match = matches[0]
        # Keep the cached copy when it is at least as fresh as RoMEO's
        if first_match.last_updated is not None and first_match.last_updated >= last_update:
            return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    except (KeyError, IndexError):
        pass

    preprint = None
    try:
        preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the preprint policy.')

    postprint = None
    try:
        postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the postprint policy.')

    pdfversion = None
    try:
        pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the pdf archiving policy.')

    # Compute OA status of the publisher
    # (set to unknown for now; classified properly after the conditions
    # have been stored below)
    status = 'UNK'

    if not matches:
        publisher = Publisher()
    else:
        publisher = matches[0]
    publisher.name = name
    publisher.alias = alias
    publisher.url = url
    publisher.preprint = preprint
    publisher.postprint = postprint
    publisher.pdfversion = pdfversion
    publisher.romeo_id = romeo_id
    publisher.romeo_parent_id = romeo_parent_id
    publisher.oa_status = status
    publisher.last_updated = last_update
    publisher.save()

    # When updating an existing publisher, drop its old links,
    # restrictions and conditions before re-importing them
    if matches:
        publisher.publishercopyrightlink_set.all().delete()
        publisher.publisherrestrictiondetail_set.all().delete()
        publisher.publishercondition_set.all().delete()

    # Add the conditions, restrictions, and copyright
    for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
        self.add_restriction(restriction, 'preprint', publisher)

    for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
        self.add_restriction(restriction, 'postprint', publisher)

    for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
        self.add_restriction(restriction, 'pdfversion', publisher)

    for condition in xml.findall('./conditions/condition'):
        if condition.text:
            c = PublisherCondition(publisher=publisher,
                                   text=condition.text.strip())
            c.save()

    # Update the publisher status (now that the conditions are stored)
    publisher.oa_status = publisher.classify_oa_status()
    publisher.save(update_fields=['oa_status'])
    # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
    # adequate task

    for link in xml.findall('./copyrightlinks/copyrightlink'):
        text = None
        url = None
        texts = link.findall('./copyrightlinktext')
        if texts:
            text = nstrip(texts[0].text)
        urls = link.findall('./copyrightlinkurl')
        if urls:
            url = nstrip(urls[0].text)
        if url and text:
            cplink = PublisherCopyrightLink(
                text=text, url=url[:1024], publisher=publisher)
            cplink.save()
    return publisher