def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model, the publisher
    corresponding to the <publisher> description from RoMEO.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the mandatory id or name is missing
    """
    xml = romeo_xml_description

    # The RoMEO id is mandatory: without it we cannot deduplicate publishers.
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    # The name is mandatory too; it is stripped of any HTML markup.
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    # The alias is optional, hence the silent fallback to None.
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Check if we already have it
    matches = None
    if alias:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
    else:
        matches = Publisher.objects.filter(
            romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
    if matches:
        return matches[0]

    # Otherwise, create it
    url = None
    try:
        url = nstrip(xml.findall('./homeurl')[0].text)
    # BUGFIX: 'except KeyError, IndexError:' is Python-2-only syntax which
    # caught only KeyError and rebound it to the name IndexError; a tuple
    # catches both exception types.
    except (KeyError, IndexError):
        pass
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model, the publisher
    corresponding to the <publisher> description from RoMEO.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the mandatory id or name is missing
    """
    xml = romeo_xml_description

    # The RoMEO id is mandatory.
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is a module-level name not visible in this
        # chunk — presumably the last queried RoMEO URL; confirm.
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)

    # The name is mandatory too; it is stripped of any HTML markup.
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)

    # The alias is optional: failures to fetch it are ignored.
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUGFIX: Python 2's 'except KeyError, IndexError:' caught only KeyError
    # and rebound it to the name IndexError; use a tuple to catch both.
    except (KeyError, IndexError):
        pass
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.

    :param search_terms: a dictionary containing at least one of the
        fields 'issn' or 'jtitle'
    :param matching_mode: 'exact' first; on no result the query is retried
        once with 'contains'
    :raises ValueError: if search_terms contains an unsupported key
    :raises MetadataSourceException: if RoMEO returns a journal without title
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in
    # the API.
    # NOTE(review): 'title' is not in allowed_fields, so a dict containing it
    # would fail the validation below anyway — possibly meant 'jtitle'.
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        # BUGFIX: added the missing space before 'but' in the message.
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO).
    # Note: this mutates the caller's dict in place, as the original did.
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'URL was: ' + request)
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional.
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    # BUGFIX: 'except KeyError, IndexError:' (Python 2 syntax) caught only
    # KeyError; a tuple catches both.
    except (KeyError, IndexError):
        pass
def addRestriction(xml, applies_to, publisher):
    """Store a publisher restriction parsed from a RoMEO XML node.

    Does nothing when the node carries no (non-whitespace) text.
    """
    restriction_text = nstrip(xml.text)
    if not restriction_text:
        return
    PublisherRestrictionDetail(publisher=publisher,
                               applies_to=applies_to,
                               text=restriction_text).save()
def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model, the publisher
    corresponding to the <publisher> description from RoMEO.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the mandatory id or name is missing
    """
    xml = romeo_xml_description

    # The RoMEO id is mandatory.
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        # NOTE(review): 'request' is a module-level name not visible in this
        # chunk — presumably the last queried RoMEO URL; confirm.
        raise MetadataSourceException(
            'RoMEO did not provide a publisher id.\n' + 'URL was: ' + request)

    # The name is mandatory too; it is stripped of any HTML markup.
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.\n' +
            'URL was: ' + request)

    # The alias is optional: failures to fetch it are ignored.
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    # BUGFIX: Python 2's 'except KeyError, IndexError:' caught only KeyError
    # and rebound it to the name IndexError; use a tuple to catch both.
    except (KeyError, IndexError):
        pass
def add_restriction(self, xml, applies_to, publisher):
    """
    Creates a sharing restriction (SAD!) for a publisher.

    Skips the node entirely when it has no (non-whitespace) text.
    """
    restriction_text = nstrip(xml.text)
    if not restriction_text:
        return
    detail = PublisherRestrictionDetail(publisher=publisher,
                                        applies_to=applies_to,
                                        text=restriction_text)
    detail.save()
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.

    :param search_terms: a dictionary containing at least one of the
        fields 'issn' or 'jtitle'
    :param matching_mode: 'exact' first; on no result the query is retried
        once with 'contains'
    :raises ValueError: if search_terms contains an unsupported key
    :raises MetadataSourceException: if RoMEO returns a journal without title
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in
    # the API.
    # NOTE(review): 'title' is not in allowed_fields, so a dict containing it
    # would fail the validation below anyway — possibly meant 'jtitle'.
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        # BUGFIX: added the missing space before 'but' in the message.
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         ' but the dictionary I got is ' + str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO).
    # Note: this mutates the caller's dict in place, as the original did.
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'URL was: ' + request)
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional.
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    # BUGFIX: 'except KeyError, IndexError:' (Python 2 syntax) caught only
    # KeyError; a tuple catches both.
    except (KeyError, IndexError):
        pass
def test_nstrip(self):
    """nstrip passes None through and strips surrounding whitespace otherwise."""
    self.assertIsNone(nstrip(None))
    self.assertEqual(nstrip('aa'), 'aa')
    self.assertEqual(nstrip(' aa \n'), 'aa')
except KeyError, IndexError: pass # Check if we already have it matches = None if alias: matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name,alias__iexact=alias) else: matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name,alias__isnull=True) if matches: return matches[0] # Otherwise, create it url = None try: url = nstrip(xml.findall('./homeurl')[0].text) except KeyError, IndexError: pass preprint = None try: preprint = xml.findall('./preprints/prearchiving')[0].text.strip() except (KeyError, IndexError, AttributeError): raise MetadataSourceException('RoMEO did not provide the preprint policy.\n'+ 'URL was: '+request) postprint = None try: postprint = xml.findall('./postprints/postarchiving')[0].text.strip() except (KeyError, IndexError, AttributeError): raise MetadataSourceException('RoMEO did not provide the postprint policy.\n'+
matches = None if alias: matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name, alias__iexact=alias) else: matches = Publisher.objects.filter(romeo_id=romeo_id, name__iexact=name, alias__isnull=True) if matches: return matches[0] # Otherwise, create it url = None try: url = nstrip(xml.findall('./homeurl')[0].text) except KeyError, IndexError: pass preprint = None try: preprint = xml.findall('./preprints/prearchiving')[0].text.strip() except (KeyError, IndexError, AttributeError): raise MetadataSourceException( 'RoMEO did not provide the preprint policy.\n' + 'URL was: ' + request) postprint = None try: postprint = xml.findall('./postprints/postarchiving')[0].text.strip() except (KeyError, IndexError, AttributeError):
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object (or None
    when nothing matches or a search term is too long).

    search_terms should be a dictionary object containing at least one of
    the fields 'issn' or 'jtitle'.
    """
    allowed_fields = ['issn', 'jtitle']
    # Work on a copy so the caller's dict is not mutated.
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in
    # the API
    # NOTE(review): 'title' is not in allowed_fields, so a dict containing it
    # fails the validation below — possibly meant 'jtitle'; confirm.
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])
    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         'but the dictionary I got is ' + str(terms))
    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        # Over-long terms are rejected outright rather than sent to the API.
        if len(terms[key]) > 256:
            return None
    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal
    # Perform the query; non-exact modes are passed via RoMEO's qtype param.
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)
    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
    # TODO different behaviour: get the ISSN and try again.
journal = journals[0] names = list(journal.findall('./jtitle')) if not names: raise MetadataSourceException( 'RoMEO returned a journal without title.\n' + 'Terms were: ' + unicode(terms)) if len(names) > 1: print("Warning, " + str(len(names)) + " names provided for one journal, " + "defaulting to the first one") name = kill_html(names[0].text) issn = None try: issn = nstrip(journal.findall('./issn')[0].text) except (KeyError, IndexError): pass # Now we may have additional info, so it's worth trying again in the model model_journal = find_journal_in_model({'issn': issn, 'jtitle': name}) if model_journal: return model_journal # Otherwise we need to find the publisher publishers = root.findall('./publishers/publisher') if not publishers: return None # TODO here we shouldn't default to the first one but look it up using the # <romeopub> publisher_desc = publishers[0] publisher = get_or_create_publisher(publisher_desc) result = Journal(title=name, issn=issn, publisher=publisher) result.save() return result
# Store each textual sharing condition attached to the publisher.
for condition in xml.findall('./conditions/condition'):
    if condition.text:
        c = PublisherCondition(publisher=publisher,
                               text=condition.text.strip())
        c.save()

# Update the publisher status
# (classify_oa_status presumably derives it from the saved conditions —
# confirm against the Publisher model.)
publisher.oa_status = publisher.classify_oa_status()
publisher.save(update_fields=['oa_status'])

# Store copyright links; a link is only kept when both its text and its
# URL are present.
for link in xml.findall('./copyrightlinks/copyrightlink'):
    text = None
    url = None
    texts = link.findall('./copyrightlinktext')
    if texts:
        text = nstrip(texts[0].text)
    urls = link.findall('./copyrightlinkurl')
    if urls:
        url = nstrip(urls[0].text)
    if url and text:
        cplink = PublisherCopyrightLink(text=text, url=url,
                                        publisher=publisher)
        cplink.save()
return publisher


def addRestriction(xml, applies_to, publisher):
    # Create a restriction record from a RoMEO node, skipping empty text.
    text = nstrip(xml.text)
    if text:
def get_or_create_publisher(self, romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.
    If the data from RoMEO is more fresh than what we have
    in cache, we update our model.

    :param romeo_xml_description: the <publisher> XML element from RoMEO
    :raises MetadataSourceException: if the mandatory id or name is missing
    """
    xml = romeo_xml_description

    # The RoMEO id is mandatory: without it we cannot deduplicate.
    romeo_id = None
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.')

    # The parent id is optional (only set for imprints of larger groups —
    # presumably; confirm against RoMEO's schema).
    romeo_parent_id = None
    try:
        romeo_parent_id = xml.attrib['parentid']
    except KeyError:
        pass

    # The name is mandatory; it is stripped of any HTML markup.
    name = None
    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException(
            'RoMEO did not provide the publisher\'s name.')

    # The alias is optional: failures to fetch it are ignored.
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        pass

    # Freshness marker used to decide whether our cached copy is stale.
    last_update = self._get_romeo_date(xml, './dateupdated')

    # Check if we already have it.
    # Sadly the romeo_id is not unique (as publishers imported from doaj
    # all get the same id, so we have to use the name too).
matches = None
if re.match(r'\d+', romeo_id):
    # numeric ids are unambiguous
    # NOTE(review): re.match only anchors at the start, so 'romeo_id's like
    # '123abc' would also pass; r'\d+$' would require a fully numeric id —
    # confirm which is intended.
    matches = Publisher.objects.filter(romeo_id=romeo_id)
elif alias:
    matches = Publisher.objects.filter(
        romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
else:
    matches = Publisher.objects.filter(
        romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
if matches:
    first_match = matches[0]
    # Our cached copy is still fresh: return it without touching RoMEO data.
    if first_match.last_updated is not None and first_match.last_updated >= last_update:
        return matches[0]

# Otherwise, create it
url = None
try:
    url = nstrip(xml.findall('./homeurl')[0].text)
except (KeyError, IndexError):
    pass

# The three archiving policies are mandatory in a RoMEO record.
preprint = None
try:
    preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
except (KeyError, IndexError, AttributeError):
    raise MetadataSourceException(
        'RoMEO did not provide the preprint policy.')

postprint = None
try:
    postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
except (KeyError, IndexError, AttributeError):
    raise MetadataSourceException(
        'RoMEO did not provide the postprint policy.')

pdfversion = None
try:
    pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
except (KeyError, IndexError, AttributeError):
    raise MetadataSourceException(
        'RoMEO did not provide the pdf archiving policy.')

# Compute OA status of the publisher
# Placeholder status; recomputed later once conditions are stored.
status = 'UNK'

# Reuse the matched row when updating, otherwise create a fresh one.
if not matches:
    publisher = Publisher()
else:
    publisher = matches[0]

publisher.name = name
publisher.alias = alias
publisher.url = url
publisher.preprint = preprint
publisher.postprint = postprint
publisher.pdfversion = pdfversion
publisher.romeo_id = romeo_id
publisher.romeo_parent_id = romeo_parent_id
publisher.oa_status = status
publisher.last_updated = last_update
publisher.save()

# When updating an existing publisher, wipe its dependent records so the
# loops below can repopulate them from the fresh RoMEO data.
if matches:
    publisher.publishercopyrightlink_set.all().delete()
    publisher.publisherrestrictiondetail_set.all().delete()
    publisher.publishercondition_set.all().delete()

# Add the conditions, restrictions, and copyright
for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
self.add_restriction(restriction, 'preprint', publisher)

for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
    self.add_restriction(restriction, 'postprint', publisher)

for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
    self.add_restriction(restriction, 'pdfversion', publisher)

# Store each textual sharing condition attached to the publisher.
for condition in xml.findall('./conditions/condition'):
    if condition.text:
        c = PublisherCondition(publisher=publisher,
                               text=condition.text.strip())
        c.save()

# Update the publisher status
publisher.oa_status = publisher.classify_oa_status()
publisher.save(update_fields=['oa_status'])
# TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
# adequate task

# Store copyright links; a link is kept only when both text and URL exist.
for link in xml.findall('./copyrightlinks/copyrightlink'):
    text = None
    url = None
    texts = link.findall('./copyrightlinktext')
    if texts:
        text = nstrip(texts[0].text)
    urls = link.findall('./copyrightlinkurl')
    if urls:
        url = nstrip(urls[0].text)
    if url and text:
        # NOTE(review): the URL is truncated to 1024 chars (presumably the
        # model's field length) but text is not — confirm text's max length.
        cplink = PublisherCopyrightLink(
            text=text, url=url[:1024], publisher=publisher)
        cplink.save()
return publisher