def match_first_names(pair):
    """
    Returns true when the given pair of first names is compatible.

    A missing name (None) is compatible with anything; a one-letter name
    is treated as an initial and matched against the first letter of the
    other name; otherwise the two names are compared ignoring case and
    diacritics.

    >>> match_first_names(('A','Amanda'))
    True
    >>> match_first_names(('Amanda','Amanda'))
    True
    >>> match_first_names(('Alfred','Amanda'))
    False
    >>> match_first_names(('patrick','P'))
    True
    >>> match_first_names((None,'Iryna'))
    True
    >>> match_first_names(('Clément','Clement'))
    True
    """
    first, second = pair
    # An absent name does not contradict anything.
    if first is None or second is None:
        return True
    # Single letter: treat as an initial, compare to the other's first letter.
    if len(first) == 1 and len(second) > 0:
        return first.lower() == second[0].lower()
    if len(second) == 1 and len(first) > 0:
        return second.lower() == first[0].lower()
    # Full names: accent- and case-insensitive comparison.
    return remove_diacritics(first).lower() == remove_diacritics(second).lower()
def search(self):
    """
    Build and return the Haystack queryset over Paper from the form's
    cleaned data: free-text query, visibility, availability/status/date
    filters, author name filters, and ordering.
    """
    self.queryset = self.searchqueryset.models(Paper)
    # Free-text query, sent accent-free to the search backend.
    q = remove_diacritics(self.cleaned_data['q'])
    if q:
        self.queryset = self.queryset.auto_query(q)
    # Visibility: empty value means "visible only".
    visible = self.cleaned_data['visible']
    if visible == '':
        self.filter(visible=True)
    elif visible == 'invisible':
        self.filter(visible=False)
    # Simple field filters: index field name <- form field name.
    self.form_filter('availability', 'availability')
    self.form_filter('oa_status__in', 'oa_status')
    self.form_filter('pubdate__gte', 'pub_after')
    self.form_filter('pubdate__lte', 'pub_before')
    self.form_filter('doctype__in', 'doctypes')
    # Filter by authors.
    # authors field: a comma separated list of full/last names.
    # Items with no whitespace of prefixed with 'last:' are considered as
    # last names; others are full names.
    for name in self.cleaned_data['authors'].split(','):
        name = remove_diacritics(name.strip())
        if name.startswith('last:'):
            is_lastname = True
            name = name[5:].strip()
        else:
            is_lastname = ' ' not in name
        if not name:
            continue
        if is_lastname:
            self.filter(authors_last=name)
        else:
            # Sloppy phrase match tolerates one intervening token.
            self.filter(authors_full=Sloppy(name, slop=1))
    self.queryset = aggregate_combined_status(self.queryset)
    status = self.cleaned_data['status']
    if status:
        # post_filter: applied after aggregation, not before.
        self.queryset = self.queryset.post_filter(
            combined_status__in=status)
    # Default ordering by decreasing publication date
    order = self.cleaned_data['sort_by'] or 'pubdate'
    # When the user did NOT tick reverse_order, we prefix '-' so the
    # default is descending; ticking it yields ascending order.
    reverse_order = not self.cleaned_data['reverse_order']
    if reverse_order:
        order = '-' + order
    self.queryset = self.queryset.order_by(order).load_all()
    return self.queryset
def fetch_publisher(self, publisher_name):
    """
    Retrieve a publisher from the RoMEO API.

    Queries with the loose 'all' matching first; when that is
    ambiguous, retries with 'exact' matching. Returns None when no
    unambiguous match is found.
    """
    if publisher_name is None:
        return
    # Build the query (names are sent accent-free).
    terms = {'pub': remove_diacritics(publisher_name), 'qtype': 'all'}
    root = self.perform_romeo_query(terms)
    # Collect candidate publishers from the response.
    candidates = root.findall('./publishers/publisher')
    if not candidates:
        return
    if len(candidates) > 1:
        # Ambiguous: ask RoMEO for an exact match instead.
        terms['qtype'] = 'exact'
        root = self.perform_romeo_query(terms)
        candidates = root.findall('./publishers/publisher')
        if len(candidates) != 1:
            return
    return self.get_or_create_publisher(candidates[0])
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.

    :param search_terms: dictionary containing at least one of the
        allowed fields ('issn', 'jtitle')
    :param matching_mode: RoMEO matching type; when an 'exact' query
        yields nothing, the search is retried with 'contains'
    :raises ValueError: if search_terms contains an unexpected key
    :raises MetadataSourceException: if RoMEO returns a titleless journal
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()
    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         'but the dictionary I got is ' +
                         str(search_terms))
    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])
    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal
    # Perform the query
    root = perform_romeo_query(search_terms)
    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]
    names = list(journal.findall('./jtitle'))
    if not names:
        # BUG FIX: the original raised with "+ request", but no name
        # 'request' exists in this scope (NameError); report the search
        # terms instead.
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'Search terms were: ' + str(search_terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    # BUG FIX: 'except KeyError, IndexError:' caught only KeyError and
    # bound it to the name IndexError; a tuple catches both types.
    except (KeyError, IndexError):
        pass
def name_normalization(ident):
    """
    Produce a canonical form of an identifier: accent-free, lowercased,
    trimmed, with separators and non-text characters rewritten by the
    module-level nn_* patterns.
    """
    normalized = remove_diacritics(ident).lower().strip()
    normalized = nn_separator_re.sub('_', normalized)
    normalized = nn_escaping_chars_re.sub('', normalized)
    normalized = nn_final_nontext_re.sub('', normalized)
    return nn_nontext_re.sub('-', normalized)
def name_normalization(ident):
    """
    Canonicalize an identifier: strip accents, lowercase, trim, then
    apply the module-level nn_* substitutions in order.
    """
    result = remove_diacritics(ident).lower().strip()
    # Each (pattern, replacement) pair is applied in this fixed order.
    substitutions = (
        (nn_separator_re, '_'),
        (nn_escaping_chars_re, ''),
        (nn_final_nontext_re, ''),
        (nn_nontext_re, '-'),
    )
    for pattern, replacement in substitutions:
        result = pattern.sub(replacement, result)
    return result
def fetch_journal(search_terms, matching_mode = 'exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object.

    :param search_terms: dictionary containing at least one of the
        allowed fields ('issn', 'jtitle')
    :param matching_mode: RoMEO matching type; when an 'exact' query
        yields nothing, the search is retried with 'contains'
    :raises ValueError: if search_terms contains an unexpected key
    :raises MetadataSourceException: if RoMEO returns a titleless journal
    """
    allowed_fields = ['issn', 'jtitle']
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    original_search_terms = search_terms.copy()
    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        raise ValueError('The search terms have to belong to '+str(allowed_fields)+
                         'but the dictionary I got is '+str(search_terms))
    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])
    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal
    # Perform the query
    root = perform_romeo_query(search_terms)
    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, "+str(len(journals))+" journals match the RoMEO request, "+
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]
    names = list(journal.findall('./jtitle'))
    if not names:
        # BUG FIX: the original raised with "+ request", but no name
        # 'request' exists in this scope (NameError); report the search
        # terms instead.
        raise MetadataSourceException('RoMEO returned a journal without title.\n'+
                                      'Search terms were: '+str(search_terms))
    if len(names) > 1:
        print("Warning, "+str(len(names))+" names provided for one journal, "+
              "defaulting to the first one")
    name = kill_html(names[0].text)
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    # BUG FIX: 'except KeyError, IndexError:' caught only KeyError and
    # bound it to the name IndexError; a tuple catches both types.
    except (KeyError, IndexError):
        pass
def fetch_publisher(publisher_name):
    """
    Resolve a publisher name to a Publisher object.

    Resolution order: exact name match in the local database, then the
    AliasPublisher association counts, and finally the RoMEO API.
    Returns None when the name is None or no unambiguous match exists.
    """
    if publisher_name is None:
        return
    # First, let's see if we have a publisher with that name
    matching_publishers = Publisher.objects.filter(name=publisher_name)
    if len(matching_publishers) == 1:
        return matching_publishers[0]
    # Second, let's see if the publisher name has often been associated
    # to a known publisher (top two aliases by count).
    aliases = list(
        AliasPublisher.objects.filter(
            name=publisher_name).order_by('-count')[:2])
    if len(aliases) == 1:
        # Only one publisher found. If it has been seen often enough
        # under that name, keep it!
        if aliases[0].count > PUBLISHER_NAME_ASSOCIATION_THRESHOLD:
            AliasPublisher.increment(publisher_name, aliases[0].publisher)
            return aliases[0].publisher
    elif len(aliases) == 2:
        # More than one publisher found (two aliases returned as we limited
        # to the two first results). Then we need to make sure the first one
        # appears a lot more often than the second.
        if (aliases[0].count > PUBLISHER_NAME_ASSOCIATION_THRESHOLD and
                aliases[0].count >
                PUBLISHER_NAME_ASSOCIATION_FACTOR * aliases[1].count):
            AliasPublisher.increment(publisher_name, aliases[0].publisher)
            return aliases[0].publisher
    # Otherwise, let's try to fetch the publisher from RoMEO!
    # Prepare the query (names are sent accent-free).
    search_terms = dict()
    search_terms['pub'] = remove_diacritics(publisher_name)
    search_terms['qtype'] = 'all'
    root = perform_romeo_query(search_terms)
    # Find the publisher
    publishers = root.findall('./publishers/publisher')
    if len(publishers) == 0:
        return
    elif len(publishers) > 1:
        # NOTE: Python 2 print statement.
        print str(len(publishers)) + " results"
        # Ambiguous: retry asking RoMEO for an exact match.
        search_terms['qtype'] = 'exact'
        root = perform_romeo_query(search_terms)
        publishers = root.findall('./publishers/publisher')
        if len(publishers) != 1:
            return
    publisher = get_or_create_publisher(publishers[0])
    # Remember this name -> publisher association for future lookups.
    AliasPublisher.increment(publisher_name, publisher)
    return publisher
def fetch_publisher(publisher_name):
    """
    Resolve a publisher name to a Publisher object.

    Resolution order: exact name match in the local database, then the
    AliasPublisher association counts, and finally the RoMEO API.
    Returns None when the name is None or no unambiguous match exists.
    """
    if publisher_name is None:
        return
    # First, let's see if we have a publisher with that name
    matching_publishers = Publisher.objects.filter(name=publisher_name)
    if len(matching_publishers) == 1:
        return matching_publishers[0]
    # Second, let's see if the publisher name has often been associated
    # to a known publisher (top two aliases by count).
    aliases = list(AliasPublisher.objects.filter(
        name=publisher_name).order_by('-count')[:2])
    if len(aliases) == 1:
        # Only one publisher found. If it has been seen often enough
        # under that name, keep it!
        if aliases[0].count > PUBLISHER_NAME_ASSOCIATION_THRESHOLD:
            AliasPublisher.increment(publisher_name, aliases[0].publisher)
            return aliases[0].publisher
    elif len(aliases) == 2:
        # More than one publisher found (two aliases returned as we limited
        # to the two first results). Then we need to make sure the first one
        # appears a lot more often than the second.
        if (aliases[0].count > PUBLISHER_NAME_ASSOCIATION_THRESHOLD and
                aliases[0].count >
                PUBLISHER_NAME_ASSOCIATION_FACTOR*aliases[1].count):
            AliasPublisher.increment(publisher_name, aliases[0].publisher)
            return aliases[0].publisher
    # Otherwise, let's try to fetch the publisher from RoMEO!
    # Prepare the query (names are sent accent-free).
    search_terms = dict()
    search_terms['pub'] = remove_diacritics(publisher_name)
    search_terms['qtype'] = 'all'
    root = perform_romeo_query(search_terms)
    # Find the publisher
    publishers = root.findall('./publishers/publisher')
    if len(publishers) == 0:
        return
    elif len(publishers) > 1:
        # NOTE: Python 2 print statement.
        print str(len(publishers)) + " results"
        # Ambiguous: retry asking RoMEO for an exact match.
        search_terms['qtype'] = 'exact'
        root = perform_romeo_query(search_terms)
        publishers = root.findall('./publishers/publisher')
        if len(publishers) != 1:
            return
    publisher = get_or_create_publisher(publishers[0])
    # Remember this name -> publisher association for future lookups.
    AliasPublisher.increment(publisher_name, publisher)
    return publisher
def normalize_last_name(last):
    """
    Canonical form of a last name for comparison: hyphens become
    spaces, accents are stripped, and the result is lowercased.
    """
    hyphen_free = last.replace('-', ' ')
    return remove_diacritics(hyphen_free).lower()
def prepare_authors_last(self, obj):
    """
    Index-preparation helper: accent-free last names of the object's
    authors, in their original order.
    """
    result = []
    for author in obj.authors_list:
        result.append(remove_diacritics(author['name']['last']))
    return result
def create_paper_plain_fingerprint(title, authors, year):
    """
    Creates a robust summary of a bibliographic reference.
    This plain fingerprint should then be converted to an
    actual fingerprint by hashing it (so that the length
    remains constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    # Normalize the title: strip HTML, accents, case, punctuation,
    # and collapse spaces/hyphens into single hyphens.
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)
    buf = title
    # If the title is long enough, we return the fingerprint as is
    if len(buf) > 50:
        return buf
    # If the title is very short, we add the year (for "Preface",
    # "Introduction", "New members" cases)
    # if len(title) <= 16:
    if not '-' in title:
        buf += '-' + str(year)
    author_names_list = []
    for author in authors:
        if not author:
            continue
        author = (remove_diacritics(author[0]), remove_diacritics(author[1]))
        # Last name, without the small words such as "van", "der", "de"…
        last_name_words, last_name_separators = split_name_words(author[1])
        last_words = []
        for i, w in enumerate(last_name_words):
            # Keep capitalized words, plus words attached by a hyphen.
            if (w[0].isupper() or (i > 0 and
                                   last_name_separators[i - 1] == '-')):
                last_words.append(w)
        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words
        # Lowercase
        last_words = map(ulower, last_words)
        fp = '-'.join(last_words)
        author_names_list.append(fp)
    # Sort the authors' fingerprints so author order does not matter.
    author_names_list.sort()
    for fp in author_names_list:
        buf += '/' + fp
    return buf
def is_fully_capitalized(s):
    """
    Return True when the accent-stripped string contains no lowercase
    letter (as matched by the module-level lowercase_re pattern).
    """
    # Idiom fix: identity comparison against None uses 'is', not '=='
    # (PEP 8); re.search returns None on no match.
    return lowercase_re.search(remove_diacritics(s)) is None
def prepare_text(self, obj):
    """
    Index-preparation helper: the title followed by all author full
    names, joined with spaces and stripped of accents.
    """
    author_blob = ' '.join(self.prepare_authors_full(obj))
    return remove_diacritics(obj.title + ' ' + author_blob)
def prepare_text(self, obj):
    """
    Build the accent-free free-text blob for this object: its title
    plus the space-joined author full names.
    """
    names = self.prepare_authors_full(obj)
    combined = obj.title + ' ' + ' '.join(names)
    return remove_diacritics(combined)
def search(self):
    """
    Build and return the Haystack queryset over Paper from the form's
    cleaned data: free-text query, visibility, availability/status/date
    filters, author name or ORCID filters, and ordering.
    """
    self.queryset = self.searchqueryset.models(Paper)
    # Free-text query, sent accent-free to the search backend.
    q = remove_diacritics(self.cleaned_data['q'])
    if q:
        self.queryset = self.queryset.auto_query(q)
    # Visibility: empty value means "visible only".
    visible = self.cleaned_data['visible']
    if visible == '':
        self.filter(visible=True)
    elif visible == 'invisible':
        self.filter(visible=False)
    # Simple field filters: index field name <- form field name.
    self.form_filter('availability', 'availability')
    self.form_filter('oa_status__in', 'oa_status')
    self.form_filter('pubdate__gte', 'pub_after')
    self.form_filter('pubdate__lte', 'pub_before')
    self.form_filter('doctype__in', 'doctypes')
    # Filter by authors.
    # authors field: a comma separated list of full/last names.
    # Items with no whitespace of prefixed with 'last:' are considered as
    # last names; others are full names.
    for name in self.cleaned_data['authors'].split(','):
        name = name.strip()
        # If part of this author name matches ORCID identifiers, consider
        # these as orcid ids and do the filtering
        orcid_ids = [x for x in name.split(' ') if validate_orcid(x)]
        for orcid_id in orcid_ids:
            try:
                researcher = Researcher.objects.get(orcid=orcid_id)
                self.filter(researchers=researcher.id)
            except Researcher.DoesNotExist:
                # Unknown ORCID: silently skip it.
                pass
            # NOTE(review): this 'continue' is the last statement of the
            # loop body and therefore has no effect — confirm intended
            # placement against the original (unambiguous) layout.
            continue
        # Rebuild a full name excluding the ORCID id terms
        name = ' '.join([x for x in name.split(' ') if x not in orcid_ids])
        name = remove_diacritics(name.strip())
        if name.startswith('last:'):
            is_lastname = True
            name = name[5:].strip()
        else:
            is_lastname = ' ' not in name
        if not name:
            continue
        if is_lastname:
            self.filter(authors_last=name)
        else:
            # Match the name in both orders ("First Last" / "Last First"),
            # tolerating one intervening token (slop=1).
            reversed_name = ' '.join(reversed(name.split(' ')))
            sq = SQ()
            sq.add(SQ(authors_full=Sloppy(name, slop=1)), SQ.OR)
            sq.add(SQ(authors_full=Sloppy(reversed_name, slop=1)), SQ.OR)
            self.queryset = self.queryset.filter(sq)
    self.queryset = aggregate_combined_status(self.queryset)
    status = self.cleaned_data['status']
    if status:
        # post_filter: applied after aggregation, not before.
        self.queryset = self.queryset.post_filter(
            combined_status__in=status)
    # Default ordering by decreasing publication date
    order = self.cleaned_data['sort_by'] or '-pubdate'
    self.queryset = self.queryset.order_by(order).load_all()
    return self.queryset
def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO. Returns a Journal object, or
    None when nothing matches.

    :param search_terms: dictionary containing at least one of the
        allowed fields ('issn', 'jtitle')
    :param matching_mode: RoMEO matching type; anything other than
        'exact' is forwarded as the query's 'qtype'
    :raises ValueError: if search_terms contains an unexpected key
    :raises MetadataSourceException: if RoMEO returns a titleless journal
    """
    allowed_fields = ['issn', 'jtitle']
    # Work on a copy so the caller's dict is not mutated.
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in
    # the API
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])
    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) +
                         'but the dictionary I got is ' +
                         str(terms))
    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        # Overlong terms cannot be valid queries: bail out.
        if len(terms[key]) > 256:
            return None
    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal
    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)
    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]
    names = list(journal.findall('./jtitle'))
    if not names:
        # NOTE: unicode() is Python 2 only.
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' +
            'Terms were: ' + unicode(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        # No ISSN in the response: proceed with title only.
        pass
    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal
    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]
    publisher = get_or_create_publisher(publisher_desc)
    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result
def test_remove_diacritics(self):
    """Accented unicode text is transliterated; bytes pass through unchanged."""
    accented_text = 'aéèï'
    self.assertEqual(remove_diacritics(accented_text), 'aeei')
    raw_bytes = 'aéè'.encode('utf-8')
    self.assertEqual(remove_diacritics(raw_bytes), b'a\xc3\xa9\xc3\xa8')
def google_scholar_link(self):
    """
    Link to search for the paper in Google Scholar
    """
    # The title is sent accent-free as the 'q' query parameter.
    query_string = urlencode({'q': remove_diacritics(self.title)})
    return 'http://scholar.google.com/scholar?' + query_string
def create_paper_plain_fingerprint(title, authors, year):
    """
    Creates a robust summary of a bibliographic reference.
    This plain fingerprint should then be converted to an
    actual fingerprint by hashing it (so that the length
    remains constant).

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    'ambiguity-2014/doe'
    """
    # Normalize the title: strip HTML, accents, case, punctuation,
    # and collapse spaces/hyphens into single hyphens.
    title = kill_html(title)
    title = remove_diacritics(title).lower()
    title = stripped_chars.sub('', title)
    title = title.strip()
    title = re.sub('[ -]+', '-', title)
    buf = title
    # If the title is long enough, we return the fingerprint as is
    if len(buf) > 50:
        return buf
    # If the title is very short, we add the year (for "Preface",
    # "Introduction", "New members" cases)
    # if len(title) <= 16:
    if not '-' in title:
        buf += '-'+str(year)
    author_names_list = []
    for author in authors:
        if not author:
            continue
        author = (remove_diacritics(author[0]), remove_diacritics(author[1]))
        # Last name, without the small words such as "van", "der", "de"…
        last_name_words, last_name_separators = split_name_words(author[1])
        last_words = []
        for i, w in enumerate(last_name_words):
            # Keep capitalized words, plus words attached by a hyphen.
            if (w[0].isupper() or (i > 0 and
                                   last_name_separators[i-1] == '-')):
                last_words.append(w)
        # If no word was uppercased, fall back on all the words
        if not last_words:
            last_words = last_name_words
        # Lowercase
        last_words = list(map(ulower, last_words))
        fp = '-'.join(last_words)
        author_names_list.append(fp)
    # Sort the authors' fingerprints so author order does not matter.
    author_names_list.sort()
    for fp in author_names_list:
        buf += '/'+fp
    return buf
def core_link(self):
    """
    Link to search for the paper in CORE
    """
    # Percent-encode the accent-free title for use in the URL path.
    encoded_title = quote(remove_diacritics(self.title))
    return 'http://core.ac.uk/search/' + encoded_title
def normalize_last_name(last):
    """
    Accent- and hyphen-insensitive canonical form of a last name,
    suitable for comparisons.
    """
    spaced = last.replace('-', ' ')
    ascii_form = remove_diacritics(spaced)
    return ascii_form.lower()
def search(self):
    """
    Build and return the Haystack queryset over Paper from the form's
    cleaned data: free-text query, visibility, availability/status/date
    filters, author name or ORCID filters, and ordering.
    """
    self.queryset = self.searchqueryset.models(Paper)
    # Free-text query, sent accent-free to the search backend.
    q = remove_diacritics(self.cleaned_data['q'])
    if q:
        self.queryset = self.queryset.auto_query(q)
    # Visibility: empty value means "visible only".
    visible = self.cleaned_data['visible']
    if visible == '':
        self.filter(visible=True)
    elif visible == 'invisible':
        self.filter(visible=False)
    # Simple field filters: index field name <- form field name.
    self.form_filter('availability', 'availability')
    self.form_filter('oa_status__in', 'oa_status')
    self.form_filter('pubdate__gte', 'pub_after')
    self.form_filter('pubdate__lte', 'pub_before')
    self.form_filter('doctype__in', 'doctypes')
    # Filter by authors.
    # authors field: a comma separated list of full/last names.
    # Items with no whitespace of prefixed with 'last:' are considered as
    # last names; others are full names.
    for name in self.cleaned_data['authors'].split(','):
        name = name.strip()
        # If part of this author name matches ORCID identifiers, consider
        # these as orcid ids and do the filtering
        orcid_ids = [x for x in name.split(' ') if validate_orcid(x)]
        for orcid_id in orcid_ids:
            self.filter(orcids=orcid_id)
        # Rebuild a full name excluding the ORCID id terms
        name = ' '.join([x for x in name.split(' ') if x not in orcid_ids])
        name = remove_diacritics(name.strip())
        if name.startswith('last:'):
            is_lastname = True
            name = name[5:].strip()
        else:
            is_lastname = ' ' not in name
        if not name:
            continue
        if is_lastname:
            self.filter(authors_last=name)
        else:
            # Match the name in both orders ("First Last" / "Last First"),
            # tolerating one intervening token (slop=1).
            reversed_name = ' '.join(reversed(name.split(' ')))
            sq = SQ()
            sq.add(SQ(authors_full=Sloppy(name, slop=1)), SQ.OR)
            sq.add(SQ(authors_full=Sloppy(reversed_name, slop=1)), SQ.OR)
            self.queryset = self.queryset.filter(sq)
    self.queryset = aggregate_combined_status(self.queryset)
    status = self.cleaned_data['status']
    if status:
        # post_filter: applied after aggregation, not before.
        self.queryset = self.queryset.post_filter(
            combined_status__in=status)
    # Default ordering by decreasing publication date
    order = self.cleaned_data['sort_by'] or '-pubdate'
    self.queryset = self.queryset.order_by(order).load_all()
    return self.queryset
def is_fully_capitalized(s):
    """
    Is this word fully capitalized?

    Returns True when the accent-stripped string contains no lowercase
    letter (as matched by the module-level lowercase_re pattern).
    """
    # Idiom fix: identity comparison against None uses 'is', not '=='
    # (PEP 8); re.search returns None on no match.
    return lowercase_re.search(remove_diacritics(s)) is None