def tags(record):
    """Normalise the tags of a bibtex record and pre-split its names.

    The record is passed through ``b.customization.convert_to_unicode``,
    ``c.author`` and ``c.editor``. The ``tags`` field, when present, is split
    on ``,`` or ``;`` (newlines removed first) and stored back as a set of
    stripped strings. The split names are stored under ``p_authors``.

    :param record: a single record dict; must contain ``ID``.
    :return: the updated record.
    """
    record = b.customization.convert_to_unicode(record)
    record = c.author(record)
    record = c.editor(record)

    tag_set = set()
    if 'tags' in record:
        flattened = record["tags"].replace('\n', '')
        tag_set.update(i.strip() for i in re.split(',|;', flattened))
    record['tags'] = tag_set

    record['p_authors'] = []
    logging.debug(f"Handling: {record['ID']}")

    if 'author' in record:
        try:
            record['p_authors'] = [
                c.splitname(x, False) for x in record['author']
            ]
        except Exception:
            # Best effort: keep the empty list and report the failure instead
            # of dropping into an interactive debugger mid-run.
            logging.exception("Could not split author names for: %s",
                              record['ID'])

    if 'editor' in record:
        # NOTE(review): this replaces the author names rather than extending
        # them - confirm that editors are meant to take precedence.
        record['p_authors'] = [c.splitname(x, False) for x in record['editor']]

    return record
def test_splitname_cases(self):
    """Check customization.splitname() against the reference BibTeX output.

    Each ``(name, expected)`` pair in ``splitname_test_cases`` is split and
    compared with its expected dictionary.
    """
    for case in splitname_test_cases:
        name, expected = case
        actual = splitname(name)
        failure_note = "Input name: {0}".format(name)
        self.assertEqual(actual, expected, msg=failure_note)
def format_author_list(authors):
    """Return the formatted form of every name in *authors*.

    Each name has its braces cleaned, is split in non-strict mode and is then
    handed to ``format_author_name``; input order is preserved.
    """
    return [
        format_author_name(
            splitname(cleaner.clean_braces(raw_author), strict_mode=False))
        for raw_author in authors
    ]
def clean_name(author):
    """Return a cleaned, formatted author name.

    :param author: either a ``str`` or a ``dict`` holding a ``"name"`` key or
        ``"given"`` and/or ``"family"`` keys.
    :return: the formatted name. A dict ``"name"`` is cleaned and enclosed in
        braces as-is; an institute author (as detected by
        ``clean_institute_author``) is returned without being split.
    :raises KeyError: if *author* is a dict with none of the expected keys.
    :raises Exception: if *author* is neither a dict nor a str.
    """
    if isinstance(author, dict):
        if "name" in author:
            return enclose_braces(clean_text(author["name"].strip()))

        if "given" in author and "family" in author:
            full_name = author["given"] + " " + author["family"]
        elif "given" in author or "family" in author:
            # Only half of the name is known: flag it for manual review.
            full_name = author["given"] if "given" in author else author["family"]
            print("Check me: " + full_name)
        else:
            # Previously this path read author["name"], which is known to be
            # missing here, and so always failed with an unexplained KeyError.
            raise KeyError(
                "author has no 'name', 'given' or 'family' key: {!r}".format(
                    author))
        name_parts = splitname(full_name)
    elif isinstance(author, str):
        institute_author, is_institute = clean_institute_author(author)
        if is_institute:
            return institute_author
        name_parts = splitname(clean_braces(author))
    else:
        raise Exception("Unknown author type!")

    return formatter.format_author_name(name_parts)
def split_authors_name(
        authors: List[str],
        separator: str = "and") -> List[Dict[str, str]]:
    """
    Convert a list of authors to papis formatted data.

    :arg authors: A list of single author names or multiple authors
        separated by *separator*.
    :arg separator: The word between two names. It is inserted verbatim into
        a regular expression and must be surrounded by whitespace to match.
    :returns: One ``dict(family=..., given=...)`` per non-blank name, in
        input order.
    """
    from bibtexparser.customization import splitname

    # Compile once instead of on every element of *authors*.
    splitter = re.compile(r"\s+{}\s+".format(separator))

    author_list = []
    for subauthors in authors:
        for author in splitter.split(subauthors):
            parts = splitname(author)
            if not parts:
                # A blank name yields an empty dict; indexing it below would
                # raise KeyError, so skip it.
                continue
            given = " ".join(parts["first"])
            family = " ".join(parts["von"] + parts["last"] + parts["jr"])
            author_list.append(dict(family=family, given=given))
    return author_list
def handle_authors(entry: dict, ) -> dict:
    """
    Sets 'author' and 'editor' each to:
        1) a list of dicts, one per name, built from
           bib_custom.splitname
        2) None, if 'author'/'editor' is not in the entry

    In every dict 'first' keeps the list of given-name tokens, while 'last',
    'von' and 'jr' are space-joined strings ('' when empty). All tokens are
    kept, so a multi-word particle such as "van der" is not truncated.

    Example:
        entry['author'] = [
            {
                'first': ['J.', 'L.'],
                'last': 'Bredas',
                'von': '',
                'jr': '',
            },
            {
                'first': ['Georg', 'Henrik'],
                'last': 'Wright',
                'von': 'von',
                'jr': '',
            },
        ]

    :param entry: entry dict; 'author'/'editor' are ' and '-separated strings
    :return: the same entry dict with formatted author and editor
    """
    for field in ('author', 'editor'):
        if field not in entry:
            entry[field] = None
            continue

        parsed_names = []
        for raw_name in entry[field].split(' and '):
            parts = bib_custom.splitname(raw_name)
            if not parts:
                # Blank name: nothing to record, and parts['first'] below
                # would raise KeyError.
                continue
            flat = {key: ' '.join(tokens) for key, tokens in parts.items()}
            flat['first'] = parts['first']
            parsed_names.append(flat)
        entry[field] = parsed_names
    return entry
def custom(record):
    """Apply the ``c`` customisations, then collect tags and split authors.

    The ``tags``, ``keywords`` and ``mendeley-tags`` fields are each split on
    ``,`` or ``;`` and merged into one set stored under ``tags``. The split
    author names are stored under ``p_authors`` (empty list when the record
    has no ``author``).
    """
    for transform in (c.type, c.author, c.editor, c.journal, c.keyword,
                      c.link, c.doi):
        record = transform(record)

    collected = set()
    for field in ('tags', 'keywords', 'mendeley-tags'):
        if field in record:
            flattened = record[field].replace('\n', '')
            collected.update(
                [piece.strip() for piece in re.split(',|;', flattened)])
    record['tags'] = collected

    record['p_authors'] = []
    if 'author' in record:
        split_names = [c.splitname(name, False) for name in record['author']]
        record['p_authors'] = split_names
    return record
def clean_full(record):
    """Run every ``c`` customisation, then gather tags and people.

    Tags come from the ``tags``, ``keywords`` and ``mendeley-tags`` fields
    (split on ``,`` or ``;``) and are stored as a set under ``tags``.
    ``p_authors`` receives the authors followed by the editors.
    """
    record = c.type(record)
    record = c.author(record)
    record = c.editor(record)
    record = c.journal(record)
    record = c.keyword(record)
    record = c.link(record)
    record = c.doi(record)

    def pieces_of(text):
        # Drop newlines, split on either delimiter, trim every piece.
        return [i.strip() for i in re.split(',|;', text.replace('\n', ''))]

    tag_pool = set()
    for key in ("tags", "keywords", "mendeley-tags"):
        if key in record:
            tag_pool.update(pieces_of(record[key]))
    record['tags'] = tag_pool

    record['p_authors'] = []
    if 'author' in record:
        # NOTE(review): authors are stored as ' and '-split lists whereas
        # editors below are splitname() results - confirm the mixed shape is
        # intended.
        author_chunks = [x.split(' and ') for x in record['author']]
        record['p_authors'] += author_chunks
    if 'editor' in record:
        editor_parts = [c.splitname(x, False) for x in record['editor']]
        record['p_authors'] += editor_parts
    return record
def custom(record):
    """Clean one record: unicode, attached files, file location and tags.

    Steps, each of which records a marker in ``record['error']`` on failure
    instead of aborting:

    1. convert the record to unicode (``'unicode'``);
    2. normalise the ``file`` field and add ``hashes`` for the listed files
       if the record has none yet (``'file'``);
    3. for every file outside ``args.library``, interactively offer to copy
       it to ``<library>/<year>/<surnames>`` (``'file_copy'``);
    4. when the record has no ``tags``, merge ``keywords`` and
       ``mendeley-tags`` into a comma-joined ``tags`` field (``'tag'``).

    :param record: a single record dict; must contain ``ID``.
    :return: the updated record.
    """
    try:
        record = c.convert_to_unicode(record)
    except TypeError:
        logging.warning("Unicode Error on: {}".format(record['ID']))
        record['error'] = 'unicode'

    # Bound up front so the relocation step still works when the 'file' field
    # is missing or unparsable.
    file_set = set()
    try:
        # add md5 of associated files
        files = [add_slash_if_necessary(y)
                 for x in record['file'].split(';')
                 for y in x.split(':')
                 if bool(y.strip()) and y.strip().lower() != 'pdf']
        file_set = set(files)
        if 'hashes' not in record:
            hashes = [file_to_hash(x) for x in file_set]
            record['hashes'] = ";".join(hashes)
        # regularize format of files list
        record['file'] = ";".join(file_set)
    except Exception as e:
        # Not every exception carries args; fall back to the exception itself.
        detail = e.args[0] if e.args else e
        logging.warning("File Error: {} : {}".format(record['ID'], detail))
        record['error'] = 'file'

    # todo: if file is not in the library common prefix, move it there
    # look for year, then first surname, then copy in, making dir if necessary
    # Iterate over a snapshot: the loop body swaps entries inside file_set.
    for x in list(file_set):
        try:
            current_path = realpath(x)
            common = commonpath([current_path, args.library])
            if common != args.library:
                logging.info("Found file outside library: {}".format(current_path))
                logging.info("Common: {}".format(common))
                # get the author and year
                year = record['year']
                authors = c.getnames(
                    [i.strip()
                     for i in record["author"].replace('\n', ' ').split(" and ")])
                authors_split = [c.splitname(a) for a in authors]
                author_surnames = [a['last'][0] for a in authors_split]
                new_path = join(args.library, year, ", ".join(author_surnames))
                logging.info("New Path: {}".format(new_path))
                # create directory if necessary
                # copy file
                full_new_path = join(new_path, split(current_path)[1])
                logging.info("Copying file")
                logging.info("From: {}".format(current_path))
                logging.info("To: {}".format(full_new_path))
                response = input("Enter to confirm: ")
                if response == "":
                    logging.info("Proceeding")
                    if not exists(new_path):
                        mkdir(new_path)
                    if exists(full_new_path):
                        raise Exception("File already exists")
                    copyfile(x, full_new_path)
                    file_set.remove(x)
                    file_set.add(full_new_path)
                    record['file'] = ";".join(file_set)
        except Exception as e:
            logging.info("Issue copying file for: {}".format(x))
            logging.info(e)
            record['error'] = 'file_copy'

    # regularize keywords
    try:
        keywords = set()
        if 'tags' not in record:
            if 'keywords' in record:
                keywords.update(
                    [x.strip() for x in record['keywords'].split(',')])
                del record['keywords']
            if 'mendeley-tags' in record:
                keywords.update(
                    [x.strip() for x in record['mendeley-tags'].split(',')])
                del record['mendeley-tags']
            record['tags'] = ",".join(keywords)
    except Exception:
        # `Error` is not a builtin; catching Exception matches the other
        # best-effort handlers in this function.
        logging.warning("Tag Error: {}".format(record['ID']))
        record['error'] = 'tag'

    # record = c.type(record)
    # record = c.author(record)
    # record = c.editor(record)
    # record = c.journal(record)
    # record = c.keyword(record)
    # record = c.link(record)
    # record = c.doi(record)
    # record['p_authors'] = []
    # if 'author' in record:
    #     record['p_authors'] = [c.splitname(x, False) for x in record['author']]
    return record
def test_splitname_basic(self):
    """Basic tests of customization.splitname()
    """
    # Empty and whitespace-only names produce an empty dict.
    blank_cases = (
        ("", "Invalid output for empty name"),
        (" ", "Invalid output for space-only name"),
        (" \t~~", "Invalid output for whitespace name"),
    )
    for name, note in blank_cases:
        self.assertEqual(splitname(name), {}, msg=note)

    # Strict mode must reject every malformed name.
    malformed_names = (
        # Trailing comma (4 cases).
        "BB,",
        "BB, ",
        "BB, ~\t",
        ", ~\t",
        # Too many sections.
        "AA, BB, CC, DD",
        # Unterminated opening brace (x3).
        "AA {BB CC",
        "AA {{{BB CC",
        "AA {{{BB} CC}",
        # Unmatched closing brace (x3).
        "AA BB CC}",
        "AA BB CC}}}",
        "{AA {BB CC}}}",
    )
    for name in malformed_names:
        with self.assertRaises(InvalidName):
            splitname(name, strict_mode=True)

    # With strict mode off the same kinds of input are repaired instead.
    trailing_comma = "Invalid output for trailing comma with strict mode off"
    too_many = "Invalid output for too many sections with strict mode off"
    open_brace = (
        "Invalid output for unterminated opening brace with strict mode off")
    close_brace = (
        "Invalid output for unmatched closing brace with strict mode off")
    only_bb = {'first': [], 'von': [], 'last': ["BB"], 'jr': []}

    lenient_cases = (
        ("BB,", only_bb, trailing_comma),
        ("BB, ", only_bb, trailing_comma),
        ("BB, ~\t ", only_bb, trailing_comma),
        (", ~\t", {}, trailing_comma),
        ("AA, BB, CC, DD",
         {'first': ["CC", "DD"], 'von': [], 'last': ["AA"], 'jr': ["BB"]},
         too_many),
        ("AA {BB CC",
         {'first': ["AA"], 'von': [], 'last': ["{BB CC}"], 'jr': []},
         open_brace),
        ("AA {{{BB CC",
         {'first': ["AA"], 'von': [], 'last': ["{{{BB CC}}}"], 'jr': []},
         open_brace),
        ("AA {{{BB} CC}",
         {'first': ["AA"], 'von': [], 'last': ["{{{BB} CC}}"], 'jr': []},
         open_brace),
        ("AA BB CC}",
         {'first': ["AA", "BB"], 'von': [], 'last': ["{CC}"], 'jr': []},
         close_brace),
        ("AA BB CC}}}",
         {'first': ["AA", "BB"], 'von': [], 'last': ["{{{CC}}}"], 'jr': []},
         close_brace),
        ("{AA {BB CC}}}",
         {'first': [], 'von': [], 'last': ["{{AA {BB CC}}}"], 'jr': []},
         close_brace),
    )
    for name, expected, note in lenient_cases:
        result = splitname(name, strict_mode=False)
        self.assertEqual(result, expected, msg=note)

    # Commas nested inside braces do not start a new section.
    braced = splitname("CC, dd, {AA, BB}")
    self.assertEqual(
        braced,
        {'first': ["{AA, BB}"], 'von': [], 'last': ["CC"], 'jr': ["dd"]},
        msg="Invalid output for braced commas")
def test_splitname_basic(self):
    """Basic tests of customization.splitname()
    """
    def name_parts(first=(), von=(), last=(), jr=()):
        # Build the four-key dictionary the assertions compare against.
        return {'first': list(first), 'von': list(von),
                'last': list(last), 'jr': list(jr)}

    def expect(expected, msg, *args, **kwargs):
        # Split with the given arguments and compare with *expected*.
        self.assertEqual(splitname(*args, **kwargs), expected, msg=msg)

    def expect_invalid(name):
        # Strict mode has to raise InvalidName for *name*.
        with self.assertRaises(InvalidName):
            splitname(name, strict_mode=True)

    # Empty input.
    expect({}, "Invalid output for empty name", "")

    # Whitespace-only names.
    expect({}, "Invalid output for space-only name", " ")
    expect({}, "Invalid output for whitespace name", " \t~~")

    # Test strict mode.
    # Trailing comma (4 cases).
    expect_invalid("BB,")
    expect_invalid("BB, ")
    expect_invalid("BB, ~\t")
    expect_invalid(", ~\t")
    # Too many sections.
    expect_invalid("AA, BB, CC, DD")
    # Unterminated opening brace (x3).
    expect_invalid("AA {BB CC")
    expect_invalid("AA {{{BB CC")
    expect_invalid("AA {{{BB} CC}")
    # Unmatched closing brace (x3).
    expect_invalid("AA BB CC}")
    expect_invalid("AA BB CC}}}")
    expect_invalid("{AA {BB CC}}}")

    # Test strict mode off for trailing comma.
    comma_msg = "Invalid output for trailing comma with strict mode off"
    expect(name_parts(last=["BB"]), comma_msg, "BB,", strict_mode=False)
    expect(name_parts(last=["BB"]), comma_msg, "BB, ", strict_mode=False)
    expect(name_parts(last=["BB"]), comma_msg, "BB, ~\t ", strict_mode=False)
    expect({}, comma_msg, ", ~\t", strict_mode=False)

    # Test strict mode off for too many sections.
    expect(name_parts(first=["CC", "DD"], last=["AA"], jr=["BB"]),
           "Invalid output for too many sections with strict mode off",
           "AA, BB, CC, DD", strict_mode=False)

    # Test strict mode off for an unterminated opening brace.
    opening_msg = (
        "Invalid output for unterminated opening brace with strict mode off")
    expect(name_parts(first=["AA"], last=["{BB CC}"]), opening_msg,
           "AA {BB CC", strict_mode=False)
    expect(name_parts(first=["AA"], last=["{{{BB CC}}}"]), opening_msg,
           "AA {{{BB CC", strict_mode=False)
    expect(name_parts(first=["AA"], last=["{{{BB} CC}}"]), opening_msg,
           "AA {{{BB} CC}", strict_mode=False)

    # Test strict mode off for an unmatched closing brace.
    closing_msg = (
        "Invalid output for unmatched closing brace with strict mode off")
    expect(name_parts(first=["AA", "BB"], last=["{CC}"]), closing_msg,
           "AA BB CC}", strict_mode=False)
    expect(name_parts(first=["AA", "BB"], last=["{{{CC}}}"]), closing_msg,
           "AA BB CC}}}", strict_mode=False)
    expect(name_parts(last=["{{AA {BB CC}}}"]), closing_msg,
           "{AA {BB CC}}}", strict_mode=False)

    # Test it handles commas at higher brace levels.
    expect(name_parts(first=["{AA, BB}"], last=["CC"], jr=["dd"]),
           "Invalid output for braced commas",
           "CC, dd, {AA, BB}")
def get_doi(entry, config):
    """Find or validate the DOI of *entry* through Crossref.

    Three stages, in order:

    1. no DOI but a URL containing "doi": accept the DOI taken from the URL
       if Crossref knows it and its record is similar to the entry;
    2. still no DOI: query Crossref by first-author surname and title and
       accept the first similar hit among at most ten results;
    3. otherwise: re-check the DOI the entry has; drop it (and a URL that
       contains "doi") when the Crossref record is not similar.

    :param entry: the bibliography entry to update.
    :param config: provides ``get_max_levenshtein_distance()`` and
        ``get_update_URL()``.
    :return: ``(entry, has_doi)``.
    """
    has_doi = bib_parser.has_doi(entry)
    etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                          constants.URL, constants.EMAIL)
    max_distance = config.get_max_levenshtein_distance()
    update_url = config.get_update_URL()
    works = Works(etiquette=etiquette)

    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)
            if is_crossref_work(doi) and crossref_is_similar(
                    works.doi(doi), entry, max_distance):
                entry = set_doi(entry, doi, update_url)
                has_doi = True

    if not has_doi:
        # we try to find the doi for the title
        title = cleaner.clean_braces(bib_parser.get_title(entry))
        author = bib_parser.get_author(entry)
        first_author = splitname(author[0], strict_mode=False)
        surname = first_author["last"][0]

        query = works.query(author=surname, bibliographic=title)
        ranked = query.sort("score").order("desc")
        ranked = ranked.select(["title", "DOI"])

        inspected = 0
        limit = min(ranked.count(), 10)
        candidates = iter(ranked)
        while inspected < limit and not has_doi:
            candidate = next(candidates)
            if crossref_is_similar(candidate, entry, max_distance):
                entry = set_doi(entry, cr_parser.get_doi(candidate),
                                update_url)
                has_doi = True
            inspected += 1
    else:
        # We check to see if the doi is correct
        doi = cleaner.clean_doi(bib_parser.get_doi(entry))
        if not is_crossref_work(doi):
            entry = set_doi(entry, doi, update_url)
        elif crossref_is_similar(works.doi(doi), entry, max_distance):
            entry = set_doi(entry, doi, update_url)
        else:
            entry.pop("doi", None)
            if "doi" in bib_parser.get_url(entry):
                entry.pop("url", None)
            has_doi = False

    return entry, has_doi
def format_authors(entry, abbreviate_first=True, et_al_at=1000):
    """
    Format the authors of *entry* as a readable one-liner.

    this is the way i like it, tweak as needed.

    :param entry: record dict with an ``author`` field (and ``ID``).
    :param abbreviate_first: reduce given names to initials ("F.P.") instead
        of spelling them out.
    :param et_al_at: with more authors than this, only the first one is
        shown, followed by "et al.".
    :return: the joined names after ``cleanup``; empty input gives
        ``cleanup("")``.
    """
    # Split author field into a list of "Name, Surname". This seems to work
    # in place, that's why we copy first.
    r = entry.copy()
    btxc.author(r)
    names = r["author"]

    authors = []
    for name in names:
        # {'first': ['F.', 'Paul'], 'last': ['Spitzner'], 'von': [], 'jr': []}
        parts = btxc.splitname(name)
        if not parts:
            # blank name: nothing to format
            continue

        if not abbreviate_first:
            first = " ".join(parts["first"])
        else:
            initials = []
            for f in parts["first"]:
                if not f:
                    continue
                # Spelled-out names (3+ chars), bare one-letter initials and
                # "X." style initials all reduce to "X."; previously a bare
                # one-letter initial raised IndexError on f[1].
                if len(f) != 2 or f[1] in ".:;":
                    initials.append(f[0] + ".")
                else:
                    print(
                        f"Adapt the `format_authors` script to your needs for entry {r['ID']}"
                    )
            first = "".join(initials)

        last = " ".join(parts["last"])
        von = " ".join(parts["von"])
        jr = " ".join(parts["jr"])

        # stitch the name together and fix capitalization
        temp = first.title()
        if len(von) > 0:
            temp += " " + von.lower()
        temp += " " + last  # do not title case this, breaks e.g. "de Heuvel"
        if len(jr) > 0:
            temp += " " + jr.lower()
        authors.append(temp)

    # now we have a list of authors nicely formatted, make this a readable
    # one-liner for the website
    if not authors:
        # previously authors[0] raised IndexError here
        res = ""
    elif len(authors) > et_al_at:
        res = authors[0] + " et al."
    elif len(authors) == 1:
        res = authors[0]
    else:
        res = ", ".join(authors[:-1]) + " and " + authors[-1]

    # cleanup bibtex brackets
    res = cleanup(res)
    # res = res.replace("{", "")
    # res = res.replace("}", "")
    return res
def safe_splitname(s):
    """Split *s* with ``c.splitname`` after trimming it.

    Surrounding whitespace is stripped and, if the result ends with a comma,
    that single comma is removed before splitting.
    """
    trimmed = s.strip()
    return c.splitname(trimmed[:-1] if trimmed.endswith(",") else trimmed)
def sort(self):
    """Replace ``self._data`` with an OrderedDict ordered by entry value."""
    ordered_pairs = sorted(self._data.items(), key=lambda pair: pair[1])
    self._data = OrderedDict(ordered_pairs)


def _handle_duplicates(self):
    """Remove repeated entries and number the pseudo-duplicates.

    After sorting, an entry equal to the one directly before it is dropped.
    Then every id list in ``self.unique_authors_years`` with more than one
    id gets ``letter_number`` 1, 2, ... in list order; a single id gets
    ``None``.
    """
    self.sort()

    # delete duplicates
    redundant_keys = []
    previous = object()  # sentinel for the first comparison
    for key, entry in self._data.items():
        if entry == previous:
            redundant_keys.append(key)
        previous = entry
    for key in redundant_keys:
        self._data.pop(key, None)

    # set numbers/letters for pseudo-duplicates
    for _, ids in self.unique_authors_years.items():
        if len(ids) > 1:
            for position, id_ in enumerate(ids, start=1):
                self._data[id_]['letter_number'] = position
        else:
            self._data[ids[0]]['letter_number'] = None


if __name__ == '__main__':
    n = "Orti, E. and Bredas, J. L. and Clarisse, C.".split(' and ')
    print(bib_custom.splitname("von Wright, Georg Henrik"))