def main(experiment, collaboration): authors = {} affiliations = [] affiliation_count = 1 search = "693__e:" + experiment x = perform_request_search(p = search, cc = 'HepNames') for r in x: foaf_name = get_fieldvalues(r, '100__q') cal_authorNameNative = get_fieldvalues(r, '400__a') name = get_fieldvalues(r, '100__a')[0] foaf_givenName = re.sub(r'.*\, ', '', name) foaf_familyName = re.sub(r'\,.*', '', name) author_id = find_inspire_id_from_record(r) orcid = get_hepnames_anyid_from_recid(r, 'ORCID') if VERBOSE: print r affiliation = get_hepnames_aff_from_recid(r, 'Current') if not affiliation: print 'No aff - find recid', r d = {} d['foaf_givenName'] = foaf_givenName d['foaf_familyName'] = foaf_familyName d['affiliation'] = affiliation d['author_id'] = author_id authors[name.lower()] = d affiliations.append(affiliation) affiliations = affiliations_process(affiliations) for key in authors: affiliation = authors[key]['affiliation'] affiliation_number = affiliations.index(affiliation) + 1 authors[key]['affiliation_id'] = affiliation_number print xml_frontmatter(experiment, collaboration) print xml_affiliations(affiliations) print xml_authors(authors)
def convert_search_to_inspire_id(search):
    """Return [inspire_id, orcid] for the HEPNames record found by search.

    Both entries are None when get_hepnames_recid_from_search() finds no
    record for the given search string.
    """

    recid = get_hepnames_recid_from_search(search)
    if not recid:
        return [None, None]
    return [find_inspire_id_from_record(recid),
            get_hepnames_anyid_from_recid(recid, 'ORCID')]
def main(recids): """ Gets name and email from each HEPNames record. """ if VERBOSE: print recids icount = 1 for recid in recids: if recid in BAD_RECIDS: break recid_str = str(recid) recid_int = int(recid) if re.search(r'INSPIRE-', recid_str): search = '035__a:' + recid_str result = perform_request_search(p=search, cc='HepNames') recid = result[0] recid_str = str(recid) recid_int = int(recid) if get_hepnames_anyid_from_recid(recid_int, 'ORCID'): print recid_str, 'already has an ORCID\n' icount += 1 continue try: contact_email = get_fieldvalues(recid_int, '371__m')[0] except: contact_email = '*****@*****.**' try: contact_name = get_fieldvalues(recid_int, '100__a')[0] if "," in contact_name: contact_name = " ".join(contact_name.split(", ")[::-1]) except: contact_name = 'Sir or Madam' #contact_email = '*****@*****.**' #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" print icount, '/', len(recids) print 'recid = ', recid_str print 'email = ', contact_email print 'name = ', contact_name print ' ' try: send_jobs_mail(recid_str, contact_email, contact_name) time.sleep(1) except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) print 'PROBLEM sending mail to:' print recid, contact_email, contact_name, '\n' icount += 1
def process_line(author, aff, experiment_id):
    """Build one tab-separated report line for an author-file entry.

    The author name is normalised with process_author_name() and
    author_first_last() and searched in HEPNames together with the
    module-level EXPERIMENT. When a record is found the line also carries,
    after a '||' column, the record's 100__a name, 'current' affiliation,
    INSPIRE ID and ORCID; otherwise it only repeats the input values.
    """

    author = author_first_last(process_author_name(author))
    recid = get_hepnames_recid_from_search(
        'find a ' + author + ' and exp ' + EXPERIMENT)
    if not recid:
        return "{0}\t{1}\t{2}\n".format(author, aff, experiment_id)

    hepnames_aff = get_hepnames_aff_from_recid(recid, 'current')
    hepnames_name = get_fieldvalues(recid, '100__a')[0]
    hepnames_ids = [get_hepnames_anyid_from_recid(recid, id_type)
                    for id_type in ('INSPIRE', 'ORCID')]
    columns = [author, aff, experiment_id, hepnames_name, hepnames_aff]
    columns.extend(hepnames_ids)
    return "{0}\t{1}\t{2}\t||\t{3}\t{4}\t{5}\t{6}\n".format(*columns)
def create_report(filename='', extid='ORCID'):
    """
    Write a report on author profiles that share an external identifier.

    filename -- path of the report file, opened for writing; when empty a
                message is printed and nothing is written
    extid -- identifier type handed to find_dup_extid()

    For each (profile id, identifier) pair from find_dup_extid() the
    report lists the profile URL, the profile's aidPERSONIDDATA rows, the
    ORCID / INSPIRE ID / name of the linked HEPNames record and whatever
    show_papers() returns for the profile.
    """

    if not filename:
        print('Must provide a filename for the report')
        return

    sql_head = "select tag, data from aidPERSONIDDATA where"
    sql_tail = " personid={0} and tag <> 'arxiv_papers'"
    previous_eid = ''
    with open(filename, 'w') as report:
        for count, (pid, eid) in enumerate(find_dup_extid(extid=extid)):
            # Separator whenever the identifier changes, except before the
            # first two profiles.
            if count > 1 and previous_eid != eid:
                report.write('-'*30 + "\n\n")
            previous_eid = eid
            report.write("https://inspirehep.net/author/profile/%s\n\n" % pid)
            author = orcid = inspire = None
            for tag, data in run_sql(sql_head + sql_tail.format(pid)):
                if tag == 'canonical_name':
                    # The canonical name leads to the HEPNames record.
                    recid = get_recid_from_id(data)
                    if recid:
                        orcid = get_hepnames_anyid_from_recid(recid, 'ORCID')
                        inspire = get_hepnames_anyid_from_recid(recid,
                                                                'INSPIRE')
                        author = get_fieldvalues(recid, '100__a')[0]
                row = "{0}{1:18s}{2:22s}\n".format(' '*4, tag, data)
                report.write(row)
            summary = "{0}{1:20s}{2:20s}{3:20}\n".format(' HEPNames: ', orcid,
                                                        inspire, author)
            report.write(summary)
            hep_records = show_papers(pid, eid, orcid, inspire)
            if hep_records:
                report.write("\n HEP records with other profile's IDs\n")
                report.write(hep_records)
            report.write("\n")
def find_authors():
    """Return the 'BAI' identifiers of the HepNames records matching SEARCH.

    The module-level SEARCH string is printed and run against the HepNames
    collection; records for which get_hepnames_anyid_from_recid() yields
    no 'BAI' value are left out.
    """

    print(SEARCH)
    recids = perform_request_search(p=SEARCH, cc='HepNames')
    candidates = (get_hepnames_anyid_from_recid(recid, 'BAI')
                  for recid in recids)
    return [bai for bai in candidates if bai]
def convert_search_to_inspire_id(search):
    """Map a HEPNames search onto the [INSPIRE ID, ORCID] of its record.

    The list holds [None, None] when the search finds no record.
    """

    ids = [None, None]
    recid = get_hepnames_recid_from_search(search)
    if recid:
        ids[0] = find_inspire_id_from_record(recid)
        ids[1] = get_hepnames_anyid_from_recid(recid, 'ORCID')
    return ids
def create_report(filename='', extid='ORCID'):
    """
    Report the author profiles that carry a duplicated identifier.

    filename -- report file to write; without it a message is printed and
                the function returns
    extid -- identifier type passed on to find_dup_extid()

    Every (profile id, identifier) pair returned by find_dup_extid() gets
    a section with the profile URL, its aidPERSONIDDATA rows, the IDs and
    name found on the HEPNames record and the output of show_papers().
    """

    if not filename:
        print('Must provide a filename for the report')
        return

    last_eid = ''
    profiles_done = 0
    with open(filename, 'w') as out:
        for pid, eid in find_dup_extid(extid=extid):
            identifier_changed = last_eid != eid
            if identifier_changed and profiles_done > 1:
                out.write('-'*30 + "\n\n")
            last_eid = eid
            out.write("https://inspirehep.net/author/profile/%s\n\n" % pid)

            author = orcid = inspire = None
            rows = run_sql(
                "select tag, data from aidPERSONIDDATA where" +
                " personid={0} and tag <> 'arxiv_papers'".
                format(pid))
            for tag, data in rows:
                # Only the canonical name points at a HEPNames record.
                recid = None
                if tag == 'canonical_name':
                    recid = get_recid_from_id(data)
                if recid:
                    orcid = get_hepnames_anyid_from_recid(recid, 'ORCID')
                    inspire = get_hepnames_anyid_from_recid(recid, 'INSPIRE')
                    author = get_fieldvalues(recid, '100__a')[0]
                out.write("{0}{1:18s}{2:22s}\n".format(' '*4, tag, data))
            out.write("{0}{1:20s}{2:20s}{3:20}\n".format(
                ' HEPNames: ', orcid, inspire, author))

            hep_records = show_papers(pid, eid, orcid, inspire)
            if hep_records:
                out.write('\n HEP records with wrong IDs\n')
                out.write(hep_records)
            profiles_done += 1
            out.write("\n")
def find_authors():
    """Collect the 'BAI' identifier of every HepNames record found by SEARCH.

    Prints the module-level SEARCH string, runs it against the HepNames
    collection and returns the list of non-empty 'BAI' values, in search
    result order.
    """

    query = SEARCH
    print(query)
    bais = [get_hepnames_anyid_from_recid(recid, 'BAI')
            for recid in perform_request_search(p=query, cc='HepNames')]
    # Drop the records without a BAI.
    return list(filter(None, bais))
def check_ids(letter=None):
    """Go through HEPNames looking for bad IDs.

    For every record in RECIDS_HEPN the values of the fields 035__a,
    035__z and 371__m (plus 100__a when `letter` is given) are collected.
    A value met a second time is reported under "Duplicates"; a value for
    which bad_id_check() is true is reported under "Bad metadata".
    Afterwards bad_orcid_bai() and new_orcids() are run.

    letter -- optional name prefix. When given, 100__a values are checked
              as well, but only for records whose first 100__a value
              starts with this prefix. The other fields are checked for
              every record regardless of the letter.

    Everything is reported with print; nothing is returned.
    """

    # Maps each value seen so far to the field it was first seen in.
    already_seen = {}
    # Entries are search-like strings of the form field:"value".
    duplicates = set()
    bad_id_set = set()
    fields = ['035__a', '035__z', '371__m']
    print 'check_ids: letter =', letter
    if letter:
        fields.append('100__a')
    for recid, field in [(recid, field) for recid in RECIDS_HEPN \
                         for field in fields]:
        # skip is reset for every (record, field) pair, so it only ever
        # suppresses the 100__a values themselves.
        skip = False
        field_values = get_fieldvalues(recid, field)
        if field == '100__a':
            try:
                if not field_values[0].startswith(letter):
                    skip = True
            except IndexError:
                print "No name on record:", recid
        if not skip:
            for field_value in field_values:
                # already_seen is shared by all fields, so the same value
                # in two different fields also counts as a duplicate.
                if field_value in already_seen:
                    duplicates.add(field + ':"' + field_value + '"')
                    continue
                already_seen[field_value] = field
                if bad_id_check(field_value):
                    bad_id_set.add(field + ':"' + field_value + '"')
    print "Duplicates"
    for duplicate in sorted(duplicates):
        if duplicate.startswith('100__a'):
            # Duplicate name: list the IDs of every record carrying it.
            result = perform_request_search(p=duplicate, cc='HepNames')
            for recid in result:
                print '{0:37s} {1:18s} {2:20s}'.\
                      format(duplicate, \
                      find_inspire_id_from_record(recid), \
                      get_hepnames_anyid_from_recid(recid, 'ORCID'))
        elif duplicate.startswith('035__a'):
            print duplicate
            # NOTE(review): str.strip() removes any of the given
            # characters from both ends, not the literal prefix; the
            # quotes around the value are what stop the stripping here.
            search = r'100:{0} or 700:{0}'.\
                     format(duplicate.strip('035__a:'))
            # NOTE(review): the HEP query uses `duplicate`; `search` is
            # only printed below - confirm this is intended.
            result = perform_request_search(p=duplicate, cc='HEP')
            if len(result) > 0:
                print "Duplicate ID in HEP records"
                print " ", search
        else:
            print duplicate
    print "Bad metadata"
    for bad_id in sorted(bad_id_set):
        print bad_id
    print "Bad ORCIDS in BAI"
    bad_orcid_bai()
    print "Finding new ORCIDs in HEP"
    new_orcids(already_seen)
def preprocess_file(read_data):
    """Get file into a form that can be properly processed.

    read_data -- the author-list text as one string

    The text is passed through preprocess_file_braces(), user-defined
    LaTeX commands (\\newcommand, \\renewcommand, \\def) are expanded, and
    a long, order-dependent series of regular-expression substitutions
    rewrites various author/affiliation markup styles, mostly into a
    '$^{label}$' superscript form. Lines that contain an inspirehep.net
    record link are looked up with get_hepnames_anyid_from_recid() and
    get_fieldvalues().

    Returns the resulting text as a list of lines. Unless the affiliation
    counter of the DES/Fermi-LAT/Planck pass was started, the list stops
    before the first line containing 'abstract' (case-insensitive).

    Prints diagnostics and calls sys.exit() when a user command cannot be
    applied as a regular expression.
    """

    read_data = preprocess_file_braces(read_data)
    #Process any user commands in latex.
    # Maps a command name (without backslash) to its definition text.
    command_dict = {}
    for line in read_data.split('\n'):
        match = None
        if re.search('command', line):
            match = re.search(r'\\r?e?newcommand\*?\{\\(\w+)\}\{(.*)\}', line)
        elif re.search(r'\\def\\', line):
            match = re.search(r'\\def\\(\w+)\{(.*)\}', line)
        if match:
            command_value = match.group(2)
            # The value is used below as a re.sub replacement string;
            # presumably the extra backslash keeps a leading '\x' from
            # being read as an escape there.
            if re.search(r'^\\\w', command_value):
                command_value = '\\' + command_value
            command_dict[match.group(1)] = command_value
    for key in command_dict:
        try:
            command_string = re.compile(r'\\%s\b' % key)
            read_data = re.sub(command_string, command_dict[key], read_data)
        except re.error:
            print '!!! Problem with user commands:', key, command_dict[key]
            sys.exit()
    read_data = read_data.replace('{+}', '{WXYZ}')
    # Each loop below walks a snapshot of the lines taken when the loop
    # starts; read_data.replace(line, ...) rewrites every occurrence of
    # that exact line text in the full string.
    for line in read_data.split('\n'):
        #\href{http://inspirehep.net/record/1068305}{J.~Alimena}$^{7}$
        match_obj = re.search(r'/record/(\d+).*$', line)
        if match_obj:
            for id_type in ['ORCID', 'INSPIRE']:
                id_num = get_hepnames_anyid_from_recid(match_obj.group(1),
                                                       id_type)
                if id_num:
                    # Becomes "name [ID]$^{n}$".
                    line_new = re.sub(r'.*\}\{(.*)\}(\$\^.*)',
                                      r'\1 [' + id_num + r']\2', line)
                    read_data = read_data.replace(line, line_new)
                    continue
        match_obj = re.search(r'record/(\d+)', line)
        if match_obj:
            try:
                # Put the record's first 110__u value in place of the
                # \href{...} part.
                inst = get_fieldvalues(match_obj.group(1), '110__u')[0]
                line_new = re.sub(r'\\href{http://inspirehep.net/record/\d+}',
                                  inst + ' %', line)
                print line_new
                read_data = read_data.replace(line, line_new)
            except IndexError:
                pass
        #John Smith (University of Somewhere)
        if re.search(r'^[A-Z].* \(.*\)\s*$', line):
            line_new = re.sub(r'(.*)\s+\((.*)\)',
                              r'\\author{\1}\n\\affiliation{\2}', line)
            read_data = read_data.replace(line, line_new)
        #\AddAuthor{C.~Lindsey}{11}{}{}
        if re.search(r'\\AddAuthor{', line):
            line_new = \
            re.sub(r'\\AddAuthor{(.*)}{([^\}]*)}{([^\}]*)}{([^\}]*)}',
                   r'\1$^{\2,\3,\4}$', line)
            # Tidy the commas left behind by empty arguments.
            line_new = re.sub('[,]+}', '}', line_new)
            line_new = re.sub('{[,]+', '{', line_new)
            line_new = line_new.replace(',,', ',')
            read_data = read_data.replace(line, line_new)
        #\AddInstitute{1a}{Blah blah} \AddExternalInstitute
        line = line.replace('\\AddExternalInstitute', '\\AddInstitute')
        if re.search(r'\\AddInstitute{([^\}]+)}{', line):
            line_new = re.sub(r'\\AddInstitute{([^\}]+)}', r'$^{\1}$ ', line)
            # `line` was already renamed above, so read_data has to be
            # renamed too before the line can be found in it.
            read_data = read_data.replace('\\AddExternalInstitute',
                                          '\\AddInstitute')
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu} \inst{4}
        if re.search(r'\\firstname{', line) and re.search(r'\\inst{', line):
            line_new = re.sub(
                r'\\firstname{(.*)}\s*\\lastname{(.*)}\s*\\inst(\{.*\}).*',
                r'YYYY\2, \1$^\3$', line)
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu}
        elif re.search(r'\\firstname{', line):
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}',
                              r'YYYY\2, \1', line)
            read_data = read_data.replace(line, line_new)
        #I.J.~Arnquist\inst{10}
        if re.search(r'\\inst\{', line):
            line_new = re.sub(r'\\inst({[^\}]+\})', r'$^\1$', line)
            read_data = read_data.replace(line, line_new)
    #Special treatment for BaBar
    for line in read_data.split('\n'):
        #BaBar \affiliation{Fermilab$^{a}$, SLAC$^{b}$}
        if re.search(r'\\affiliation\{.*\$\^\{?[abc]\}?\$', line):
            line_new = re.sub(r'\$\^\{?[abc]\}?\$', ' and ', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\$\^\{?[abc]+\}?\$', line):
            line_new = re.sub(r'[ ]*\$\^\{?[abc]+\}?\$[ ]*', '', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\\altaffiliation', line):
            line_new = re.sub(r'\\altaffiliation.*', '', line)
            read_data = read_data.replace(line, line_new)
        if VERBOSE:
            try:
                print "BABAR LINE =", line_new
            except UnboundLocalError:
                pass
    #Special treatment for DES and Fermi-LAT and Planck
    # 0 until an affiliation section header is met; after that it is the
    # number given to the next affiliation line.
    astro_aff_counter = 0
    for line in read_data.split('\n'):
        #Get rid of newcommand lines now
        if re.search('newcommand', line):
            read_data = read_data.replace(line, '')
        if VERBOSE:
            print "ASTRO LINE =", line
        if re.search(r'\\section\*\{Affiliations\}', line) or \
           re.search(r'\\institute\{\\small', line):
            astro_aff_counter = 1
        # Only the first occurrence is replaced (count=1) in this pass.
        if astro_aff_counter and re.search(r'^\\item', line):
            line_new = \
            re.sub(r'^\\item', r'$^{' + str(astro_aff_counter) + r'}$', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'\\goodbreak[ ]*$', line):
            line_new = \
            re.sub(r'(.*)[ ]*\\goodbreak[ ]*$', r'$^{' + \
                   str(astro_aff_counter) + r'}$ \1', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            if VERBOSE:
                print astro_aff_counter, line
                print line_new
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'.\\and[ ]*$', line):
            line_new = \
            re.sub(r'(.*)[ ]*\\and[ ]*$', r'$^{' + \
                   str(astro_aff_counter) + r'}$ \1', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
    #Special treatment for LIGO and Virgo
    # Author line followed by a '%' comment of the form first.last.
    pattern_au = re.compile(r"^([A-Z])[\~\.]([^-]*)([A-Z])([^A-Z]+)\s*\%\s*"
                            r"([a-z])([a-z]+)\.([a-z])([a-z]+)")
    # \affiliation{...} followed by a '%' comment holding a {number}.
    pattern_af = re.compile(r"\\affiliation\s*\{(.*)\}\s*\%.*(\{\d+\})")
    for line in read_data.split('\n'):
        match = re.match(pattern_au, line)
        if match:
            # Use the comment only when its initials agree with the name.
            if match.group(5).upper() == match.group(1) and \
               match.group(7).upper() == match.group(3):
                line_new = match.group(1) + match.group(6) + ' ' + \
                           match.group(2) + ' ' + match.group(3) + \
                           match.group(4)
                read_data = read_data.replace(line, line_new)
        match = re.match(pattern_af, line)
        if match:
            line_new = "$^" + match.group(2) + "$" + match.group(1)
            read_data = read_data.replace(line, line_new)
    #Remove spaces around braces and commas
    read_data = re.sub(r'[ ]*([\]\}\[\{\,])[ ]*', r'\1', read_data)
    # No re.MULTILINE flag: '^' only anchors at the start of the text.
    read_data = re.sub(r'^[ ]+', '', read_data)
    read_data = re.sub(r'\-+', r'-', read_data)
    # Drop '%' comments up to the end of the line.
    read_data = re.sub(r'%.*\n', '\n', read_data)
    read_data = re.sub(r'}\$,\s*', '}$\n', read_data)
    read_data = re.sub(r'\$\^(\w)\$,\s*', r'$^\1$\n', read_data)
    # A \thanks{} holding something shaped like 0000-0... (an ORCID, by
    # the look of it) becomes an \affiliation{}; other \thanks{} go away.
    read_data = re.sub(r'\\thanks\{[^\}]+(0000-0[\d\-]+[\dX])[^\}]*\}',
                       r'\\affiliation{\1}', read_data)
    read_data = re.sub(r'\}?\\thanks\{[^\}]+\}?', r'', read_data)
    read_data = re.sub(r'\\item\[(\$\^\{?\w+\}?\$)\]', r'\1', read_data)
    read_data = re.sub(r'\\llap\{(\$\S+\$)\}', r'\1 ', read_data)
    # NOTE(review): both replacements emit '$^{X$}' (closing brace after
    # the '$') - confirm this is what the later steps expect.
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}\{',
                       r'$^{\1$} \{', read_data)
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}',
                       r'$^{\1$}\n', read_data)
    read_data = re.sub(r'\\address', r'\\affiliation', read_data)
    read_data = re.sub(r'\\affil\b', r'\\affiliation', read_data)
    read_data = re.sub(r'\\email\{', r'\\affiliation{', read_data)
    # Start every \affiliation and \author on a line of its own.
    read_data = re.sub(r'}\s*\\affiliation', '}\n\\\\affiliation', read_data)
    read_data = re.sub(r'}\s*\\author', '}\n\\\\author', read_data)
    read_data = re.sub(r'[ ]*\\scriptsize[ ]+', '', read_data)
    read_data = re.sub(r'\\and[ ]+', '', read_data)
    read_data = re.sub(r'\$\s*\^', '$^', read_data)
    if VERBOSE:
        print "read_data =", read_data
    # Fold \Aref/\Arefs references into the preceding Irefn{...}.
    read_data = re.sub(r'Irefn{(\w+)}\\Aref{(\w+)}\\Aref{(\w+)}', \
                       r'Irefn{\1,\2,\3}', read_data)
    read_data = re.sub(r'Irefn+\{(.*)\}\\?A?r?e?f?s?\{(.*)\}', \
                       r'Irefn{\1,\2}', read_data)
    read_data = re.sub(r'Arefs?{(\w+)}', r'Irefn{\1}', read_data)
    read_data = re.sub(r'\\Idef{(\w+)}', r'$^{\1}$', read_data)
    read_data = \
        re.sub(r'[ \,]*\\(inst|altaffilmark|Irefn|thanksref)\{([^\}]+)\}', \
               r'$^{\2}$', read_data)
    #\altaffiltext{2}{Fermilab, Batavia}
    read_data = \
        re.sub(r'\\(altaffiltext|thankstext)\{([\w\,\-]+)\}\{(.*)\}', \
               r'$^{\2}$ \3', read_data)
    read_data = \
        re.sub(r'\\item\s*\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'(.*)\s*\\label\{(.*)\}', r'$^{\2}$ \1', \
               read_data)
    #\author[b,c]{M. Zimmermann} \affiliation[b]{Fermilab}
    read_data = \
        re.sub(r'\\author\[([\w\,\-]+)\]\{(.*)\}', r'\2$^{\1}$', read_data)
    read_data = \
        re.sub(r'\\affiliation\[([\w\,\-]+)\]\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    #\author{M. Zimmermann$^{b,c}$} \affiliation{$^{b}$Fermilab} remove \author
    read_data = \
        re.sub(r'\\author\{(.*\$\^\{?[\w\,\-]+\}?\$)\}', r'\1', read_data)
    read_data = \
        re.sub(r'\\affiliation\{(\$\^\{?[\w\,\-]+\}?\$.*)\}', r'\1', read_data)
    read_data = re.sub(r'[\, ]+\}', '}', read_data)
    read_data = re.sub(r'[\, ]+\$\^', '$^', read_data)
    new_read_data = []
    for line in read_data.split('\n'):
        # Stop at the abstract, unless the astro pass found an
        # affiliation section.
        if re.search('abstract', line, re.IGNORECASE) and astro_aff_counter < 1:
            break
        else:
            new_read_data.append(line)
    if VERBOSE:
        print "new_read_data =", new_read_data
    return new_read_data
def main(recids): """ Gets name and email from each HEPNames record. """ if VERBOSE: print recids icount = 1 for recid in recids: recid_str = str(recid) try: recid_int = int(recid) except ValueError: pass if re.search(r'INSPIRE-', recid_str) or re.search(r'@', recid_str): search = '035__a:' + recid_str result = perform_request_search(p=search, cc='HepNames') if re.search(r'@', recid_str): result = [get_hepnames_recid_from_email(recid_str)] try: recid = result[0] except IndexError: print "Problem with:", search continue recid_str = str(recid) recid_int = int(recid) if recid_int in BAD_RECIDS: print 'Bad recid', recid, '\n' continue if get_hepnames_anyid_from_recid(recid_int, 'ORCID'): print recid_str, 'already has an ORCID\n' icount += 1 continue try: contact_email = get_fieldvalues(recid_int, '371__m')[0] except IndexError: try: #The hidden contact address contact_email = get_fieldvalues(recid_int, '595__m')[0] except IndexError: contact_email = '*****@*****.**' try: contact_name = get_fieldvalues(recid_int, '100__a')[0] if "," in contact_name: contact_name = " ".join(contact_name.split(", ")[::-1]) except IndexError: contact_name = 'Sir or Madam' #contact_email = '*****@*****.**' #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" #contact_email = "*****@*****.**" if TEST: contact_email = '*****@*****.**' print icount, '/', len(recids) print 'recid = ', recid_str print 'email = ', contact_email print 'name = ', contact_name print ' ' try: send_jobs_mail(recid_str, contact_email, contact_name) time.sleep(1) except IOError as error: print "I/O error({0}): {1}".format(error.errno, error.strerror) print 'PROBLEM sending mail to:' print recid, contact_email, contact_name, '\n' icount += 1
def preprocess_file(read_data):
    """Get file into a form that can be properly processed.

    read_data -- the author-list text as one string

    The text is passed through preprocess_file_braces(), user-defined
    LaTeX commands (\\newcommand, \\renewcommand, \\def) are expanded, and
    a long, order-dependent series of regular-expression substitutions
    rewrites various author/affiliation markup styles, mostly into a
    '$^{label}$' superscript form. Lines that contain an inspirehep.net
    record link are looked up with get_hepnames_anyid_from_recid() and
    get_fieldvalues().

    Returns the resulting text as a list of lines. Unless the affiliation
    counter of the DES/Fermi-LAT/Planck pass was started, the list stops
    before the first line containing 'abstract' (case-insensitive).

    Prints diagnostics and calls sys.exit() when a user command cannot be
    applied as a regular expression.
    """

    read_data = preprocess_file_braces(read_data)
    #Process any user commands in latex.
    # Maps a command name (without backslash) to its definition text.
    command_dict = {}
    for line in read_data.split('\n'):
        match = None
        if re.search('command', line):
            match = re.search(r'\\r?e?newcommand\*?\{\\(\w+)\}\{(.*)\}', line)
        elif re.search(r'\\def\\', line):
            match = re.search(r'\\def\\(\w+)\{(.*)\}', line)
        if match:
            command_value = match.group(2)
            # The value is used below as a re.sub replacement string;
            # presumably the extra backslash keeps a leading '\x' from
            # being read as an escape there.
            if re.search(r'^\\\w', command_value):
                command_value = '\\' + command_value
            command_dict[match.group(1)] = command_value
    for key in command_dict:
        try:
            command_string = re.compile(r'\\%s\b' % key)
            read_data = re.sub(command_string, command_dict[key], read_data)
        except re.error:
            print '!!! Problem with user commands:', key, command_dict[key]
            sys.exit()
    read_data = read_data.replace('{+}', '{WXYZ}')
    # Each loop below walks a snapshot of the lines taken when the loop
    # starts; read_data.replace(line, ...) rewrites every occurrence of
    # that exact line text in the full string.
    for line in read_data.split('\n'):
        #\href{http://inspirehep.net/record/1068305}{J.~Alimena}$^{7}$
        match_obj = re.search(r'/record/(\d+).*$', line)
        if match_obj:
            for id_type in ['ORCID', 'INSPIRE']:
                id_num = get_hepnames_anyid_from_recid(match_obj.group(1),
                                                       id_type)
                if id_num:
                    # Becomes "name [ID]$^{n}$".
                    line_new = re.sub(r'.*\}\{(.*)\}(\$\^.*)',
                                      r'\1 [' + id_num + r']\2', line)
                    read_data = read_data.replace(line, line_new)
                    continue
        match_obj = re.search(r'record/(\d+)', line)
        if match_obj:
            try:
                # Put the record's first 110__u value in place of the
                # \href{...} part.
                inst = get_fieldvalues(match_obj.group(1), '110__u')[0]
                line_new = re.sub(r'\\href{http://inspirehep.net/record/\d+}',
                                  inst + ' %', line)
                print line_new
                read_data = read_data.replace(line, line_new)
            except IndexError:
                pass
        #John Smith (University of Somewhere)
        if re.search(r'^[A-Z].* \(.*\)\s*$', line):
            line_new = re.sub(r'(.*)\s+\((.*)\)',
                              r'\\author{\1}\n\\affiliation{\2}', line)
            read_data = read_data.replace(line, line_new)
        #\AddAuthor{C.~Lindsey}{11}{}{}
        if re.search(r'\\AddAuthor{', line):
            line_new = \
            re.sub(r'\\AddAuthor{(.*)}{([^\}]*)}{([^\}]*)}{([^\}]*)}',
                   r'\1$^{\2,\3,\4}$', line)
            # Tidy the commas left behind by empty arguments.
            line_new = re.sub('[,]+}', '}', line_new)
            line_new = re.sub('{[,]+', '{', line_new)
            line_new = line_new.replace(',,', ',')
            read_data = read_data.replace(line, line_new)
        #\AddInstitute{1a}{Blah blah} \AddExternalInstitute
        line = line.replace('\\AddExternalInstitute', '\\AddInstitute')
        if re.search(r'\\AddInstitute{([^\}]+)}{', line):
            line_new = re.sub(r'\\AddInstitute{([^\}]+)}', r'$^{\1}$ ', line)
            # `line` was already renamed above, so read_data has to be
            # renamed too before the line can be found in it.
            read_data = read_data.replace('\\AddExternalInstitute',
                                          '\\AddInstitute')
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu} \inst{4}
        if re.search(r'\\firstname{', line) and re.search(r'\\inst{', line):
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}\s*\\inst(\{.*\}).*',
                              r'YYYY\2, \1$^\3$', line)
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu}
        elif re.search(r'\\firstname{', line):
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}',
                              r'YYYY\2, \1', line)
            read_data = read_data.replace(line, line_new)
        #I.J.~Arnquist\inst{10}
        if re.search(r'\\inst\{', line):
            line_new = re.sub(r'\\inst({[^\}]+\})', r'$^\1$', line)
            read_data = read_data.replace(line, line_new)
    #Special treatment for BaBar
    for line in read_data.split('\n'):
        #BaBar \affiliation{Fermilab$^{a}$, SLAC$^{b}$}
        if re.search(r'\\affiliation\{.*\$\^\{?[abc]\}?\$', line):
            line_new = re.sub(r'\$\^\{?[abc]\}?\$', ' and ', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\$\^\{?[abc]+\}?\$', line):
            line_new = re.sub(r'[ ]*\$\^\{?[abc]+\}?\$[ ]*', '', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\\altaffiliation', line):
            line_new = re.sub(r'\\altaffiliation.*', '', line)
            read_data = read_data.replace(line, line_new)
        if VERBOSE:
            try:
                print "BABAR LINE =", line_new
            except UnboundLocalError:
                pass
    #Special treatment for DES and Fermi-LAT and Planck
    # 0 until an affiliation section header is met; after that it is the
    # number given to the next affiliation line.
    astro_aff_counter = 0
    for line in read_data.split('\n'):
        #Get rid of newcommand lines now
        if re.search('newcommand', line):
            read_data = read_data.replace(line, '')
        if VERBOSE:
            print "ASTRO LINE =", line
        if re.search(r'\\section\*\{Affiliations\}', line) or \
           re.search(r'\\institute\{\\small', line):
            astro_aff_counter = 1
        # Only the first occurrence is replaced (count=1) in this pass.
        if astro_aff_counter and re.search(r'^\\item', line):
            line_new = \
            re.sub(r'^\\item', r'$^{' + str(astro_aff_counter) + r'}$', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'\\goodbreak[ ]*$', line):
            line_new = \
            re.sub(r'(.*)[ ]*\\goodbreak[ ]*$', r'$^{' + \
                   str(astro_aff_counter) + r'}$ \1', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            if VERBOSE:
                print astro_aff_counter, line
                print line_new
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'.\\and[ ]*$', line):
            line_new = \
            re.sub(r'(.*)[ ]*\\and[ ]*$', r'$^{' + \
                   str(astro_aff_counter) + r'}$ \1', \
                   line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
    #Special treatment for LIGO and Virgo
    # Author line followed by a '%' comment of the form first.last.
    pattern_au = re.compile(r"^([A-Z])[\~\.]([^-]*)([A-Z])([^A-Z]+)\s*\%\s*"
                            r"([a-z])([a-z]+)\.([a-z])([a-z]+)")
    # \affiliation{...} followed by a '%' comment holding a {number}.
    pattern_af = re.compile(r"\\affiliation\s*\{(.*)\}\s*\%.*(\{\d+\})")
    for line in read_data.split('\n'):
        match = re.match(pattern_au, line)
        if match:
            # Use the comment only when its initials agree with the name.
            if match.group(5).upper() == match.group(1) and \
               match.group(7).upper() == match.group(3):
                line_new = match.group(1) + match.group(6) + ' ' + \
                           match.group(2) + ' ' + match.group(3) + \
                           match.group(4)
                read_data = read_data.replace(line, line_new)
        match = re.match(pattern_af, line)
        if match:
            line_new = "$^" + match.group(2) + "$" + match.group(1)
            read_data = read_data.replace(line, line_new)
    #Remove spaces around braces and commas
    read_data = re.sub(r'[ ]*([\]\}\[\{\,])[ ]*', r'\1', read_data)
    # No re.MULTILINE flag: '^' only anchors at the start of the text.
    read_data = re.sub(r'^[ ]+', '', read_data)
    read_data = re.sub(r'\-+', r'-', read_data)
    # Drop '%' comments up to the end of the line.
    read_data = re.sub(r'%.*\n', '\n', read_data)
    read_data = re.sub(r'}\$,\s*', '}$\n', read_data)
    read_data = re.sub(r'\$\^(\w)\$,\s*', r'$^\1$\n', read_data)
    # A \thanks{} holding something shaped like 0000-0... (an ORCID, by
    # the look of it) becomes an \affiliation{}; other \thanks{} go away.
    read_data = re.sub(r'\\thanks\{[^\}]+(0000-0[\d\-]+[\dX])[^\}]*\}',
                       r'\\affiliation{\1}', read_data)
    read_data = re.sub(r'\}?\\thanks\{[^\}]+\}?', r'', read_data)
    read_data = re.sub(r'\\item\[(\$\^\{?\w+\}?\$)\]', r'\1', read_data)
    read_data = re.sub(r'\\llap\{(\$\S+\$)\}', r'\1 ', read_data)
    # NOTE(review): both replacements emit '$^{X$}' (closing brace after
    # the '$') - confirm this is what the later steps expect.
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}\{',
                       r'$^{\1$} \{', read_data)
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}',
                       r'$^{\1$}\n', read_data)
    read_data = re.sub(r'\\address', r'\\affiliation', read_data)
    read_data = re.sub(r'\\affil\b', r'\\affiliation', read_data)
    read_data = re.sub(r'\\email\{', r'\\affiliation{', read_data)
    # Start every \affiliation and \author on a line of its own.
    read_data = re.sub(r'}\s*\\affiliation', '}\n\\\\affiliation', read_data)
    read_data = re.sub(r'}\s*\\author', '}\n\\\\author', read_data)
    read_data = re.sub(r'[ ]*\\scriptsize[ ]+', '', read_data)
    read_data = re.sub(r'\\and[ ]+', '', read_data)
    read_data = re.sub(r'\$\s*\^', '$^', read_data)
    if VERBOSE:
        print "read_data =", read_data
    # Fold \Aref/\Arefs references into the preceding Irefn{...}.
    read_data = re.sub(r'Irefn{(\w+)}\\Aref{(\w+)}\\Aref{(\w+)}', \
                       r'Irefn{\1,\2,\3}', read_data)
    read_data = re.sub(r'Irefn+\{(.*)\}\\?A?r?e?f?s?\{(.*)\}', \
                       r'Irefn{\1,\2}', read_data)
    read_data = re.sub(r'Arefs?{(\w+)}', r'Irefn{\1}', read_data)
    read_data = re.sub(r'\\Idef{(\w+)}', r'$^{\1}$', read_data)
    read_data = \
        re.sub(r'[ \,]*\\(inst|altaffilmark|Irefn|thanksref)\{([^\}]+)\}', \
               r'$^{\2}$', read_data)
    #\altaffiltext{2}{Fermilab, Batavia}
    read_data = \
        re.sub(r'\\(altaffiltext|thankstext)\{([\w\,\-]+)\}\{(.*)\}', \
               r'$^{\2}$ \3', read_data)
    read_data = \
        re.sub(r'\\item\s*\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'(.*)\s*\\label\{(.*)\}', r'$^{\2}$ \1', \
               read_data)
    #\author[b,c]{M. Zimmermann} \affiliation[b]{Fermilab}
    read_data = \
        re.sub(r'\\author\[([\w\,\-]+)\]\{(.*)\}', r'\2$^{\1}$', read_data)
    read_data = \
        re.sub(r'\\affiliation\[([\w\,\-]+)\]\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    #\author{M. Zimmermann$^{b,c}$} \affiliation{$^{b}$Fermilab} remove \author
    read_data = \
        re.sub(r'\\author\{(.*\$\^\{?[\w\,\-]+\}?\$)\}', r'\1', read_data)
    read_data = \
        re.sub(r'\\affiliation\{(\$\^\{?[\w\,\-]+\}?\$.*)\}', r'\1', read_data)
    read_data = re.sub(r'[\, ]+\}', '}', read_data)
    read_data = re.sub(r'[\, ]+\$\^', '$^', read_data)
    new_read_data = []
    for line in read_data.split('\n'):
        # Stop at the abstract, unless the astro pass found an
        # affiliation section.
        if re.search('abstract', line, re.IGNORECASE) and astro_aff_counter < 1:
            break
        else:
            new_read_data.append(line)
    if VERBOSE:
        print "new_read_data =", new_read_data
    return new_read_data