예제 #1
0
def main(experiment, collaboration):
    authors = {}
    affiliations = []
    affiliation_count = 1
    search = "693__e:" + experiment
    x = perform_request_search(p = search, cc = 'HepNames')
    for r in x:
        foaf_name = get_fieldvalues(r, '100__q')
        cal_authorNameNative = get_fieldvalues(r, '400__a')
        name = get_fieldvalues(r, '100__a')[0]
        foaf_givenName  = re.sub(r'.*\, ', '', name)
        foaf_familyName =  re.sub(r'\,.*', '', name)
        author_id = find_inspire_id_from_record(r)
        orcid      = get_hepnames_anyid_from_recid(r, 'ORCID')
        if VERBOSE:
            print r
        affiliation = get_hepnames_aff_from_recid(r, 'Current')
        if not affiliation: print 'No aff - find recid', r
        d = {}
        d['foaf_givenName']  = foaf_givenName
        d['foaf_familyName'] = foaf_familyName
        d['affiliation']     = affiliation
        d['author_id']       = author_id
        authors[name.lower()] = d
        affiliations.append(affiliation)
    affiliations = affiliations_process(affiliations)
    for key in authors:
        affiliation = authors[key]['affiliation']
        affiliation_number = affiliations.index(affiliation) + 1
        authors[key]['affiliation_id'] = affiliation_number
    print xml_frontmatter(experiment, collaboration)
    print xml_affiliations(affiliations)
    print xml_authors(authors)
def convert_search_to_inspire_id(search):
    """Return [inspire_id, orcid] for the HEPNames record found by search.

    Both entries are None when get_hepnames_recid_from_search finds no
    record for ``search``.
    """
    hepnames_recid = get_hepnames_recid_from_search(search)
    if not hepnames_recid:
        return [None, None]
    return [find_inspire_id_from_record(hepnames_recid),
            get_hepnames_anyid_from_recid(hepnames_recid, 'ORCID')]
def main(recids):
    """
    Gets name and email from each HEPNames record.
    """

    if VERBOSE:
        print recids

    icount = 1
    for recid in recids:
        if recid in BAD_RECIDS:
            break
        recid_str = str(recid)
        recid_int = int(recid)
        if re.search(r'INSPIRE-', recid_str):
            search = '035__a:' + recid_str
            result = perform_request_search(p=search, cc='HepNames')
            recid = result[0]
            recid_str = str(recid)
            recid_int = int(recid)
        if get_hepnames_anyid_from_recid(recid_int, 'ORCID'):
            print recid_str, 'already has an ORCID\n'
            icount += 1
            continue
        try:
            contact_email = get_fieldvalues(recid_int, '371__m')[0]
        except:
            contact_email = '*****@*****.**'
        try:
            contact_name = get_fieldvalues(recid_int, '100__a')[0]
            if "," in contact_name:
                contact_name = " ".join(contact_name.split(", ")[::-1])
        except:
            contact_name = 'Sir or Madam'
        #contact_email = '*****@*****.**'
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"

        print icount, '/', len(recids)
        print 'recid = ', recid_str
        print 'email = ', contact_email
        print 'name  = ', contact_name
        print ' '
        try:
            send_jobs_mail(recid_str, contact_email, contact_name)
            time.sleep(1)
        except IOError as e:
            print "I/O error({0}): {1}".format(e.errno, e.strerror)
            print 'PROBLEM sending mail to:'
            print recid, contact_email, contact_name, '\n'
        icount += 1
예제 #4
0
def process_line(author, aff, experiment_id):
    """Build the tab-separated report line for one author-file entry.

    The author name is normalised with process_author_name and
    author_first_last and looked up in HEPNames together with EXPERIMENT.
    When a record is found, its name, 'current' affiliation, INSPIRE ID
    and ORCID are appended after a "||" column; otherwise only the three
    input columns are returned.
    """

    normalized = author_first_last(process_author_name(author))
    query = 'find a ' + normalized + ' and exp ' + EXPERIMENT
    hepnames_recid = get_hepnames_recid_from_search(query)
    if not hepnames_recid:
        return "{0}\t{1}\t{2}\n".format(normalized, aff, experiment_id)

    hepnames_aff = get_hepnames_aff_from_recid(hepnames_recid, 'current')
    hepnames_author = get_fieldvalues(hepnames_recid, '100__a')[0]
    hepnames_inspire_id = get_hepnames_anyid_from_recid(hepnames_recid,
                                                        'INSPIRE')
    hepnames_orcid = get_hepnames_anyid_from_recid(hepnames_recid, 'ORCID')
    columns = (normalized, aff, experiment_id, hepnames_author,
               hepnames_aff, hepnames_inspire_id, hepnames_orcid)
    return "{0}\t{1}\t{2}\t||\t{3}\t{4}\t{5}\t{6}\n".format(*columns)
예제 #5
0
def create_report(filename='', extid='ORCID'):
    """
    Write list of profile info with identifier duplicates

    For every (profile ID, identifier) pair returned by find_dup_extid
    the report contains the profile URL, the profile's aidPERSONIDDATA
    rows (except 'arxiv_papers'), the ORCID / INSPIRE ID / name of the
    HEPNames record found through the profile's canonical name, and
    whatever show_papers returns for the profile.  A dashed line is
    written whenever the identifier differs from the previous row's.

    filename -- path of the report file to write; when empty, a message
                is printed and nothing is written
    extid    -- identifier type handed to find_dup_extid

    Returns None.
    """
    if not filename:
        print('Must provide a filename for the report')
        return
    eidold = ''
    with open(filename, 'w') as output:
        for i, (pid, eid) in enumerate(find_dup_extid(extid=extid)):
            if eidold != eid:
                # Rule off the previous group, except before the first one.
                # The identifier is remembered from the very first row so
                # that a group of any size is never split by a separator.
                if i > 0:
                    output.write('-'*30 + "\n\n")
                eidold = eid
            output.write("https://inspirehep.net/author/profile/%s\n\n" % pid)
            author = orcid = inspire = None
            query = ("select tag, data from aidPERSONIDDATA where"
                     " personid={0} and tag <> 'arxiv_papers'".format(pid))
            for tag, data in run_sql(query):
                if tag == 'canonical_name':
                    # Only the canonical name leads to the HEPNames record.
                    recid = get_recid_from_id(data)
                    if recid:
                        orcid = get_hepnames_anyid_from_recid(recid, 'ORCID')
                        inspire = get_hepnames_anyid_from_recid(recid,
                                                                'INSPIRE')
                        author = get_fieldvalues(recid, '100__a')[0]
                output.write("{0}{1:18s}{2:22s}\n".format(' '*4, tag, data))
            output.write("{0}{1:20s}{2:20s}{3:20}\n".format(
                '    HEPNames: ', orcid, inspire, author))
            hep_records = show_papers(pid, eid, orcid, inspire)
            if hep_records:
                output.write("\n    HEP records with other profile's IDs\n")
                output.write(hep_records)
            output.write("\n")
def find_authors():
    """Return the 'BAI' identifiers of the HepNames records matching SEARCH.

    The search string is printed first.  Records for which
    get_hepnames_anyid_from_recid returns nothing are left out.
    """
    print(SEARCH)
    recids = perform_request_search(p=SEARCH, cc='HepNames')
    candidate_bais = (get_hepnames_anyid_from_recid(recid, 'BAI')
                      for recid in recids)
    return [bai for bai in candidate_bais if bai]
def convert_search_to_inspire_id(search):
    """Convert a search to an [INSPIRE ID, ORCID] pair.

    Both entries stay None when the search does not lead to a HEPNames
    record.
    """

    identifiers = [None, None]
    hepnames_recid = get_hepnames_recid_from_search(search)
    if hepnames_recid:
        identifiers[0] = find_inspire_id_from_record(hepnames_recid)
        identifiers[1] = get_hepnames_anyid_from_recid(hepnames_recid,
                                                       'ORCID')
    return identifiers
def create_report(filename='', extid='ORCID'):
    """
    Write list of profile info with identifier duplicates

    For every (profile ID, identifier) pair returned by find_dup_extid
    the report contains the profile URL, the profile's aidPERSONIDDATA
    rows (except 'arxiv_papers'), the ORCID / INSPIRE ID / name of the
    HEPNames record found through the profile's canonical name, and
    whatever show_papers returns for the profile.  A dashed line is
    written whenever the identifier differs from the previous row's.

    filename -- path of the report file to write; when empty, a message
                is printed and nothing is written
    extid    -- identifier type handed to find_dup_extid

    Returns None.
    """
    if not filename:
        print('Must provide a filename for the report')
        return
    eidold = ''
    with open(filename, 'w') as output:
        for i, (pid, eid) in enumerate(find_dup_extid(extid=extid)):
            if eidold != eid:
                # Rule off the previous group, except before the first one.
                # The identifier is remembered from the very first row so
                # that a group of any size is never split by a separator.
                if i > 0:
                    output.write('-'*30 + "\n\n")
                eidold = eid
            output.write("https://inspirehep.net/author/profile/%s\n\n" % pid)
            author = orcid = inspire = None
            query = ("select tag, data from aidPERSONIDDATA where"
                     " personid={0} and tag <> 'arxiv_papers'".format(pid))
            for tag, data in run_sql(query):
                if tag == 'canonical_name':
                    # Only the canonical name leads to the HEPNames record.
                    recid = get_recid_from_id(data)
                    if recid:
                        orcid = get_hepnames_anyid_from_recid(recid, 'ORCID')
                        inspire = get_hepnames_anyid_from_recid(recid,
                                                                'INSPIRE')
                        author = get_fieldvalues(recid, '100__a')[0]
                output.write("{0}{1:18s}{2:22s}\n".format(' '*4, tag, data))
            output.write("{0}{1:20s}{2:20s}{3:20}\n".format(
                '    HEPNames: ', orcid, inspire, author))
            hep_records = show_papers(pid, eid, orcid, inspire)
            if hep_records:
                output.write('\n    HEP records with wrong IDs\n')
                output.write(hep_records)
            output.write("\n")
def find_authors():
    """Collect the 'BAI' identifier of every HepNames record found by SEARCH.

    Prints the search string, then returns the list of non-empty
    identifiers in search-result order.
    """
    query = SEARCH
    print(query)
    bais = []
    for hepnames_recid in perform_request_search(p=query, cc='HepNames'):
        author_bai = get_hepnames_anyid_from_recid(hepnames_recid, 'BAI')
        if not author_bai:
            continue
        bais.append(author_bai)
    return bais
예제 #10
0
def check_ids(letter=None):
    """Go through HEPNames looking for bad IDs."""

    already_seen = {}
    duplicates = set()
    bad_id_set = set()
    fields = ['035__a', '035__z', '371__m']
    print 'check_ids: letter =', letter
    if letter:
        fields.append('100__a')

    for recid, field in [(recid, field) for recid in RECIDS_HEPN \
                                        for field in fields]:
        skip = False
        field_values = get_fieldvalues(recid, field)
        if field == '100__a':
            try:
                if not field_values[0].startswith(letter):
                    skip = True
            except IndexError:
                print "No name on record:", recid
        if not skip:
            for field_value in field_values:
                if field_value in already_seen:
                    duplicates.add(field + ':"' + field_value + '"')
                    continue
                already_seen[field_value] = field
                if bad_id_check(field_value):
                    bad_id_set.add(field + ':"' + field_value + '"')

    print "Duplicates"
    for duplicate in sorted(duplicates):
        if duplicate.startswith('100__a'):
            result = perform_request_search(p=duplicate, cc='HepNames')
            for recid in result:
                #name = duplicate.replace('100__a:"', '')
                #name = name.replace('"', '')
                print '{0:37s} {1:18s} {2:20s}'.\
                       format(duplicate, \
                         find_inspire_id_from_record(recid), \
                         get_hepnames_anyid_from_recid(recid, 'ORCID'))
        elif duplicate.startswith('035__a'):
            print duplicate
            #search = r'100__a:{0} or 700__a:{0}'.\
            #         format(duplicate.strip('035__a:'))
            search = r'100:{0} or 700:{0}'.\
                     format(duplicate.strip('035__a:'))
            result = perform_request_search(p=duplicate, cc='HEP')
            if len(result) > 0:
                print "Duplicate ID in HEP records"
                print "  ", search
        else:
            print duplicate

    print "Bad metadata"
    for bad_id in sorted(bad_id_set):
        print bad_id

    print "Bad ORCIDS in BAI"
    bad_orcid_bai()

    print "Finding new ORCIDs in HEP"
    new_orcids(already_seen)
예제 #11
0
def preprocess_file(read_data):
    """Get file into a form that can be properly processed.

    Takes the text of a LaTeX author list as one string and rewrites it
    step by step so that authors and affiliations end up in a common
    notation (mostly ``$^{label}$`` markers).  In order, it:

    * runs the text through preprocess_file_braces,
    * expands user-defined commands (newcommand/renewcommand/def),
    * rewrites INSPIRE record links, "Name (Affiliation)" lines and
      AddAuthor/AddInstitute/firstname/lastname/inst markup,
    * applies the special cases labelled BaBar, DES/Fermi-LAT/Planck
      and LIGO/Virgo,
    * applies a long series of global regex clean-ups.

    Each loop walks a snapshot of the lines taken when the loop starts,
    while read_data itself is updated with str.replace(line, line_new).
    Except in the astro loop, which passes a count of 1, this replaces
    every occurrence of the line's text in the whole file.

    Arguments:
    read_data -- the file contents as one string

    Returns a list of lines.  Unless an astro-style affiliation section
    was found, the list stops before the first line containing
    "abstract" (case-insensitive).

    Side effects: prints progress/debug output, queries records through
    get_hepnames_anyid_from_recid and get_fieldvalues, and calls
    sys.exit() when a user command cannot be compiled or substituted as
    a regular expression.
    """

    read_data = preprocess_file_braces(read_data)

    #Process any user commands in latex.
    #First collect name -> replacement text from the definition lines.
    command_dict = {}
    for line in read_data.split('\n'):
        match = None
        if re.search('command', line):
            match = re.search(r'\\r?e?newcommand\*?\{\\(\w+)\}\{(.*)\}', line)
        elif re.search(r'\\def\\', line):
            match = re.search(r'\\def\\(\w+)\{(.*)\}', line)
        if match:
            command_value = match.group(2)
            #The value is used below as an re.sub replacement template,
            #where a backslash starts an escape; presumably the extra
            #backslash is there so a leading LaTeX command survives.
            if re.search(r'^\\\w', command_value):
                command_value = '\\' + command_value
            command_dict[match.group(1)] = command_value
    #Then expand every use of each command; the trailing \b keeps a
    #command name from matching inside a longer one.
    for key in command_dict:
        try:
            command_string = re.compile(r'\\%s\b' % key)
            read_data = re.sub(command_string, command_dict[key], read_data)
        except re.error:
            print '!!! Problem with user commands:', key, command_dict[key]
            sys.exit()

    #NOTE(review): '{WXYZ}' looks like a placeholder for '{+}' that is
    #dealt with by a later processing step - confirm.
    read_data = read_data.replace('{+}', '{WXYZ}')
    for line in read_data.split('\n'):
        #\href{http://inspirehep.net/record/1068305}{J.~Alimena}$^{7}$
        #becomes "J.~Alimena [id]$^{7}$" when the record has an ID.
        match_obj = re.search(r'/record/(\d+).*$', line)
        if match_obj:
            for id_type in ['ORCID', 'INSPIRE']:
                id_num = get_hepnames_anyid_from_recid(match_obj.group(1),
                                                       id_type)
                if id_num:
                    #print line
                    line_new = re.sub(r'.*\}\{(.*)\}(\$\^.*)',
                                      r'\1 [' + id_num + r']\2', line)
                    read_data = read_data.replace(line, line_new)
                    #print line_new
                    #NOTE(review): this continue belongs to the id_type
                    #loop and is its last statement, so it changes
                    #nothing; a break may have been intended - confirm.
                    continue

            #orcid = get_hepnames_anyid_from_recid(match_obj.group(1), 'ORCID')
            #if orcid:
            #    line_new = re.sub(r'.*' + match_obj.group(1) + '}',
            #                      r'\\author[' + orcid + ']', line)
            #    line_new = re.sub(r'\}(\$\^\{.+\}\$)', r'\1}', line_new)
            #    read_data = read_data.replace(line, line_new)
            #    print line_new
            #    continue
            #else:
            #    inspire = get_hepnames_anyid_from_recid(match_obj.group(1),
            #                                            'INSPIRE')
            #    if inspire:
            #        line_new = re.sub(r'.*' + match_obj.group(1) + '}',
            #                          r'\\author[' + inspire + ']', line)
            #        read_data = read_data.replace(line, line_new)
            #        print line_new
            #        continue
        #If the linked record has a 110__u value, the \href{...} part is
        #replaced by that value followed by ' %'.  The new line is
        #printed unconditionally.
        match_obj = re.search(r'record/(\d+)', line)
        if match_obj:
            try:
                inst = get_fieldvalues(match_obj.group(1), '110__u')[0]
                line_new = re.sub(r'\\href{http://inspirehep.net/record/\d+}',
                                  inst + ' %', line)
                print line_new
                read_data = read_data.replace(line, line_new)
            except IndexError:
                #No 110__u on the record: leave the line alone.
                pass

        #John Smith (University of Somewhere)
        if re.search(r'^[A-Z].* \(.*\)\s*$', line):
            line_new = re.sub(r'(.*)\s+\((.*)\)',
                              r'\\author{\1}\n\\affiliation{\2}', line)
            read_data = read_data.replace(line, line_new)

        #\AddAuthor{C.~Lindsey}{11}{}{}
        if re.search(r'\\AddAuthor{', line):
            line_new = \
                     re.sub(r'\\AddAuthor{(.*)}{([^\}]*)}{([^\}]*)}{([^\}]*)}',
                              r'\1$^{\2,\3,\4}$', line)
            #Tidy up the commas left behind by empty arguments.
            line_new = re.sub('[,]+}', '}', line_new)
            line_new = re.sub('{[,]+', '{', line_new)
            line_new = line_new.replace(',,', ',')
            read_data = read_data.replace(line, line_new)
        #\AddInstitute{1a}{Blah blah} \AddExternalInstitute
        #The local copy of the line is normalised first; read_data gets
        #the same renaming below so that replace(line, ...) can match.
        line = line.replace('\\AddExternalInstitute', '\\AddInstitute')
        if re.search(r'\\AddInstitute{([^\}]+)}{', line):
            line_new = re.sub(r'\\AddInstitute{([^\}]+)}', r'$^{\1}$ ', line)
            read_data = read_data.replace('\\AddExternalInstitute',
                                          '\\AddInstitute')
            read_data = read_data.replace(line, line_new)

        #\firstname{C.-H.} \lastname{Yu} \inst{4}
        if re.search(r'\\firstname{', line) and re.search(r'\\inst{', line):
            line_new = re.sub(
                r'\\firstname{(.*)}\s*\\lastname{(.*)}\s*\\inst(\{.*\}).*',
                r'YYYY\2, \1$^\3$', line)
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu}
        elif re.search(r'\\firstname{', line):
            #line_new = re.sub(r'\\firstname{([^\}]+)}\s*\\lastname{([^\}]+)}',
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}',
                              r'YYYY\2, \1', line)
            read_data = read_data.replace(line, line_new)
        #I.J.~Arnquist\inst{10}
        if re.search(r'\\inst\{', line):
            line_new = re.sub(r'\\inst({[^\}]+\})', r'$^\1$', line)
            read_data = read_data.replace(line, line_new)
    #print "read_data =", read_data

    #Special treatment for BaBar
    for line in read_data.split('\n'):
        #BaBar \affiliation{Fermilab$^{a}$, SLAC$^{b}$}
        if re.search(r'\\affiliation\{.*\$\^\{?[abc]\}?\$', line):
            line_new = re.sub(r'\$\^\{?[abc]\}?\$', ' and ', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\$\^\{?[abc]+\}?\$', line):
            line_new = re.sub(r'[ ]*\$\^\{?[abc]+\}?\$[ ]*', '', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\\altaffiliation', line):
            line_new = re.sub(r'\\altaffiliation.*', '', line)
            read_data = read_data.replace(line, line_new)
        if VERBOSE:
            #line_new is only assigned when some branch (here or in the
            #loop above) has matched, so it can be unbound, or hold the
            #value from an earlier line.
            try:
                print "BABAR LINE =", line_new
            except UnboundLocalError:
                pass

    #Special treatment for DES and Fermi-LAT and Planck
    #astro_aff_counter stays 0 until an affiliations section starts;
    #after that it is the number given to the next affiliation line.
    astro_aff_counter = 0
    for line in read_data.split('\n'):
        #Get rid of newcommand lines now
        if re.search('newcommand', line):
            read_data = read_data.replace(line, '')
        if VERBOSE:
            print "ASTRO LINE =", line
        if re.search(r'\\section\*\{Affiliations\}', line) or \
           re.search(r'\\institute\{\\small', line):
            astro_aff_counter = 1
        #Only the first occurrence is replaced here (count 1), so that
        #identical lines can receive different numbers.
        if astro_aff_counter and re.search(r'^\\item', line):
            line_new = \
                re.sub(r'^\\item', r'$^{' + str(astro_aff_counter) + r'}$', \
                line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'\\goodbreak[ ]*$', line):
            line_new = \
                re.sub(r'(.*)[ ]*\\goodbreak[ ]*$', r'$^{' + \
                       str(astro_aff_counter) + r'}$ \1', \
                       line)
            read_data = read_data.replace(line, line_new, 1)
            if VERBOSE:
                print astro_aff_counter, line
                print line_new
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'.\\and[ ]*$', line):
            line_new = \
                re.sub(r'(.*)[ ]*\\and[ ]*$', r'$^{' + \
                       str(astro_aff_counter) + r'}$ \1', \
                       line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
    #print read_data

    #Special treatment for LIGO and Virgo
    #pattern_au: an author line followed by a "% first.last" comment;
    #pattern_af: an \affiliation{...} followed by a "% ... {number}"
    #comment.
    pattern_au = re.compile(r"^([A-Z])[\~\.]([^-]*)([A-Z])([^A-Z]+)\s*\%\s*"
                            r"([a-z])([a-z]+)\.([a-z])([a-z]+)")
    pattern_af = re.compile(r"\\affiliation\s*\{(.*)\}\s*\%.*(\{\d+\})")
    for line in read_data.split('\n'):
        match = re.match(pattern_au, line)
        if match:
            #Use the comment only when its initials agree with the
            #author's; the first name is then spelled out from it.
            if match.group(5).upper() == match.group(1) and \
               match.group(7).upper() == match.group(3):
                line_new = match.group(1) + match.group(6) + ' ' + \
                           match.group(2) + ' ' + match.group(3) + \
                           match.group(4)
                #print line_new, '\t\t', line
                read_data = read_data.replace(line, line_new)
        match = re.match(pattern_af, line)
        if match:
            line_new = "$^" + match.group(2) + "$" + match.group(1)
            #print line_new
            read_data = read_data.replace(line, line_new)

    #Remove spaces around braces and commas
    read_data = re.sub(r'[ ]*([\]\}\[\{\,])[ ]*', r'\1', read_data)
    #No re.MULTILINE here, so only the very start of the text is trimmed.
    read_data = re.sub(r'^[ ]+', '', read_data)

    #Collapse runs of hyphens into one.
    read_data = re.sub(r'\-+', r'-', read_data)

    #Drop LaTeX comments up to the end of the line.
    read_data = re.sub(r'%.*\n', '\n', read_data)
    #Start a new line after a superscript marker followed by a comma.
    read_data = re.sub(r'}\$,\s*', '}$\n', read_data)
    read_data = re.sub(r'\$\^(\w)\$,\s*', r'$^\1$\n', read_data)
    #A \thanks{} holding something shaped like an ORCID becomes an
    #\affiliation{} with just that ID; every other \thanks{} is removed.
    read_data = re.sub(r'\\thanks\{[^\}]+(0000-0[\d\-]+[\dX])[^\}]*\}',
                       r'\\affiliation{\1}', read_data)
    read_data = re.sub(r'\}?\\thanks\{[^\}]+\}?', r'', read_data)
    read_data = re.sub(r'\\item\[(\$\^\{?\w+\}?\$)\]', r'\1', read_data)
    read_data = re.sub(r'\\llap\{(\$\S+\$)\}', r'\1 ', read_data)
    #NOTE(review): the two replacements below emit '$^{...$}', with the
    #closing brace after the dollar sign, unlike the '$^{...}$' written
    #everywhere else - confirm this is intended.
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}\{', r'$^{\1$} \{',
                       read_data)
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}', r'$^{\1$}\n',
                       read_data)
    #\address, \affil and \email are all turned into \affiliation.
    read_data = re.sub(r'\\address', r'\\affiliation', read_data)
    read_data = re.sub(r'\\affil\b', r'\\affiliation', read_data)
    read_data = re.sub(r'\\email\{', r'\\affiliation{', read_data)
    #Put each \affiliation and \author on a line of its own.
    read_data = re.sub(r'}\s*\\affiliation', '}\n\\\\affiliation', read_data)
    read_data = re.sub(r'}\s*\\author', '}\n\\\\author', read_data)
    read_data = re.sub(r'[ ]*\\scriptsize[ ]+', '', read_data)
    read_data = re.sub(r'\\and[ ]+', '', read_data)
    read_data = re.sub(r'\$\s*\^', '$^', read_data)
    if VERBOSE:
        print "read_data =", read_data
    #Merge Irefn/Aref/Arefs references into a single Irefn{a,b,...}.
    read_data = re.sub(r'Irefn{(\w+)}\\Aref{(\w+)}\\Aref{(\w+)}', \
                       r'Irefn{\1,\2,\3}', read_data)
    read_data = re.sub(r'Irefn+\{(.*)\}\\?A?r?e?f?s?\{(.*)\}', \
                       r'Irefn{\1,\2}', read_data)
    read_data = re.sub(r'Arefs?{(\w+)}', r'Irefn{\1}', read_data)
    read_data = re.sub(r'\\Idef{(\w+)}', r'$^{\1}$', read_data)
    #read_data = \
    #    re.sub(r'(\w\.?)[ \,]*\\(inst|altaffilmark|Irefn)\{(.*)\}', \
    #           r'\1$^{\3}$', read_data)
    #Reference commands become superscript markers.
    read_data = \
        re.sub(r'[ \,]*\\(inst|altaffilmark|Irefn|thanksref)\{([^\}]+)\}', \
               r'$^{\2}$', read_data)
    #\altaffiltext{2}{Fermilab, Batavia}
    read_data = \
        re.sub(r'\\(altaffiltext|thankstext)\{([\w\,\-]+)\}\{(.*)\}', \
               r'$^{\2}$ \3', read_data)
    read_data = \
        re.sub(r'\\item\s*\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    #A trailing \label{x} becomes a leading $^{x}$ marker.
    read_data = \
        re.sub(r'(.*)\s*\\label\{(.*)\}', r'$^{\2}$ \1', \
               read_data)
    #\author[b,c]{M. Zimmermann} \affiliation[b]{Fermilab}
    read_data = \
        re.sub(r'\\author\[([\w\,\-]+)\]\{(.*)\}', r'\2$^{\1}$', read_data)
    read_data = \
        re.sub(r'\\affiliation\[([\w\,\-]+)\]\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    #\author{M. Zimmermann$^{b,c}$} \affiliation{$^{b}$Fermilab} remove \author
    read_data = \
        re.sub(r'\\author\{(.*\$\^\{?[\w\,\-]+\}?\$)\}', r'\1', read_data)
    read_data = \
        re.sub(r'\\affiliation\{(\$\^\{?[\w\,\-]+\}?\$.*)\}', r'\1', read_data)

    #Drop commas and spaces left in front of a closing brace or marker.
    read_data = re.sub(r'[\, ]+\}', '}', read_data)
    read_data = re.sub(r'[\, ]+\$\^', '$^', read_data)
    #print read_data
    #Keep the lines up to the first mention of "abstract", unless an
    #astro-style affiliation section was seen (counter >= 1), in which
    #case every line is kept.
    new_read_data = []
    for line in read_data.split('\n'):
        if re.search('abstract', line,
                     re.IGNORECASE) and astro_aff_counter < 1:
            break
        else:
            new_read_data.append(line)
    if VERBOSE:
        print "new_read_data =", new_read_data
    return new_read_data
예제 #12
0
def main(recids):
    """
    Gets name and email from each HEPNames record.
    """

    if VERBOSE:
        print recids

    icount = 1
    for recid in recids:
        recid_str = str(recid)
        try:
            recid_int = int(recid)
        except ValueError:
            pass
        if re.search(r'INSPIRE-', recid_str) or re.search(r'@', recid_str):
            search = '035__a:' + recid_str
            result = perform_request_search(p=search, cc='HepNames')
            if re.search(r'@', recid_str):
                result = [get_hepnames_recid_from_email(recid_str)]
            try:
                recid = result[0]
            except IndexError:
                print "Problem with:", search
                continue
            recid_str = str(recid)
            recid_int = int(recid)
        if recid_int in BAD_RECIDS:
                print 'Bad recid', recid, '\n'
                continue
        if get_hepnames_anyid_from_recid(recid_int, 'ORCID'):
            print recid_str, 'already has an ORCID\n'
            icount += 1
            continue
        try:
            contact_email = get_fieldvalues(recid_int, '371__m')[0]
        except IndexError:
            try:
                #The hidden contact address
                contact_email = get_fieldvalues(recid_int, '595__m')[0]
            except IndexError:
                contact_email = '*****@*****.**'
        try:
            contact_name = get_fieldvalues(recid_int, '100__a')[0]
            if "," in contact_name:
                contact_name = " ".join(contact_name.split(", ")[::-1])
        except IndexError:
            contact_name = 'Sir or Madam'
        #contact_email = '*****@*****.**'
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        #contact_email = "*****@*****.**"
        if TEST:
            contact_email = '*****@*****.**'

        print icount, '/', len(recids)
        print 'recid = ', recid_str
        print 'email = ', contact_email
        print 'name  = ', contact_name
        print ' '

        try:
            send_jobs_mail(recid_str, contact_email, contact_name)
            time.sleep(1)
        except IOError as error:
            print "I/O error({0}): {1}".format(error.errno,
                                               error.strerror)
            print 'PROBLEM sending mail to:'
            print recid, contact_email, contact_name, '\n'
        icount += 1
def preprocess_file(read_data):
    """Get file into a form that can be properly processed."""

    read_data = preprocess_file_braces(read_data)

    #Process any user commands in latex.
    command_dict = {}
    for line in read_data.split('\n'):
        match = None
        if re.search('command', line):
            match = re.search(r'\\r?e?newcommand\*?\{\\(\w+)\}\{(.*)\}', line)
        elif re.search(r'\\def\\', line):
            match = re.search(r'\\def\\(\w+)\{(.*)\}', line)
        if match:
            command_value = match.group(2)
            if re.search(r'^\\\w', command_value):
                command_value = '\\' + command_value
            command_dict[match.group(1)] = command_value
    for key in command_dict:
        try:
            command_string = re.compile(r'\\%s\b' % key)
            read_data = re.sub(command_string, command_dict[key], read_data)
        except re.error:
            print '!!! Problem with user commands:', key, command_dict[key]
            sys.exit()
   
   

    read_data = read_data.replace('{+}', '{WXYZ}')
    for line in read_data.split('\n'):
        #\href{http://inspirehep.net/record/1068305}{J.~Alimena}$^{7}$
        match_obj = re.search(r'/record/(\d+).*$', line)
        if match_obj:
            for id_type in ['ORCID', 'INSPIRE']:
                id_num = get_hepnames_anyid_from_recid(match_obj.group(1),
                                                       id_type)
                if id_num:
                    #print line
                    line_new = re.sub(r'.*\}\{(.*)\}(\$\^.*)', 
                                      r'\1 [' + id_num + r']\2', line)
                    read_data = read_data.replace(line, line_new)
                    #print line_new
                    continue                  
                                            

            #orcid = get_hepnames_anyid_from_recid(match_obj.group(1), 'ORCID')
            #if orcid:
            #    line_new = re.sub(r'.*' + match_obj.group(1) + '}', 
            #                      r'\\author[' + orcid + ']', line)
            #    line_new = re.sub(r'\}(\$\^\{.+\}\$)', r'\1}', line_new)
            #    read_data = read_data.replace(line, line_new)
            #    print line_new
            #    continue
            #else:
            #    inspire = get_hepnames_anyid_from_recid(match_obj.group(1),
            #                                            'INSPIRE')
            #    if inspire:
            #        line_new = re.sub(r'.*' + match_obj.group(1) + '}',
            #                          r'\\author[' + inspire + ']', line)
            #        read_data = read_data.replace(line, line_new)
            #        print line_new
            #        continue
        match_obj = re.search(r'record/(\d+)', line)
        if match_obj:
            try:
                inst = get_fieldvalues(match_obj.group(1), '110__u')[0]
                line_new = re.sub(r'\\href{http://inspirehep.net/record/\d+}',
                           inst + ' %', line)
                print line_new
                read_data = read_data.replace(line, line_new)
            except IndexError:
                pass

        #John Smith (University of Somewhere)
        if re.search(r'^[A-Z].* \(.*\)\s*$', line):
            line_new = re.sub(r'(.*)\s+\((.*)\)', 
                              r'\\author{\1}\n\\affiliation{\2}', line)
            read_data = read_data.replace(line, line_new)

        #\AddAuthor{C.~Lindsey}{11}{}{}
        if re.search(r'\\AddAuthor{', line):
            line_new = \
                     re.sub(r'\\AddAuthor{(.*)}{([^\}]*)}{([^\}]*)}{([^\}]*)}',
                              r'\1$^{\2,\3,\4}$', line)
            line_new = re.sub('[,]+}', '}', line_new)
            line_new = re.sub('{[,]+', '{', line_new)
            line_new = line_new.replace(',,', ',')
            read_data = read_data.replace(line, line_new)
        #\AddInstitute{1a}{Blah blah} \AddExternalInstitute
        line = line.replace('\\AddExternalInstitute', '\\AddInstitute')
        if re.search(r'\\AddInstitute{([^\}]+)}{', line):
            line_new = re.sub(r'\\AddInstitute{([^\}]+)}', r'$^{\1}$ ', line)
            read_data = read_data.replace('\\AddExternalInstitute', '\\AddInstitute')
            read_data = read_data.replace(line, line_new)

        #\firstname{C.-H.} \lastname{Yu} \inst{4}
        if re.search(r'\\firstname{', line) and re.search(r'\\inst{', line):
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}\s*\\inst(\{.*\}).*',
                      r'YYYY\2, \1$^\3$', line)
            read_data = read_data.replace(line, line_new)
        #\firstname{C.-H.} \lastname{Yu}
        elif re.search(r'\\firstname{', line):
            #line_new = re.sub(r'\\firstname{([^\}]+)}\s*\\lastname{([^\}]+)}',
            line_new = re.sub(r'\\firstname{(.*)}\s*\\lastname{(.*)}',
                      r'YYYY\2, \1', line)
            read_data = read_data.replace(line, line_new)
        #I.J.~Arnquist\inst{10}
        if re.search(r'\\inst\{', line):
           line_new = re.sub(r'\\inst({[^\}]+\})', r'$^\1$', line)
           read_data = read_data.replace(line, line_new)
    #print "read_data =", read_data

    #Special treatment for BaBar
    for line in read_data.split('\n'):
        #BaBar \affiliation{Fermilab$^{a}$, SLAC$^{b}$}
        if re.search(r'\\affiliation\{.*\$\^\{?[abc]\}?\$', line):
            line_new = re.sub(r'\$\^\{?[abc]\}?\$', ' and ', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\$\^\{?[abc]+\}?\$', line):
            line_new = re.sub(r'[ ]*\$\^\{?[abc]+\}?\$[ ]*', '', line)
            read_data = read_data.replace(line, line_new)
        elif re.search(r'\\author\{.*\\altaffiliation', line):
            line_new = re.sub(r'\\altaffiliation.*', '', line)
            read_data = read_data.replace(line, line_new)
        if VERBOSE:
            try:
                print "BABAR LINE =", line_new
            except UnboundLocalError:
                pass

    #Special treatment for DES and Fermi-LAT and Planck
    astro_aff_counter = 0
    for line in read_data.split('\n'):
        #Get rid of newcommand lines now
        if re.search('newcommand', line):
            read_data = read_data.replace(line, '')
        if VERBOSE:
            print "ASTRO LINE =", line
        if re.search(r'\\section\*\{Affiliations\}', line) or \
           re.search(r'\\institute\{\\small', line):
            astro_aff_counter = 1
        if astro_aff_counter and re.search(r'^\\item', line):
            line_new = \
                re.sub(r'^\\item', r'$^{' + str(astro_aff_counter) + r'}$', \
                line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'\\goodbreak[ ]*$', line):
            line_new = \
                re.sub(r'(.*)[ ]*\\goodbreak[ ]*$', r'$^{' + \
                       str(astro_aff_counter) + r'}$ \1', \
                       line)
            read_data = read_data.replace(line, line_new, 1)
            if VERBOSE:
                print astro_aff_counter, line
                print line_new
            astro_aff_counter += 1
        elif astro_aff_counter and re.search(r'.\\and[ ]*$', line):
            line_new = \
                re.sub(r'(.*)[ ]*\\and[ ]*$', r'$^{' + \
                       str(astro_aff_counter) + r'}$ \1', \
                       line)
            read_data = read_data.replace(line, line_new, 1)
            astro_aff_counter += 1
    #print read_data


    #Special treatment for LIGO and Virgo
    pattern_au = re.compile(r"^([A-Z])[\~\.]([^-]*)([A-Z])([^A-Z]+)\s*\%\s*"
                         r"([a-z])([a-z]+)\.([a-z])([a-z]+)")
    pattern_af = re.compile(r"\\affiliation\s*\{(.*)\}\s*\%.*(\{\d+\})")
    for line in read_data.split('\n'):
        match = re.match(pattern_au, line)
        if match:
            if match.group(5).upper() == match.group(1) and \
               match.group(7).upper() == match.group(3):
                line_new = match.group(1) + match.group(6) + ' ' + \
                           match.group(2) + ' ' + match.group(3) + \
                           match.group(4)
                #print line_new, '\t\t', line
                read_data = read_data.replace(line, line_new)
        match = re.match(pattern_af, line)
        if match:
            line_new = "$^" + match.group(2) + "$" + match.group(1)
            #print line_new
            read_data = read_data.replace(line, line_new)

    #Remove spaces around braces and commas
    read_data = re.sub(r'[ ]*([\]\}\[\{\,])[ ]*', r'\1', read_data)
    read_data = re.sub(r'^[ ]+', '', read_data)

    read_data = re.sub(r'\-+', r'-', read_data)

    read_data = re.sub(r'%.*\n', '\n', read_data)
    read_data = re.sub(r'}\$,\s*', '}$\n', read_data)
    read_data = re.sub(r'\$\^(\w)\$,\s*', r'$^\1$\n', read_data)
    read_data = re.sub(r'\\thanks\{[^\}]+(0000-0[\d\-]+[\dX])[^\}]*\}',
                r'\\affiliation{\1}', read_data)
    read_data = re.sub(r'\}?\\thanks\{[^\}]+\}?', r'', read_data)
    read_data = re.sub(r'\\item\[(\$\^\{?\w+\}?\$)\]', r'\1', read_data)
    read_data = re.sub(r'\\llap\{(\$\S+\$)\}', r'\1 ', read_data)
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}\{',
                       r'$^{\1$} \{', read_data)
    read_data = re.sub(r'\\textsuperscript\{([a-z\d\,]+)\}',
                       r'$^{\1$}\n', read_data)
    read_data = re.sub(r'\\address', r'\\affiliation', read_data)
    read_data = re.sub(r'\\affil\b', r'\\affiliation', read_data)
    read_data = re.sub(r'\\email\{', r'\\affiliation{', read_data)
    read_data = re.sub(r'}\s*\\affiliation', '}\n\\\\affiliation', read_data)
    read_data = re.sub(r'}\s*\\author', '}\n\\\\author', read_data)
    read_data = re.sub(r'[ ]*\\scriptsize[ ]+', '', read_data)
    read_data = re.sub(r'\\and[ ]+', '', read_data)
    read_data = re.sub(r'\$\s*\^', '$^', read_data)
    if VERBOSE:
        print "read_data =", read_data
    read_data = re.sub(r'Irefn{(\w+)}\\Aref{(\w+)}\\Aref{(\w+)}', \
                       r'Irefn{\1,\2,\3}', read_data)
    read_data = re.sub(r'Irefn+\{(.*)\}\\?A?r?e?f?s?\{(.*)\}', \
                       r'Irefn{\1,\2}', read_data)
    read_data = re.sub(r'Arefs?{(\w+)}', r'Irefn{\1}', read_data)
    read_data = re.sub(r'\\Idef{(\w+)}', r'$^{\1}$', read_data)
    #read_data = \
    #    re.sub(r'(\w\.?)[ \,]*\\(inst|altaffilmark|Irefn)\{(.*)\}', \
    #           r'\1$^{\3}$', read_data)
    read_data = \
        re.sub(r'[ \,]*\\(inst|altaffilmark|Irefn|thanksref)\{([^\}]+)\}', \
               r'$^{\2}$', read_data)
    #\altaffiltext{2}{Fermilab, Batavia}
    read_data = \
        re.sub(r'\\(altaffiltext|thankstext)\{([\w\,\-]+)\}\{(.*)\}', \
               r'$^{\2}$ \3', read_data)
    read_data = \
        re.sub(r'\\item\s*\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'\\[IA]def\{([\w\,\-]+)\}\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    read_data = \
        re.sub(r'(.*)\s*\\label\{(.*)\}', r'$^{\2}$ \1', \
               read_data)
    #\author[b,c]{M. Zimmermann} \affiliation[b]{Fermilab}
    read_data = \
        re.sub(r'\\author\[([\w\,\-]+)\]\{(.*)\}', r'\2$^{\1}$', read_data)
    read_data = \
        re.sub(r'\\affiliation\[([\w\,\-]+)\]\{(.*)\}', r'$^{\1}$ \2', \
               read_data)
    #\author{M. Zimmermann$^{b,c}$} \affiliation{$^{b}$Fermilab} remove \author
    read_data = \
        re.sub(r'\\author\{(.*\$\^\{?[\w\,\-]+\}?\$)\}', r'\1', read_data)
    read_data = \
        re.sub(r'\\affiliation\{(\$\^\{?[\w\,\-]+\}?\$.*)\}', r'\1', read_data)

    read_data = re.sub(r'[\, ]+\}', '}', read_data)
    read_data = re.sub(r'[\, ]+\$\^', '$^', read_data)
    #print read_data
    new_read_data = []
    for line in read_data.split('\n'):
        if re.search('abstract', line, re.IGNORECASE) and astro_aff_counter < 1:
            break
        else:
            new_read_data.append(line)
    if VERBOSE:
        print "new_read_data =", new_read_data
    return new_read_data
def check_ids(letter=None):
    """Go through HEPNames looking for bad IDs."""

    already_seen = {}
    duplicates   = set()
    bad_id_set   = set()
    fields = ['035__a', '035__z', '371__m']
    print 'check_ids: letter =', letter
    if letter:
        fields.append('100__a')

    for recid, field in [(recid, field) for recid in RECIDS_HEPN \
                                        for field in fields]:
        skip = False
        field_values = get_fieldvalues(recid, field)
        if field == '100__a':
            try:
                if not field_values[0].startswith(letter):
                    skip = True
            except IndexError:
                print "No name on record:", recid
        if not skip:
            for field_value in field_values:
                if field_value in already_seen:
                    duplicates.add(field + ':"' + field_value + '"')
                    continue
                already_seen[field_value] = field
                if bad_id_check(field_value):
                    bad_id_set.add(field + ':"' + field_value + '"')

    print "Duplicates"
    for duplicate in sorted(duplicates):
        if duplicate.startswith('100__a'):
            result = perform_request_search(p=duplicate, cc='HepNames')
            for recid in result:
                #name = duplicate.replace('100__a:"', '')
                #name = name.replace('"', '')
                print '{0:37s} {1:18s} {2:20s}'.\
                       format(duplicate, \
                         find_inspire_id_from_record(recid), \
                         get_hepnames_anyid_from_recid(recid, 'ORCID'))
        elif duplicate.startswith('035__a'):
            print duplicate
            #search = r'100__a:{0} or 700__a:{0}'.\
            #         format(duplicate.strip('035__a:'))
            search = r'100:{0} or 700:{0}'.\
                     format(duplicate.strip('035__a:'))
            result = perform_request_search(p=duplicate, cc='HEP')
            if len(result) > 0:
                print "Duplicate ID in HEP records"
                print "  ", search
        else:
            print duplicate


    print "Bad metadata"
    for bad_id in sorted(bad_id_set):
        print bad_id

    print "Bad ORCIDS in BAI"
    bad_orcid_bai()

    print "Finding new ORCIDs in HEP"
    new_orcids(already_seen)