Example #1
0
def check_doi_eprint(identifier):
    """Check whether we already have a DOI/eprint pairing for identifier.

    Returns True when a pairing exists, otherwise None.
    """

    if identifier.startswith('10.'):
        # DOI: look it up in the 0247_a field.
        result = search_unit(identifier, f='0247_a', m='a')
        if result & DELETED:
            return None
        if len(result) == 1:
            INSPIRE_IDENTIFIER_RECID_DICT[identifier] = result[0]
            result = result & INSPIRE_EPRINT_RECIDS
    elif re.match(ARXIV_REGEX, identifier) or re.match(ARXIV_REGEX_NEW, identifier):
        # eprint: new-style arXiv IDs are stored with an explicit prefix.
        prefix = 'arXiv:' if re.match(ARXIV_REGEX_NEW, identifier) else ''
        result = search_unit(prefix + identifier, f='037__a', m='a')
        if result & DELETED:
            return None
        if len(result) == 1:
            INSPIRE_IDENTIFIER_RECID_DICT[identifier] = result[0]
            result = result & INSPIRE_DOI_RECIDS
    else:
        return None
    return True if len(result) else None
def get_hepnames_recid_from_email(email):
    """
    Find the HEPNames recid based on email
    """
   
    if email not in EMAILS_HEPNAMES:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None

    emailsearch = '371__m:%s or 371__o:%s'
    reclist = perform_request_search(p=emailsearch % (email, email),
                                     cc='HepNames')
    hidden_m = search_unit(email, f='595__m', m='a')
    hidden_o = search_unit(email, f='595__o', m='a')
    reclist_hidden = hidden_m or hidden_o & HN
    reclist = intbitset(reclist) or reclist_hidden

    if len(reclist) == 1:
        return reclist[0]
    elif len(reclist) > 1:
        if VERBOSE:
            print "WARNING: more than one hepnames record found for %s: " \
                  % (email)
            print '\t' + ', '.join([str(r) for r in reclist])
        return [r for r in reclist]
    else:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None
Example #3
0
def get_hepnames_recid_from_email(email):
    """
    Find the HEPNames recid based on email
    """

    if email not in EMAILS_HEPNAMES:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None

    emailsearch = '371__m:%s or 371__o:%s'
    reclist = perform_request_search(p=emailsearch % (email, email),
                                     cc='HepNames')
    hidden_m = search_unit(email, f='595__m', m='a')
    hidden_o = search_unit(email, f='595__o', m='a')
    reclist_hidden = hidden_m or hidden_o & HN
    reclist = intbitset(reclist) or reclist_hidden

    if len(reclist) == 1:
        return reclist[0]
    elif len(reclist) > 1:
        if VERBOSE:
            print "WARNING: more than one hepnames record found for %s: " \
                  % (email)
            print '\t' + ', '.join([str(r) for r in reclist])
        return [r for r in reclist]
    else:
        if VERBOSE:
            print "WARNING: no hepnames record found for %s: " % (email)
        return None
Example #4
0
def get_recid_from_inspire(id_string):
    '''
    Takes an ID string and returns an INSPIRE recid or it returns None.
    '''

    id_string = str(id_string)
    id_string = clean_eprint(id_string)
    if ARXIV_REGEX.match(id_string):
        field = '037__a'
    elif ARXIV_REGEX_NEW.match(id_string):
        field = '037__a'
        id_string = 'arXiv:' + id_string
    elif DOI_REGEX.match(id_string):
        field = '0247_a'
    elif id_string.isdigit():
        field = '001'
    else:
        logging.info('Unknown ID: ' + id_string)
        return False
    result = search_unit(p=id_string, f=field, m='a') - DELETED
    if len(result) > 1:
        print 'Duplicate: {0} {1}'.format(id_string, result)
        quit()
    if len(result) == 1:
        return str(result[0])
    return None
def recid_from_doi(doi):
    """Return the first recid matching this DOI, or None when absent."""

    hits = search_unit(p=doi, f='0247*', m='a')
    if len(hits):
        return hits[0]
    return None
Example #6
0
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'.

    with_citations: yes/no, whether records that newly cite a record need to be re-exported.
    with_claims: yes/no, whether records involved in some new claim need to be re-exported.
    skip_collections: comma-separated list of values for which records having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        # Redis is not configured on this host; fall back to AFS.
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            # Records whose 005 (last-transaction) timestamp moved even if
            # modification_date did not.  NOTE(review): the upper bound
            # 20250101000000 is hard-coded -- confirm it is still in the future.
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # No lastrun file yet: default to everything.
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Drop empty entries while splitting: the original called
    # skip_collections.remove(''), which raises ValueError whenever a
    # non-empty argument (e.g. 'HEP,HEPNAMES') yields no empty element.
    skip_collections = [coll for coll in skip_collections.split(',') if coll]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            # Only advance the lastrun marker when the sync actually ran.
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Example #7
0
    def get_hitset(self, key, pos=None):
        """Run the search appropriate for `key` and return the hitlist.

        `pos` optionally restricts which stored values are searched;
        by default every value for `key` is used.  The result is
        intersected with HEPRECS.
        """
        values = self._fields[key]
        if not values:
            return intbitset()
        if pos is None:
            pos = slice(None, None)
        hits = intbitset()
        if key == 'pubnote':
            for value in values[pos]:
                hits |= search_pattern(f='journal', p=value, ap=1)
        elif key == 'repno':
            for value in values[pos]:
                hits |= search_unit(f='reportnumber', p=value)
        elif key == 'DOI':
            for value in values[pos]:
                hits |= search_unit(f='doi', p=value, m='a')

        return hits & HEPRECS
Example #8
0
    def get_hitset(self, key, pos=None):
        """Search on each stored value for `key` and return the hitlist.

        `pos` selects a subset of the stored values (all by default).
        The accumulated hits are intersected with HEPRECS.
        """
        if not self._fields[key]:
            return intbitset()
        if pos is None:
            pos = slice(None, None)
        hits = intbitset()
        # One search per stored value; the key decides which index and
        # search flavour to use.
        for val in self._fields[key][pos]:
            if key == 'pubnote':
                hits |= search_pattern(f='journal', p=val, ap=1)
            elif key == 'repno':
                hits |= search_unit(f='reportnumber', p=val)
            elif key == 'DOI':
                hits |= search_unit(f='doi', p=val, m='a')
        return hits & HEPRECS
Example #9
0
def _get_unique_recid_for(journal='', report='', doi=''):
    """Return the single recid matching these identifiers, or 0.

    Identifiers are tried in order (journal, report number, DOI) until
    one of them yields exactly one hit.  0 signals failure: either no
    hit at all or an ambiguous (multiple-hit) result.
    """

    hits = []
    if journal:
        hits = search_unit(f='journal', p=journal)
    if report and len(hits) != 1:
        hits = search_unit(f='reportnumber', p=report)
    if doi and len(hits) != 1:
        hits = search_unit(f='doi', p=doi)

    if len(hits) == 1:
        return hits.pop()
    # FIXME: should throw exception or maybe show multiple possibilities
    return 0
Example #10
0
def _get_unique_recid_for(journal='', report='', doi=''):
    """Return the recid for this set of identifiers.

    Each identifier is tried in turn; the first lookup producing exactly
    one hit wins.  Returns 0 when nothing (or more than one record)
    matches.
    """

    hits = []
    lookups = (('journal', journal), ('reportnumber', report), ('doi', doi))
    for field, value in lookups:
        # Skip empty identifiers; stop refining once we have a unique hit.
        if value and len(hits) != 1:
            hits = search_unit(f=field, p=value)

    if len(hits) > 1:
        # FIXME: should throw exception or maybe show multiple possibilities
        return 0
    elif len(hits) == 1:
        return hits.pop()
    else:
        return 0
def citationloss(exactauthor, startdate):
    """Return citation-log rows for citations lost by an author's records.

    Looks up all records of `exactauthor`, intersects them with records
    appearing as citee in rnkCITATIONLOG after `startdate`, and returns
    the matching (citer, citee, action_date) rows, or None when the
    author lost no citations.
    """

    recordsofauthor = search_unit(exactauthor, f='exactauthor')
    # Parameterized query: the original interpolated startdate straight
    # into the SQL string.
    removedcitations = intbitset(
        [row[0] for row in
         run_sql('SELECT citee FROM rnkCITATIONLOG WHERE action_date>%s',
                 (startdate, ))])

    lossoverlap = recordsofauthor & removedcitations
    if lossoverlap:
        # The IN (...) list is built from our own intbitset (integers
        # only), so joining it into the statement is safe; startdate is
        # still passed as a bound parameter.
        recsaffected = run_sql(
            'SELECT citer, citee, action_date FROM rnkCITATIONLOG'
            ' WHERE citee IN (' + ', '.join(str(i) for i in lossoverlap) + ')'
            ' AND action_date>%s', (startdate, ))
        return recsaffected
    return None
Example #12
0
def get_jacow_dois():
    """Return all the JACoW DOIs INSPIRE has."""

    jacow_dois_record = set()
    for doi in get_all_field_values('0247_a'):
        if doi.startswith('10.18429/JACoW-'):
            jacow_dois_record.add('doi:' + doi)

    jacow_dois_ref = set()
    for doi in get_all_field_values('999C5a'):
        if doi.startswith('doi:10.18429/JACoW-'):
            jacow_dois_ref.add(doi)
    missing_dois = jacow_dois_ref - jacow_dois_record
    if not missing_dois:
        return jacow_dois_record

    for doi in sorted(missing_dois):
        if good_doi(doi):
            search_unit('doi', f='0247_2', m='a')
            doi = doi.replace('doi:', '')
            if search_unit(doi, f='0247_2', m='a'):
                continue
            print 'https://doi.org/{0}'.format(doi)
    sys.exit()
Example #13
0
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Collect the recids of records whose identifiers should be synced.

    Returns records having a DOI, an arXiv ID, or an other-site ID,
    optionally restricted to records modified since `since` and, with
    `unmatched_only`, to records not yet carrying the other site's ID.
    """
    candidates = get_all_recids()
    other_site_hits = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        # Records referencing CDS in the hidden 595__a field also count
        # as carrying the other site's identifier.
        other_site_hits = other_site_hits | search_unit(p='CDS-*', f='595__a', m='a')
    doi_hits = search_pattern(p='doi:"**"')
    arxiv_hits = search_pattern(p='035__9:"arXiv"')
    if since:
        recently_changed = intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date>=%s", (since, )))
        candidates = candidates & recently_changed
    if not unmatched_only:
        return (doi_hits | arxiv_hits | other_site_hits) & candidates
    candidates = candidates - other_site_hits
    return (doi_hits | arxiv_hits) & candidates
Example #14
0
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Return all records that carry identifiers we sync on.

    A record qualifies if it has a DOI, an arXiv identifier, or an
    identifier from the other site; `since` limits the result to
    recently modified records and `unmatched_only` drops records that
    are already matched to the other site.
    """
    eligible = get_all_recids()
    matched = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        matched = matched | search_unit(p='CDS-*', f='595__a', m='a')
    with_doi = search_pattern(p='doi:"**"')
    with_arxiv = search_pattern(p='035__9:"arXiv"')
    if since:
        eligible = eligible & intbitset(
            run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                    (since, )))
    if unmatched_only:
        return (with_doi | with_arxiv) & (eligible - matched)
    return (with_doi | with_arxiv | matched) & eligible
Example #15
0
def main():
    """Export a sample of D0 preliminary-note records as MARC to a file."""
    # Derive the output name from this script: foo.py -> tmp_foo_correct.out.
    file_name = 'tmp_' + __file__
    # Anchor and escape the pattern: the original '.py' treated the dot
    # as a regex wildcard and was not anchored to the end of the name.
    file_name = re.sub(r'\.py$', '_correct.out', file_name)
    perform_request_search(p=SEARCH, cc=SUBFILE)
    result = search_unit(p="*D0-PRELIMINARY-NOTE*", m='a', f='980*')
    # Only process a small sample.
    result = result[:5]
    # `with` guarantees the output file is closed even on error.
    with open(file_name, 'w') as output:
        output.write('<collection>')
        for recid in result:
            info = print_record(recid, ot=['001', '100', '700', '980'],
                                format='hm')
            info = re.sub(r'code="e">FERMILAB-TEV-', r'', info)
            # Strip the surrounding <pre> markup emitted by format 'hm'.
            info = re.sub(r'<\/?pre[^\>]*>', r'', info)
            info += '\n\n'
            output.write(info)
        output.write('</collection>')
Example #16
0
def main():
    """Export a sample of D0 preliminary-note records as MARC to a file."""
    # Derive the output name from this script: foo.py -> tmp_foo_correct.out.
    file_name = 'tmp_' + __file__
    # Anchor and escape the pattern: the original '.py' treated the dot
    # as a regex wildcard and was not anchored to the end of the name.
    file_name = re.sub(r'\.py$', '_correct.out', file_name)
    perform_request_search(p=SEARCH, cc=SUBFILE)
    result = search_unit(p="*D0-PRELIMINARY-NOTE*", m='a', f='980*')
    # Only process a small sample.
    result = result[:5]
    # `with` guarantees the output file is closed even on error.
    with open(file_name, 'w') as output:
        output.write('<collection>')
        for recid in result:
            info = print_record(recid,
                                ot=['001', '100', '700', '980'],
                                format='hm')
            info = re.sub(r'code="e">FERMILAB-TEV-', r'', info)
            # Strip the surrounding <pre> markup emitted by format 'hm'.
            info = re.sub(r'<\/?pre[^\>]*>', r'', info)
            info += '\n\n'
            output.write(info)
        output.write('</collection>')
Example #17
0
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    # Repeatable 999C5 fields: one dict of subfields per reference.
    references = bfo.fields("999C5", escape=0, repeatable_subfields_p=True)

    out = ""
    # Tracks the last reference-number label so it is not repeated.
    last_o = ""

    if not references:
        return out

    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')

        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # $o: reference label -- show it only when it differs from the
        # previous reference's label; dots are stripped first.
        if reference.has_key('o') and not reference['o'][0] == last_o:
            temp_ref = reference['o'][0].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")

        if reference_prefix:
            ref_out.append(reference_prefix)

        # $s: journal pubnote; $r: report number (anything from the first
        # '[' onward is dropped when the value carries brackets).
        if reference.has_key('s'):
            display_journal = reference['s'][0]
            clean_journal = reference['s'][0]
        if reference.has_key('r'):
            if "[" in reference['r'][0] and "]" in reference['r'][0]:
                breaknum = reference['r'][0].find('[')
                newreference = reference['r'][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r'][0]
                clean_report = reference['r'][0]
        # Try to resolve the reference to a record: report number first,
        # then journal, then DOI ($a), then a stored recid ($0); each
        # later lookup runs only while we lack a unique hit.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits)!=1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits)!=1:
            hits = search_unit(p=reference['a'][0])
        if reference.has_key('0') and len(hits)!=1:
            # check if the record exists in the database
            try:
                recID = int(reference['0'][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            # Unambiguous match: render the matched record instead of the
            # raw reference subfields.
            ref_out.append('<small>' + format_record(list(hits)[0],'hs') + '</small>')
        else:
            # No (unique) match: render whatever subfields are present.
            if reference.has_key('h'):
                ref_out.append("<small> " + reference['h'][0] + ".</small>")
            if reference.has_key('t'):
                ref_out.append("<small> " + reference['t'][0] + "</small> -")
            if reference.has_key('y'):
                ref_out.append("<small> " + reference['y'][0] + ".</small>")
            if reference.has_key('p'):
                ref_out.append("<small> " + reference['p'][0] + ".</small>")
            if reference.has_key('m'):
                ref_out.append("<small> "+ reference['m'][0].replace(']]', ']') + ".</small>")
            if reference.has_key('a'):
                ref_out.append("<small> <a href=\"http://dx.doi.org/" + \
                reference['a'][0] + "\">" + reference['a'][0]+ "</a></small>")
            if reference.has_key('u'):
                ref_out.append("<small> <a href=" + reference['u'][0] + ">" + \
                reference['u'][0]+ "</a></small>")
            if reference.has_key('i'):
                for r in reference['i']:
                    ref_out.append("<small> <a href=\"/search?ln=en&amp;p=020__a%3A"+r+"\">"+r+"</a></small>")

            ref_out.append('<small>')
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(' ' + display_report)
            ref_out.append("</small>")

        if reference_suffix:
            ref_out.append(reference_suffix)

        ref_out.append("</td></tr>")
        out += ' '.join(ref_out)

    return out + "</table>"
Example #18
0
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    # Repeatable 999C5 fields: one dict of subfields per reference.
    references = bfo.fields("999C5", escape=0, repeatable_subfields_p=True)

    out = ""
    # Tracks the last reference-number label so it is not repeated.
    last_o = ""

    if not references:
        return out

    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')

        display_journal = ""
        display_report = ""
        clean_report = ""
        clean_journal = ""
        hits = []
        # $o: reference label -- show it only when it differs from the
        # previous reference's label; dots are stripped first.
        if reference.has_key("o") and not reference["o"][0] == last_o:
            temp_ref = reference["o"][0].replace(".", "")
            if "[" in temp_ref and "]" in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")

        if reference_prefix:
            ref_out.append(reference_prefix)

        # $s: journal pubnote; $r: report number (anything from the first
        # '[' onward is dropped when the value carries brackets).
        if reference.has_key("s"):
            display_journal = reference["s"][0]
            clean_journal = reference["s"][0]
        if reference.has_key("r"):
            if "[" in reference["r"][0] and "]" in reference["r"][0]:
                breaknum = reference["r"][0].find("[")
                newreference = reference["r"][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference["r"][0]
                clean_report = reference["r"][0]
        # Try to resolve the reference to a record: report number first,
        # then journal (approximate search here, unlike the sibling
        # version above), then DOI ($a), then a stored recid ($0).
        if clean_report:
            hits = search_unit(f="reportnumber", p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_pattern(f="journal", p=clean_journal, ap=1)
        if reference.has_key("a") and len(hits) != 1:
            hits = search_unit(p=reference["a"][0])
        if reference.has_key("0") and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference["0"][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            # Unambiguous match: render the matched record instead of the
            # raw reference subfields.
            ref_out.append("<small>" + format_record(list(hits)[0], "hs") + "</small>")
        else:
            # No (unique) match: render whatever subfields are present.
            if reference.has_key("h"):
                ref_out.append("<small> " + reference["h"][0] + ".</small>")
            if reference.has_key("t"):
                ref_out.append("<small> " + reference["t"][0] + "</small> -")
            if reference.has_key("y"):
                ref_out.append("<small> " + reference["y"][0] + ".</small>")
            if reference.has_key("p"):
                ref_out.append("<small> " + reference["p"][0] + ".</small>")
            if reference.has_key("m"):
                ref_out.append("<small> " + reference["m"][0].replace("]]", "]") + ".</small>")
            if reference.has_key("a"):
                ref_out.append(
                    '<small> <a href="http://dx.doi.org/'
                    + reference["a"][0]
                    + '">'
                    + reference["a"][0]
                    + "</a></small>"
                )
            if reference.has_key("u"):
                ref_out.append("<small> <a href=" + reference["u"][0] + ">" + reference["u"][0] + "</a></small>")
            if reference.has_key("i"):
                for r in reference["i"]:
                    ref_out.append('<small> <a href="/search?ln=en&amp;p=020__a%3A' + r + '">' + r + "</a></small>")

            ref_out.append("<small>")
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(" " + display_report)
            ref_out.append("</small>")

        if reference_suffix:
            ref_out.append(reference_suffix)

        ref_out.append("</td></tr>")
        out += " ".join(ref_out)

    return out + "</table>"
Example #19
0
        data_set = generate_data(data_set)
        try:
            with open(filename, "wb") as file_handle:
                pickle.dump(data_set, file_handle)
        except pickle.PicklingError:
            print "Problem creating:", filename
    except pickle.UnpicklingError:
        print "Pickle problem for", filename
    return data_set


# Cached data sets loaded (or rebuilt and pickled) by get_data().
INSPIRE_JOURNALS = get_data('INSPIRE_JOURNALS')[0]
(INSPIRE_EPRINTS, INSPIRE_BIBCODES) = get_data('INSPIRE_EPRINTS')
INSPIRE_DOIS = get_data('INSPIRE_DOIS')[0]

# Records carrying an arXiv eprint (037__9:arxiv) and a DOI (0247_2:doi).
INSPIRE_EPRINT_RECIDS = search_unit('arxiv', f='037__9', m='a')
INSPIRE_DOI_RECIDS = search_unit('doi', f='0247_2', m='a')
# Cache of identifier -> recid, filled by check_doi_eprint().
INSPIRE_IDENTIFIER_RECID_DICT = {}
# Records flagged as deleted (980*:DELETED).
DELETED = search_unit(p='DELETED', m='a', f='980*')

# Sanity output: size plus one random sample from each data set.
print 'Eprints', len(INSPIRE_EPRINTS), random.sample(INSPIRE_EPRINTS, 1)
print 'Bibcodes', len(INSPIRE_BIBCODES), random.sample(INSPIRE_BIBCODES, 1)
print 'DOIs', len(INSPIRE_DOIS), random.sample(INSPIRE_DOIS, 1)
print 'Journals', len(INSPIRE_JOURNALS), random.sample(INSPIRE_JOURNALS, 1)


def check_doi_eprint(identifier):
    "Check to see if we already have a DOI eprint pairing."

    # NOTE(review): this copy appears truncated by the scrape -- it
    # performs only the DOI lookup, never uses `result`, and falls off
    # the end (implicitly returning None); compare with the complete
    # version earlier in the file.
    if identifier.startswith('10.'):
        result = search_unit(identifier, f='0247_a', m='a')
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist,config, updated_rec_list ):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.
    """
    function = ""
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(prefix="cfg section [rank_method] has no attr function", alert_admin=True)
        return {}

    pubrefntag = ""
    try:
        pubrefntag  = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_report_number", alert_admin=True)
        return {}

    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section "+function+" has no attr reference_via_pubinfo", alert_admin=True)
        return {}

    #pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag "+pubrefntag)
        write_message("pubreftag "+pubreftag)

    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[0] #dict of recid -> institute_give_publ_id
    d_references_report_numbers = citation_informations[1] #dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2]
       #dict of recid -> publication_infos_in_its_bibliography
    d_records_s = citation_informations[3] #recid -> its publication inf
    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from citation dictionaries; they will be filled later")
    for somerecid in updated_rec_list:
        try:
            del citation_list[somerecid]
        except KeyError:
            pass
        try:
            del reference_list[somerecid]
        except KeyError:
            pass

    write_message("Phase 1: d_references_report_numbers")
    #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    #meaning: rec 8 contains these in bibliography

    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg =  "d_references_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()
        done = done+1

        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                #sanitise p
                p.replace("\n",'')
                #search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    #append unless this key already has the item
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        #and update result
                        result[rec_ids[0]] += 1

                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    #the reference we wanted was not found among our records.
                    #put the reference in the "missing".. however, it will look
                    #bad.. gfhgf/1254312, so  get the corresponding 999C5s (full ref) too
                    #This should really be done in the next loop d_references_s
                    #but the 999C5s fields are not yet normalized

                    #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1])
                    rectext = "" # print_record() call disabled to speed things up
                    lines = rectext.split("\n")
                    rpart = p #to be used..
                    for l in lines:
                        if (l.find(p) > 0): #the gfhgf/1254312 was found.. get the s-part of it
                            st = l.find('$s')
                            if (st > 0):
                                end = l.find('$', st)
                                if (end == st):
                                    end = len(l)
                                rpart = l[st+2:end]
                    insert_into_missing(thisrecid, rpart)

    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            task_sleep_now_if_required()

        done = done+1

        for refs in refss:
            if refs:
                p = refs
                #remove the latter page number if it is like 67-74
                matches = re.compile("(.*)(-\d+$)").findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                rec_id = None
                try:
                    rec_ids = list(search_unit(p, 'journal') - INTBITSET_OF_DELETED_RECORDS)
                except:
                    rec_ids = None
                write_message("These match searching "+p+" in journal: "+str(rec_id), verbose=9)
                if rec_ids and rec_ids[0]:
                    #the refered publication is in our collection, remove
                    #from missing
                    remove_from_missing(p)
                else:
                    #it was not found so add in missing
                    insert_into_missing(thisrecid, p)
                #check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    #the above should always hold
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid) #append actual list
                        result[rec_ids[0]] += 1 #add count for this..

                    #update reference_list accordingly
                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")

    #search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1

        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []

                if rec_ids:
                    for recid in rec_ids:
                        #normal checks..
                        if not citation_list.has_key(thisrecid):
                            citation_list[thisrecid] = []
                        if not reference_list.has_key(recid):
                            reference_list[recid] = []
                        if not result.has_key(thisrecid):
                            result[thisrecid] = 0

                        #normal updates
                        if not recid in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if not thisrecid in reference_list[recid]:
                            reference_list[recid].append(thisrecid)

    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    #find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done "+str(done)+" of "+str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done+1
        p = recs.replace("\"","")
        #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a') - INTBITSET_OF_DELETED_RECORDS)
        write_message("These records match "+p+" in "+pubreftag+" : "+str(rec_ids), verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                #normal checks
                if not result.has_key(thisrecid):
                    result[thisrecid] = 0
                if not citation_list.has_key(thisrecid):
                    citation_list[thisrecid] = []
                if not reference_list.has_key(rec_id):
                    reference_list[rec_id] = []

                if not rec_id in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if not thisrecid in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)

    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")

    #remove empty lists in citation and reference
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]

    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    selfdic = {}
    #get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    #add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message("Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                 initial_self_dict, config)
    #selfdic consists of
    #key k -> list of values [v1,v2,..]
    #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2..

    #create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if selfcitedbydic.has_key(v):
                tmplist = selfcitedbydic[v]
                if not k in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist

    write_message("Getting author citations")

    #get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    acit = task_get_option("author-citations")
    if not acit:
        print "Author cites disabled. Use -A option to enable it."
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                        initial_author_dict, config)


    if task_get_task_param('verbose') >= 3:
        #print only X first to prevent flood
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): "+str(tmpdict))
        write_message("size: "+str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): "+str(tmpdict))
        write_message("size: "+str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): "+str(tmpdict)
        write_message(mesg)
        write_message("size: "+str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): "+str(tmpdict))
        write_message("size: "+str(len(authorcitdic.keys())))
    insert_cit_ref_list_intodb(citation_list, reference_list,
                               selfcitedbydic, selfdic, authorcitdic)

    t5 = os.times()[4]

    write_message("Execution time for analyzing the citation information generating the dictionary:")
    write_message("... checking ref number: %.2f sec" % (t2-t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3-t2))
    write_message("... checking rec number: %.2f sec" % (t4-t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5-t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5-t1))

    return result
Пример #21
0
def format(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record as an HTML list.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """

    from invenio.search_engine import search_unit
    from invenio.bibformat import format_record

    out = ""
    for ref in bfo.fields("999C5", escape=1):
        rendered = ''

        # Reference label ($o): close the previous list item first.
        if 'o' in ref:
            if out != "":
                rendered = '</li>'
            rendered += '<li><small>' + ref['o'] + "</small> "

        journal = ref.get('s', '')
        report = ref.get('r', '')

        # Try to locate the cited record: report number first, then the
        # journal pubnote; exactly one hit counts as a match.
        matches = []
        if report:
            matches = search_unit(f='reportnumber', p=report)
        if journal and len(matches) != 1:
            matches = search_unit(f='journal', p=journal)

        if len(matches) == 1:
            rendered += '<small>' + \
                        format_record(list(matches)[0], 'hs') + '</small>'
        else:
            # No unique match: show the raw journal/report info instead
            # (note: this intentionally restarts the item markup).
            rendered = '<small>'
            if journal:
                rendered += journal
            if report:
                rendered += ' ' + report
            rendered += ' (not in Inspire)</small>'

        rendered += "<br />"

        if reference_prefix is not None and rendered != '':
            rendered = reference_prefix + rendered
        if reference_suffix is not None and rendered != '':
            rendered += reference_suffix

        out += rendered

    if out != '':
        out += '</li>'

    return out
Пример #22
0
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record as an HTML table, preceded by a
    link to the reference-update form.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """

    references = bfo.fields("999C5", escape=1)

    out = "<div id='referenceinp_link_box'><span id='referenceinp_link_span'><a id='referenceinp_link' href='"+CFG_SITE_URL+'/record/'+str(bfo.recID)+'/export/hrf'+"'>Update these references</a></span></div>"
    last_o = ""

    if not references:
        return out

    out += "<table>"
    for reference in references:
        ref_out = '<tr><td valign="top">'

        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # Show the reference label ($o) only once per run of identical labels.
        if reference.has_key('o') and not reference['o'] == last_o:
            temp_ref = reference['o'].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out += "<small>" + temp_ref + "</small> "
            else:
                ref_out += "<small>[" + temp_ref + "] </small> "
            last_o = temp_ref
        ref_out += "</td><td>"
        if reference.has_key('s'):
            display_journal = reference['s']
            clean_journal = reference['s']
        if reference.has_key('r'):
            if "[" in reference['r'] and "]" in reference['r']:
                # Strip a trailing "[...]" qualifier from the report number.
                breaknum = reference['r'].find('[')
                newreference = reference['r'][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r']
                clean_report = reference['r']
        # Try to locate the cited record: report number, then journal
        # pubnote, then DOI; exactly one hit counts as a match.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits)!=1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits)!=1:
            hits = search_unit(f='doi', p=reference['a'])
        if len(hits) == 1:
            ref_out += '<small>' +\
                       format_record(list(hits)[0],'hs') + '</small>'
        else:
            if reference.has_key('h'):
                ref_out += "<small> " + reference['h'] + ".</small> "

            if reference.has_key('m'):
                ref_out += "<small>"+ reference['m'].replace(']]', ']') + ".</small> "

            if reference.has_key('a'):
                ref_out += " <small><a href=\"http://dx.doi.org/" + \
                reference['a'] + "\">" + reference['a']+ "</a></small> "

            if reference.has_key('u'):
                ref_out += " <small><a href=" + reference['u'] + ">" + \
                reference['u']+ "</a></small> "

            ref_out += ' <small>'
            if display_journal:
                ref_out += display_journal
            if display_report:
                ref_out += ' ' + display_report
            # Close the <small> opened above; previously it was closed
            # unconditionally after both branches, leaving an unmatched
            # </small> when a matching record was found.
            ref_out += '</small>'

        ref_out += "</td></tr>"

        if reference_prefix is not None and ref_out != '':
            ref_out = reference_prefix + ref_out
        if reference_suffix is not None and ref_out != '':
            ref_out += reference_suffix

        out += ref_out

    # NOTE: a stray "out += '</li>'" was removed here -- this element emits
    # a table, not a list, so the tag was invalid markup.
    return out + "</table>"
Пример #23
0
def bst_prodsync(method='afs',
                 with_citations='yes',
                 with_claims='yes',
                 skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    method: target backend; falls back to 'afs' when Redis is not configured.
    with_citations: yes/no, whether records whose citations changed need to be re-exported.
    with_claims: yes/no, whether records involved in some new claim need to be re-exported.
    skip_collections: comma-separated list of values for which records having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR,
                                'prodsync_%s_lastrun.txt' % method)
    try:
        # Incremental sync: everything modified since the recorded last run.
        with open(lastrun_path) as lastrun_file:
            last_run = lastrun_file.read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(
                run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                        (last_run, )))
            compacttime = last_run.replace('-',
                                           '').replace(' ',
                                                       '').replace(':', '')
            # MARC 005 timestamp range search. NOTE(review): the upper bound
            # is a hard-coded far-future timestamp -- confirm before 2025.
            notimechangerecs = search_unit("%s->20250101000000" % compacttime,
                                           f='005',
                                           m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql(
                        "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                    (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                        (last_run, )))
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d"
                        " ON p.personid = d.personid WHERE d.last_updated>=%s",
                        (last_run, )))
    except IOError:
        # No lastrun file yet: default to a full sync of every record.
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # Filter out empty entries instead of list.remove(''), which raised
    # ValueError whenever skip_collections had no empty element
    # (e.g. skip_collections='HEP,HEPNAMES').
    for collection in (c for c in skip_collections.split(',') if c):
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        # Record the run timestamp only after a successful sync.
        with open(lastrun_path, "w") as lastrun_file:
            lastrun_file.write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            with open(lastrun_path, "w") as lastrun_file:
                lastrun_file.write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
Пример #24
0
from invenio.bibindex_engine import get_field_tags
from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK


class memoise:
    """Cache results of *function*, keyed by its positional arguments."""

    def __init__(self, function):
        # args-tuple -> previously computed result
        self.memo = {}
        self.function = function

    def __call__(self, *args):
        try:
            return self.memo[args]
        except KeyError:
            value = self.function(*args)
            self.memo[args] = value
            return value


# All records flagged DELETED (980 $a); subtracted from every search result.
INTBITSET_OF_DELETED_RECORDS = search_unit(p="DELETED", f="980", m="a")

# Pre-compiled check for the standard journal pubinfo form (hoisted once).
re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)


def get_recids_matching_query(p, f, m="e"):
    """Return the set of recIDs matching pattern *p* in field *f*,
    with records marked DELETED filtered out."""
    matching = search_pattern(p=p, f=f, m=m)
    return matching - INTBITSET_OF_DELETED_RECORDS


def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
Пример #25
0
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record (999C5, repeatable subfields)
    as an HTML table, one row per reference.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=1, repeatable_subfields_p=True)

    out = ""
    last_o = ""

    if not references:
        return out

    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')

        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # Show the reference label ($o) only once per run of identical labels.
        if reference.has_key('o') and not reference['o'][0] == last_o:
            temp_ref = reference['o'][0].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")

        if reference_prefix:
            ref_out.append(reference_prefix)

        if reference.has_key('s'):
            display_journal = reference['s'][0]
            clean_journal = reference['s'][0]
        if reference.has_key('r'):
            # Strip a trailing "[...]" qualifier from the report number.
            if "[" in reference['r'][0] and "]" in reference['r'][0]:
                breaknum = reference['r'][0].find('[')
                newreference = reference['r'][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r'][0]
                clean_report = reference['r'][0]
        # Try to locate the cited record: report number, then journal
        # pubnote, then a free search on $a, then the stored recid ($0);
        # exactly one hit counts as a match.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits) != 1:
            hits = search_unit(p=reference['a'][0])
        if reference.has_key('0') and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference['0'][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            # Unique match: render the cited record with the 'hs' format.
            ref_out.append('<small>' + format_record(list(hits)[0], 'hs') +
                           '</small>')
        else:
            # No unique match: render the raw subfields we have.
            if reference.has_key('h'):
                ref_out.append("<small> " + reference['h'][0] + ".</small>")
            if reference.has_key('t'):
                ref_out.append("<small> " + reference['t'][0] + "</small> -")
            if reference.has_key('y'):
                ref_out.append("<small> " + reference['y'][0] + ".</small>")
            if reference.has_key('p'):
                ref_out.append("<small> " + reference['p'][0] + ".</small>")
            if reference.has_key('m'):
                ref_out.append("<small> " +
                               reference['m'][0].replace(']]', ']') +
                               ".</small>")
            if reference.has_key('a'):
                ref_out.append("<small> <a href=\"http://dx.doi.org/" + \
                reference['a'][0] + "\">" + reference['a'][0]+ "</a></small>")
            if reference.has_key('u'):
                ref_out.append("<small> <a href=" + reference['u'][0] + ">" + \
                reference['u'][0]+ "</a></small>")
            if reference.has_key('i'):
                for r in reference['i']:
                    ref_out.append(
                        "<small> <a href=\"/search?ln=en&amp;p=020__a%3A" + r +
                        "\">" + r + "</a></small>")

            ref_out.append('<small>')
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(' ' + display_report)
            ref_out.append("</small>")

        if reference_suffix:
            ref_out.append(reference_suffix)

        ref_out.append("</td></tr>")
        out += ' '.join(ref_out)

    return out + "</table>"
Пример #26
0
from hep_ads_xml_input import ARXIV_REGEX, ARXIV_REGEX_NEW
from hep_compare_arxiv_inspire_input import IGNORE_EPRINTS, UK_TO_US

# Log to tmp_<script-name>.log, derived from this script's file name.
LOGFILE = 'tmp_' + __file__
# Anchor and escape the extension: the original unanchored '.py' pattern
# (where '.' matches any character) could also rewrite e.g. 'copy' or
# '_py' occurring inside the file name.
LOGFILE = re.sub(r'\.py$', '.log', LOGFILE)
logging.basicConfig(filename=LOGFILE,
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)

INPUT_FILE = 'tmp_hep_ads_xml_missing_eprint.in'
MAX_COUNT = 10
URL_BASE = 'http://export.arxiv.org/api/query?id_list='
# DOI syntax: '10.' + 4-9 digit registrant code + '/' + suffix.
# The dot after 10 is now escaped; the original '^10.' also matched '10X'.
DOI_REGEX = re.compile(r'^10\.\d{4,9}/\S+$')

# Records flagged DELETED in any 980 subfield; excluded from results.
DELETED = search_unit(p='DELETED', m='a', f='980*')


def clean_eprint(eprint):
    '''Strip a leading "arXiv:" prefix (any capitalisation) from eprint.'''

    if eprint[:6].lower() == 'arxiv:':
        return eprint[6:]
    return eprint


def create_xml(recid, input_dict):
    '''Create marcxml file from.'''

    record = {}
    record_add_field(record, '001', controlfield_value=str(recid))
def examine(field_search):
    field = field_search[0]
    search = field_search[1]
    collection = field_search[2]
    core = perform_request_search(p='980:CORE', cc='HEP')
    search_theory = 'find fc p or fc t or fc l or fc n or fc g 980:core'
    search_core = '980:core'
    core = perform_request_search(p=search_core, cc='HEP')
    if re.search(r'541.*', field):
        result = search_unit(p = search, m = 'a', f = field)
        #result = result & get_collection_reclist('HEP')
        result = result & intbitset(core)
    else:
        if not re.search(r'\:', search):
            search = field + ':' + search
        result = perform_request_search(p = search, cc = collection)
        if collection == 'HEP':
            result = intbitset(result) & intbitset(core)
    if VERBOSE:
        print 'VERBOSE', field, search, collection, len(result)
    already_seen_field_values = []
    for recid in result:
        recid_print = ""
        field_values = get_fieldvalues(recid, field)
        for field_value in field_values:
            bad_id = False
            if field_value in already_seen_field_values:
                continue
            if re.search(r'INSPIRE', field_value):
                inspire_form = r'^INSPIRE-\d{8}$'
                if not re.match(inspire_form, field_value):
                    print 'Bad INSPIRE ID: ', field_value
                    bad_id = True
            elif re.search(r'^0000-', field_value):
                orcid_form = r'^0000-\d{4}-\d{4}-\d{3}[\dX]$'
                if not re.match(orcid_form, field_value):
                    print 'Bad ORCID ID: ', field_value
                    bad_id = True
            search_dup = '{0}:"{1}"'.format(field, field_value)
            if field == '371__m' or field == '541__b':
                search_dup = email_search(field_value)
                if re.search(r"\'", field_value):
                    field_value_mod = \
                        re.sub(r"\'", r".", field_value)
                    search_dup = email_search(field_value_mod)
            elif collection == 'HEP':
#field == '541__a' or field == '100__j' \
#                                   or field == '700__j':
                ignore = r'(CCID|JACoW|uid|arxiv)'
                if re.search(ignore, field_value):
                    continue
                field_value = re.sub(r'^\w+:', r'', field_value)
                if not field_value:
                    continue
                search_dup = '035__a:' + field_value
                #collection = 'HepNames'
            if collection == 'HEP':
#field == '541__a' or field == '541__b':
                recid_print = "http://inspirehep.net/record/" \
                              + str(recid) + "/export/xm"
            #print search_dup
            if field_value in already_seen_field_values:
                continue
            result_dup =  perform_request_search(p = search_dup, \
                              cc = 'HepNames')
            if len(result_dup) != 1 or bad_id:
                if field == '100__a':
                    for recid_dup in result_dup:
                        author_id = \
                            find_inspire_id_from_record(recid_dup)
                        print '{0:11d} {1:40s} {2:20s}'.\
                              format(recid_dup, field_value, author_id)
                else:
                    if len(result_dup) == 0:
                        print_field_value = field_value
                        if collection == 'HEP' and \
                               re.search(r'^0000-', field_value):
                            print_field_value = 'http://orcid.org/' + \
                                          field_value
                        print '{0:40s} {1:30s}'. \
                              format(print_field_value, recid_print)
                    else:
                        print search_dup, recid_print, result_dup, bad_id
                if field == '035__a' and field_value:
                    author_search = r'100__a:"{0}" or 700__a:"{0}"'
                    search_hep = author_search.format(field_value)
                    result_hep = perform_request_search(p = search_hep, \
                        cc = 'HEP')
                    if len(result_hep) > 0:
                        print 'Bad ID in HEP', search_hep, \
                            len(result_hep)
            already_seen_field_values.append(field_value)
from invenio.dbquery import run_sql, serialize_via_marshal, \
                            deserialize_via_marshal
from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM
from invenio.search_engine import search_pattern, search_unit
from invenio.search_engine_utils import get_fieldvalues
from invenio.bibformat_utils import parse_tag
from invenio.bibknowledge import get_kb_mappings
from invenio.bibtask import write_message, task_get_option, \
                     task_update_progress, task_sleep_now_if_required, \
                     task_get_task_param
from invenio.errorlib import register_exception
from invenio.bibindex_engine import get_field_tags
from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK


# Records flagged DELETED in MARC field 980 are excluded from every
# citation query below.
INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a')

# Pre-compiled pattern used to validate journal pubinfo reference strings.
re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK)

def get_recids_matching_query(p, f, m='e'):
    """Return set of recIDs matching query for pattern p in field f.

    Records flagged as DELETED are filtered out of the result.
    """
    hits = search_pattern(p=p, f=f, m=m)
    return hits - INTBITSET_OF_DELETED_RECORDS


def get_citation_weight(rank_method_code, config, chunk_size=20000):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()

    quick = task_get_option("quick") != "no"
Пример #29
0
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """

    from invenio.search_engine import search_unit
    from invenio.bibformat import format_record

    out = ""
    for ref in bfo.fields("999C5", escape=1):
        journal = ref.get('s', '')
        report = ref.get('r', '')

        # Resolve the cited record, trying the most specific identifier
        # first: report number, then journal pubinfo, then DOI.  Stop as
        # soon as exactly one record matches.
        matches = []
        if report:
            matches = search_unit(f='reportnumber', p=report)
        if journal and len(matches) != 1:
            matches = search_unit(f='journal', p=journal)
        if 'a' in ref and len(matches) != 1:
            matches = search_unit(f='doi', p=ref['a'])

        if len(matches) == 1:
            # Unique INSPIRE match: show its short HTML brief format.
            entry = '<small>' +\
                    format_record(list(matches)[0], 'hs') + '</small>'
        else:
            # No unique match: render the raw reference fields instead.
            parts = []
            if 'h' in ref:
                parts.append("<small> " + ref['h'] + ".</small> ")
            if 'm' in ref:
                parts.append("<small>" + ref['m'] + ".</small> ")
            if 'a' in ref:
                parts.append(" <small><a href=\"http://dx.doi.org/" +
                             ref['a'] + "\">" + ref['a'] + "</a></small> ")
            parts.append(' <small>')
            if journal:
                parts.append(journal)
            if report:
                parts.append(' ' + report)
            parts.append('<br /> <em>(not extracted or not in INSPIRE)</em></small>')
            entry = ''.join(parts)

        entry += "<br />"

        # entry is never empty at this point, but keep the original guards.
        if reference_prefix is not None and entry != '':
            entry = reference_prefix + entry
        if reference_suffix is not None and entry != '':
            entry += reference_suffix

        out += entry

    if out != '':
        out += '</li>'

    return out
Пример #30
0
                     task_get_task_param
from invenio.errorlib import register_exception
from invenio.intbitset import intbitset

class memoise:
    """Memoizing wrapper: cache a function's results keyed by its
    positional arguments.

    Arguments must be hashable; keyword arguments are not supported.
    The cache grows without bound for the wrapper's lifetime.
    """

    def __init__(self, function):
        # memo maps the args tuple to the previously computed result.
        self.memo = {}
        self.function = function

    def __call__(self, *args):
        # Use the `in` operator instead of the Python-2-only dict.has_key,
        # and avoid shadowing the builtin name `object`.
        if args not in self.memo:
            self.memo[args] = self.function(*args)
        return self.memo[args]

# Records flagged DELETED in MARC field 980; subtracted from search results.
INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a')

def get_recids_matching_query(pvalue, fvalue):
    """Return list of recIDs matching query for PVALUE and FVALUE."""
    hits = search_pattern(p=pvalue, f=fvalue, m='e')
    return list(hits - INTBITSET_OF_DELETED_RECORDS)
# Cache results per (pvalue, fvalue) pair for the lifetime of the process.
get_recids_matching_query = memoise(get_recids_matching_query)

def get_citation_weight(rank_method_code, config):
    """return a dictionary which is used by bibrank daemon for generating
    the index of sorted research results by citation information
    """
    begin_time = time.time()
    last_update_time = get_bibrankmethod_lastupdate(rank_method_code)

    if task_get_option("quick") == "no":
def ref_analyzer(citation_informations,
                 dicts,
                 updated_recids,
                 tags,
                 do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.

       :param citation_informations: pair (records_info, references_info);
           each maps a field kind ('report-numbers', 'journals', 'doi')
           to a dict of recid -> list of values
       :param dicts: running citation dictionaries ('cites_weight',
           'cites', 'refs', 'selfcites', 'selfrefs', 'authorcites'),
           mutated in place
       :param updated_recids: recids whose citation data is recomputed
       :param tags: MARC tag names used for the reverse ("catchup") phases
       :param do_catchup: when True, stale 'cites' entries of
           updated_recids are dropped before being refilled
       :return: (citations_weight, citations, references, selfcites,
           selfrefs, authorcites)
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Progress helper: yield to the task scheduler every 30 records
        # and log/update progress every 1000 records.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record "citer cites cited" in the citation, weight and
        # reference dictionaries, skipping duplicates.
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    # References of updated records are always recomputed, regardless of
    # do_catchup.
    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            # NOTE(review): unlike phases 1 and 3 this calls search_unit
            # directly instead of get_recids_matching_query — confirm
            # whether bypassing the cached/helper path is intentional.
            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv identifiers inside references may carry a trailing
                # "[category]" qualifier, so match them by regexp ('r')
                # instead of exact phrase ('e').
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    # (keys() returns a list snapshot here, so deleting while iterating
    # over it is safe)
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5 - t4))
    write_message("... checking rec journals: %.2f sec" % (t6 - t5))
    write_message("... checking rec DOI: %.2f sec" % (t7 - t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7 - t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites
Пример #32
0
def ref_analyzer(citation_informations, initialresult, initial_citationlist,
                 initial_referencelist, config, updated_rec_list):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.

       :param citation_informations: 4-tuple of dicts
           (record report numbers, reference report numbers,
           reference pubinfo strings, record pubinfo strings),
           each keyed by recid
       :param initialresult: dict recid -> citation count, updated in place
       :param initial_citationlist: dict recid -> list of citing recids
       :param initial_referencelist: dict recid -> list of cited recids
       :param config: rank-method ConfigParser with the MARC tag settings
       :param updated_rec_list: recids whose self/author citations are
           recomputed
       :return: the updated citation-count dict (empty dict on config
           errors)
    """
    function = ""
    # NOTE(review): the bare "except:" clauses below also swallow
    # KeyboardInterrupt/SystemExit; narrowing them would be safer.
    try:
        function = config.get("rank_method", "function")
    except:
        register_exception(
            prefix="cfg section [rank_method] has no attr function",
            alert_admin=True)
        return {}

    pubrefntag = ""
    try:
        pubrefntag = config.get(function, "reference_via_report_number")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_report_number",
                           alert_admin=True)
        return {}

    pubreftag = ""
    try:
        pubreftag = config.get(function, "reference_via_pubinfo")
    except:
        register_exception(prefix="cfg section " + function +
                           " has no attr reference_via_pubinfo",
                           alert_admin=True)
        return {}

    #pubrefntag is often 999C5r, pubreftag 999C5s
    if task_get_task_param('verbose') >= 9:
        write_message("pubrefntag " + pubrefntag)
        write_message("pubreftag " + pubreftag)

    citation_list = initial_citationlist
    reference_list = initial_referencelist
    result = initialresult
    d_reports_numbers = citation_informations[
        0]  #dict of recid -> institute_give_publ_id
    d_references_report_numbers = citation_informations[
        1]  #dict of recid -> ['astro-ph/xyz'..]
    d_references_s = citation_informations[2]
    #dict of recid -> publication_infos_in_its_bibliography
    d_records_s = citation_informations[3]  #recid -> its publication inf
    t1 = os.times()[4]

    write_message("Phase 1: d_references_report_numbers")
    #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    #meaning: rec 8 contains these in bibliography

    done = 0
    numrecs = len(d_references_report_numbers)
    for thisrecid, refnumbers in d_references_report_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_report_numbers done " + str(
                done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            #write to db!
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            #it's ok to sleep too, we got something done
            task_sleep_now_if_required()
        done = done + 1

        for refnumber in refnumbers:
            if refnumber:
                p = refnumber
                f = 'reportnumber'
                #sanitise p
                # NOTE(review): str.replace returns a new string and the
                # result is discarded here, so this line has no effect.
                p.replace("\n", '')
                #search for "hep-th/5644654 or such" in existing records
                rec_ids = get_recids_matching_query(p, f)
                if rec_ids and rec_ids[0]:
                    write_citer_cited(thisrecid, rec_ids[0])
                    remove_from_missing(p)
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    # Citation list should have rec_ids[0] but check anyway
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    #append unless this key already has the item
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(thisrecid)
                        #and update result
                        result[rec_ids[0]] += 1

                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
                else:
                    #the reference we wanted was not found among our records.
                    #put the reference in the "missing".. however, it will look
                    #bad.. gfhgf/1254312, so  get the corresponding 999C5s (full ref) too
                    #This should really be done in the next loop d_references_s
                    #but the 999C5s fields are not yet normalized

                    #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1])
                    rectext = ""  # print_record() call disabled to speed things up
                    lines = rectext.split("\n")
                    rpart = p  #to be used..
                    # With rectext always empty this loop never finds
                    # anything and rpart stays equal to p.
                    for l in lines:
                        if (
                                l.find(p) > 0
                        ):  #the gfhgf/1254312 was found.. get the s-part of it
                            st = l.find('$s')
                            if (st > 0):
                                end = l.find('$', st)
                                if (end == st):
                                    end = len(l)
                                rpart = l[st + 2:end]
                    insert_into_missing(thisrecid, rpart)

    mesg = "d_references_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: d_references_s")
    done = 0
    numrecs = len(d_references_s)
    for thisrecid, refss in d_references_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_references_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
            #write to db!
            insert_into_cit_db(reference_list, "reversedict")
            insert_into_cit_db(citation_list, "citationdict")
            task_sleep_now_if_required()

        done = done + 1

        for refs in refss:
            if refs:
                p = refs
                #remove the latter page number if it is like 67-74
                matches = re.compile("(.*)(-\d+$)").findall(p)
                if matches and matches[0]:
                    p = matches[0][0]
                rec_id = None
                try:
                    rec_ids = list(search_unit(p, 'journal'))
                except:
                    rec_ids = None
                # NOTE(review): this logs rec_id, which is always None
                # here — rec_ids was probably intended.
                write_message("These match searching " + p + " in journal: " +
                              str(rec_id),
                              verbose=9)
                if rec_ids and rec_ids[0]:
                    #the refered publication is in our collection, remove
                    #from missing
                    remove_from_missing(p)
                else:
                    #it was not found so add in missing
                    insert_into_missing(thisrecid, p)
                #check citation and reference for this..
                if rec_ids and rec_ids[0]:
                    #the above should always hold
                    if not result.has_key(rec_ids[0]):
                        result[rec_ids[0]] = 0
                    if not citation_list.has_key(rec_ids[0]):
                        citation_list[rec_ids[0]] = []
                    if not thisrecid in citation_list[rec_ids[0]]:
                        citation_list[rec_ids[0]].append(
                            thisrecid)  #append actual list
                        result[rec_ids[0]] += 1  #add count for this..

                    #update reference_list accordingly
                    if not reference_list.has_key(thisrecid):
                        reference_list[thisrecid] = []
                    if not rec_ids[0] in reference_list[thisrecid]:
                        reference_list[thisrecid].append(rec_ids[0])
    mesg = "d_references_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]
    done = 0
    numrecs = len(d_reports_numbers)
    write_message("Phase 3: d_reports_numbers")

    #search for stuff like CERN-TH-4859/87 in list of refs
    for thisrecid, reportcodes in d_reports_numbers.iteritems():
        if (done % 1000 == 0):
            mesg = "d_report_numbers done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1

        for reportcode in reportcodes:
            if reportcode:
                rec_ids = []
                try:
                    rec_ids = get_recids_matching_query(reportcode, pubrefntag)
                except:
                    rec_ids = []

                if rec_ids:
                    for recid in rec_ids:
                        #normal checks..
                        if not citation_list.has_key(thisrecid):
                            citation_list[thisrecid] = []
                        if not reference_list.has_key(recid):
                            reference_list[recid] = []
                        if not result.has_key(thisrecid):
                            result[thisrecid] = 0

                        #normal updates
                        if not recid in citation_list[thisrecid]:
                            result[thisrecid] += 1
                            citation_list[thisrecid].append(recid)
                        if not thisrecid in reference_list[recid]:
                            reference_list[recid].append(thisrecid)

    mesg = "d_report_numbers done fully"
    write_message(mesg)
    task_update_progress(mesg)

    #find this record's pubinfo in other records' bibliography
    write_message("Phase 4: d_records_s")
    done = 0
    numrecs = len(d_records_s)
    t4 = os.times()[4]
    for thisrecid, recs in d_records_s.iteritems():
        if (done % 1000 == 0):
            mesg = "d_records_s done " + str(done) + " of " + str(numrecs)
            write_message(mesg)
            task_update_progress(mesg)
        done = done + 1
        p = recs.replace("\"", "")
        #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s
        rec_ids = list(search_unit(f=pubreftag, p=p, m='a'))
        write_message("These records match " + p + " in " + pubreftag + " : " +
                      str(rec_ids),
                      verbose=9)
        if rec_ids:
            for rec_id in rec_ids:
                #normal checks
                if not result.has_key(thisrecid):
                    result[thisrecid] = 0
                if not citation_list.has_key(thisrecid):
                    citation_list[thisrecid] = []
                if not reference_list.has_key(rec_id):
                    reference_list[rec_id] = []

                if not rec_id in citation_list[thisrecid]:
                    result[thisrecid] += 1
                    citation_list[thisrecid].append(rec_id)
                if not thisrecid in reference_list[rec_id]:
                    reference_list[rec_id].append(thisrecid)

    mesg = "d_records_s done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 5: reverse lists")

    #remove empty lists in citation and reference
    keys = citation_list.keys()
    for k in keys:
        if not citation_list[k]:
            del citation_list[k]

    keys = reference_list.keys()
    for k in keys:
        if not reference_list[k]:
            del reference_list[k]

    write_message("Phase 6: self-citations")
    selfdic = {}
    #get the initial self citation dict
    initial_self_dict = get_cit_dict("selfcitdict")
    selfdic = initial_self_dict
    #add new records to selfdic
    acit = task_get_option("author-citations")
    if not acit:
        write_message(
            "Self cite processing disabled. Use -A option to enable it.")
    else:
        write_message("self cite and author citations enabled")
        selfdic = get_self_citations(updated_rec_list, citation_list,
                                     initial_self_dict, config)
    #selfdic consists of
    #key k -> list of values [v1,v2,..]
    #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2..

    #create a reverse "x cited by y" self cit dict
    selfcitedbydic = {}
    for k in selfdic.keys():
        vlist = selfdic[k]
        for v in vlist:
            if selfcitedbydic.has_key(v):
                tmplist = selfcitedbydic[v]
                if not k in tmplist:
                    tmplist.append(k)
            else:
                tmplist = [k]
            selfcitedbydic[v] = tmplist

    write_message("Getting author citations")

    #get author citations for records in updated_rec_list
    initial_author_dict = get_initial_author_dict()
    authorcitdic = initial_author_dict
    acit = task_get_option("author-citations")
    if not acit:
        print "Author cites disabled. Use -A option to enable it."
    else:
        write_message("author citations enabled")
        authorcitdic = get_author_citations(updated_rec_list, citation_list,
                                            initial_author_dict, config)

    if task_get_task_param('verbose') >= 3:
        #print only X first to prevent flood
        tmpdict = {}
        tmp = citation_list.keys()[0:10]
        for t in tmp:
            tmpdict[t] = citation_list[t]
        write_message("citation_list (x is cited by y): " + str(tmpdict))
        write_message("size: " + str(len(citation_list.keys())))
        tmp = reference_list.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = reference_list[t]
        write_message("reference_list (x cites y): " + str(tmpdict))
        write_message("size: " + str(len(reference_list.keys())))
        tmp = selfcitedbydic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfcitedbydic[t]
        mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):"
        mesg += str(tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfcitedbydic.keys())))
        tmp = selfdic.keys()[0:100]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = selfdic[t]
        mesg = "selfdic (x cites y and one of the authors of x same as y's): " + str(
            tmpdict)
        write_message(mesg)
        write_message("size: " + str(len(selfdic.keys())))
        tmp = authorcitdic.keys()[0:10]
        tmpdict = {}
        for t in tmp:
            tmpdict[t] = authorcitdic[t]
        write_message("authorcitdic (author is cited in recs): " +
                      str(tmpdict))
        write_message("size: " + str(len(authorcitdic.keys())))
    insert_cit_ref_list_intodb(citation_list, reference_list, selfcitedbydic,
                               selfdic, authorcitdic)

    t5 = os.times()[4]

    write_message(
        "Execution time for analyzing the citation information generating the dictionary:"
    )
    write_message("... checking ref number: %.2f sec" % (t2 - t1))
    write_message("... checking ref ypvt: %.2f sec" % (t3 - t2))
    write_message("... checking rec number: %.2f sec" % (t4 - t3))
    write_message("... checking rec ypvt: %.2f sec" % (t5 - t4))
    write_message("... total time of ref_analyze: %.2f sec" % (t5 - t1))

    return result
def deleted_recids_cache(cache={}):
    """Return the set of recids marked DELETED (980__a:DELETED).

    The mutable default argument is used deliberately as a per-process
    memo: the search is executed at most once and the result is reused
    on every subsequent call.
    """
    try:
        return cache['deleted_records']
    except KeyError:
        cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a')
        return cache['deleted_records']
def ref_analyzer(citation_informations, dicts,
                 updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
       and cited by list dictionary.

    citation_informations -- 2-tuple (records_info, references_info); each
        element is a dict keyed by identifier kind ('report-numbers',
        'journals', 'doi') mapping recid -> list of identifier strings.
        records_info describes the records themselves, references_info
        describes the entries found in their bibliographies.
    dicts -- dict holding the six accumulator dictionaries under the keys
        'cites_weight', 'cites', 'refs', 'selfcites', 'selfrefs' and
        'authorcites'.  The first three are updated in place below; the
        self-citation dicts are only read back for verbose reporting here.
    updated_recids -- recids whose citation data must be recomputed; their
        stale entries are purged in Phase 0.
    tags -- dict of index/field names used by the catchup phases
        ('refs_report_number', 'refs_journal', 'refs_doi').
    do_catchup -- when True, also purge the stale 'cites' entries of
        updated records (their 'refs' entries are always purged).

    Returns (citations_weight, citations, references, selfcites,
    selfrefs, authorcites).
    """
    # Aliases into the shared accumulator dicts; all mutations below are
    # visible to the caller through `dicts`.
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Per-record housekeeping: yield to the task scheduler every 30
        # records and report progress every 1000.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record a single citation edge (citer cites cited) in both the
        # forward (references) and reverse (citations) maps, keeping the
        # per-record weight counter in sync with the citations dict.
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    # os.times()[4] is elapsed real time; t1..t7 bracket the phases for the
    # timing report at the end.
    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    # References of updated records are always rebuilt, even without catchup.
    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Phases 1-3 walk the bibliographies of the updated records and resolve
    # each reference string to a recid: edge direction is thisrecid -> recid.

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
                                        len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                                   (refnumber, field, list(recids)), verbose=9)

            # Track unresolved references so they can be reported/linked later.
            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
                                              len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            # Unlike phases 1/3 this goes through search_unit directly and
            # filters deleted records explicitly.
            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                                 % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Phases 4-6 flip direction: search the updated records' own identifiers
    # inside OTHER records' bibliographies, so the edge is recid -> thisrecid.

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
                                           len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # Regexp match ('r') tolerates a trailing category suffix
                # such as " [hep-th]" after the standardized arXiv number.
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                                re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            # NOTE(review): no explicit "- INTBITSET_OF_DELETED_RECORDS"
            # here, unlike phases 5/6 — presumably get_recids_matching_query
            # already excludes deleted records; confirm against its
            # definition.
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
                                                 len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            # Strip embedded double quotes before searching.
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                    % (journal, tags['refs_journal'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                                                - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                            % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    # (materialize keys() first because entries are deleted while looping —
    # required for dict mutation safety; keys() is already a list in
    # Python 2).
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))
        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))
        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))
        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))
        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
                                                        selfrefs, authorcites
# Example #35
def deleted_recids_cache(cache={}):
    """Return the set of recids marked DELETED (980__a:DELETED).

    Uses the default-argument dict as a process-wide memo so the
    underlying search runs only on the first call.
    """
    if 'deleted_records' in cache:
        return cache['deleted_records']
    recids = search_unit(p='DELETED', f='980', m='a')
    cache['deleted_records'] = recids
    return recids