def check_doi_eprint(identifier):
    """Check to see if we already have a DOI eprint pairing.

    For a DOI we look for the record also being in the eprint set; for
    an eprint we look for the record also being in the DOI set.
    Returns True when the pairing exists, otherwise None.
    """
    if identifier.startswith('10.'):
        hits = search_unit(identifier, f='0247_a', m='a')
        counterpart = INSPIRE_EPRINT_RECIDS
    elif re.match(ARXIV_REGEX, identifier) or re.match(ARXIV_REGEX_NEW, identifier):
        prefix = 'arXiv:' if re.match(ARXIV_REGEX_NEW, identifier) else ''
        hits = search_unit(prefix + identifier, f='037__a', m='a')
        counterpart = INSPIRE_DOI_RECIDS
    else:
        # Not a DOI and not an eprint: nothing to check.
        return None
    # Anything touching a deleted record is inconclusive.
    if hits & DELETED:
        return None
    # Cache the unique identifier -> recid mapping for later lookups.
    if len(hits) == 1:
        INSPIRE_IDENTIFIER_RECID_DICT[identifier] = hits[0]
    if len(hits & counterpart):
        return True
    return None
def get_hepnames_recid_from_email(email): """ Find the HEPNames recid based on email """ if email not in EMAILS_HEPNAMES: if VERBOSE: print "WARNING: no hepnames record found for %s: " % (email) return None emailsearch = '371__m:%s or 371__o:%s' reclist = perform_request_search(p=emailsearch % (email, email), cc='HepNames') hidden_m = search_unit(email, f='595__m', m='a') hidden_o = search_unit(email, f='595__o', m='a') reclist_hidden = hidden_m or hidden_o & HN reclist = intbitset(reclist) or reclist_hidden if len(reclist) == 1: return reclist[0] elif len(reclist) > 1: if VERBOSE: print "WARNING: more than one hepnames record found for %s: " \ % (email) print '\t' + ', '.join([str(r) for r in reclist]) return [r for r in reclist] else: if VERBOSE: print "WARNING: no hepnames record found for %s: " % (email) return None
def get_recid_from_inspire(id_string): ''' Takes an ID string and returns an INSPIRE recid or it returns None. ''' id_string = str(id_string) id_string = clean_eprint(id_string) if ARXIV_REGEX.match(id_string): field = '037__a' elif ARXIV_REGEX_NEW.match(id_string): field = '037__a' id_string = 'arXiv:' + id_string elif DOI_REGEX.match(id_string): field = '0247_a' elif id_string.isdigit(): field = '001' else: logging.info('Unknown ID: ' + id_string) return False result = search_unit(p=id_string, f=field, m='a') - DELETED if len(result) > 1: print 'Duplicate: {0} {1}'.format(id_string, result) quit() if len(result) == 1: return str(result[0]) return None
def recid_from_doi(doi):
    """Find if we have a DOI."""
    hits = search_unit(p=doi, f='0247*', m='a')
    # Empty result set means the DOI is unknown to us.
    if hits:
        return hits[0]
    return None
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records that now matches a record will need to be re-exported.abs
    with_claims: yes/no, whether record involved in some new claim need to be re-exported.
    skip_collections: comma-separated-lists of values for which records having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql(
                "SELECT id FROM bibrec WHERE modification_date>=%s",
                (last_run, )))
            # 005 holds the modification timestamp in compact form; also pick
            # up records whose 005 changed without a bibrec timestamp change.
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                # Re-export records that gained a citation since last run.
                for citee, citer in run_sql(
                        "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                        (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                # Re-export records involved in new claims / renamed profiles.
                modified_records |= intbitset(run_sql(
                    "SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                    (last_run, )))
                modified_records |= intbitset(run_sql(
                    'SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                    ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s',
                    (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # BUGFIX: ''.split(',') yields [''] but 'HEP'.split(',') yields ['HEP'],
    # so the old unconditional skip_collections.remove('') raised ValueError
    # whenever a non-empty skip_collections was passed.  Filter instead.
    skip_collections = [collection for collection in skip_collections.split(',') if collection]
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        # Only advance the watermark if the redis queue actually accepted
        # the batch, so a skipped run is retried next time.
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
def get_hitset(self, key, pos=None):
    """
    perform key appropriate search and return hitlist
    """
    if not self._fields[key]:
        return intbitset()
    # pos may select a sub-range of the stored values; default is all.
    window = slice(None, None) if pos is None else pos
    # Each key maps to the search call appropriate for that identifier type.
    searchers = {
        'pubnote': lambda value: search_pattern(f='journal', p=value, ap=1),
        'repno': lambda value: search_unit(f='reportnumber', p=value),
        'DOI': lambda value: search_unit(f='doi', p=value, m='a'),
    }
    hits = intbitset()
    searcher = searchers.get(key)
    if searcher is not None:
        for value in self._fields[key][window]:
            hits |= searcher(value)
    # Restrict to HEP records only.
    return hits & HEPRECS
def _get_unique_recid_for(journal='', report='', doi=''): """Return the recid for this set of identifiers. If no recid can be found or if too many are found, returns 0 to indicate failure. """ hits = [] if journal: hits = search_unit(f='journal', p=journal) if report and len(hits) != 1: hits = search_unit(f='reportnumber', p=report) if doi and len(hits) != 1: hits = search_unit(f='doi', p=doi) if len(hits) > 1: # FIXME: should throw exception or maybe show multiple possibilities return 0 elif len(hits) == 1: return hits.pop() else: return 0
def citationloss(exactauthor, startdate):
    """Return (citer, citee, action_date) rows for citations this author lost.

    :param exactauthor: exact author name searched in the exactauthor index
    :param startdate: only citation-log entries strictly after this date count
    :return: rows from rnkCITATIONLOG, or None when the author lost nothing
    """
    recordsofauthor = search_unit(exactauthor, f='exactauthor')
    # SECURITY/robustness fix: pass the date as a bound parameter instead of
    # interpolating it into the SQL string ('"%s"' % startdate), which was
    # open to SQL injection and broke on values containing quotes.
    removedcitations = intbitset(
        [row[0] for row in run_sql(
            'select citee from rnkCITATIONLOG where action_date>%s',
            (startdate, ))])
    lossoverlap = recordsofauthor & removedcitations
    if not lossoverlap:
        return None
    # The IN-list is built from recids only; int() guarantees each element
    # is a plain integer, so the interpolation cannot inject SQL.
    placeholders = ', '.join(str(int(recid)) for recid in lossoverlap)
    return run_sql(
        'select citer,citee,action_date from rnkCITATIONLOG'
        ' where citee in (' + placeholders + ') and action_date>%s',
        (startdate, ))
def get_jacow_dois(): """Return all the JACoW DOIs INSPIRE has.""" jacow_dois_record = set() for doi in get_all_field_values('0247_a'): if doi.startswith('10.18429/JACoW-'): jacow_dois_record.add('doi:' + doi) jacow_dois_ref = set() for doi in get_all_field_values('999C5a'): if doi.startswith('doi:10.18429/JACoW-'): jacow_dois_ref.add(doi) missing_dois = jacow_dois_ref - jacow_dois_record if not missing_dois: return jacow_dois_record for doi in sorted(missing_dois): if good_doi(doi): search_unit('doi', f='0247_2', m='a') doi = doi.replace('doi:', '') if search_unit(doi, f='0247_2', m='a'): continue print 'https://doi.org/{0}'.format(doi) sys.exit()
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Return all records with identifiers to sync."""
    all_recids = get_all_recids()
    # Records already matched to the other site (035__9 tag or hidden 595__a).
    recids_with_other_id = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        recids_with_other_id |= search_unit(p='CDS-*', f='595__a', m='a')
    # Records carrying at least one exportable identifier (DOI or arXiv id).
    identified = search_pattern(p='doi:"**"') | search_pattern(p='035__9:"arXiv"')
    if since:
        all_recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date>=%s", (since, )))
    if unmatched_only:
        # (A & (B - C)) == (A & B) - C: identified records not yet matched.
        return (identified & all_recids) - recids_with_other_id
    return (identified | recids_with_other_id) & all_recids
def get_record_ids_to_export(unmatched_only=False, since=None):
    """Return all records with identifiers to sync.

    :param unmatched_only: when True, drop records that already carry the
        other site's identifier and return only identified-but-unmatched ones
    :param since: when set, restrict to records modified on/after this date
    :return: intbitset of recids
    """
    all_recids = get_all_recids()
    # Records already matched to the other site via 035__9.
    recids_with_other_id = search_pattern(p='035__9:%s' % CFG_OTHER_SITE)
    if CFG_INSPIRE_SITE:
        # On INSPIRE, hidden 595__a CDS-* notes also count as a match.
        recids_with_other_id |= search_unit(p='CDS-*', f='595__a', m='a')
    recids_with_a_doi = search_pattern(p='doi:"**"')
    recids_with_an_arxiv_id = search_pattern(p='035__9:"arXiv"')
    if since:
        # Restrict to recently modified records.
        modified_recids = intbitset(
            run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                    (since, )))
        all_recids = all_recids & modified_recids
    if unmatched_only:
        all_recids = all_recids - recids_with_other_id
        return (recids_with_a_doi | recids_with_an_arxiv_id) & all_recids
    else:
        return (recids_with_a_doi | recids_with_an_arxiv_id | recids_with_other_id) & all_recids
def main():
    """Dump up to five D0-PRELIMINARY-NOTE records as a MARC collection file."""
    file_name = 'tmp_' + __file__
    # BUGFIX: re.sub('.py', ...) treated '.' as a regex wildcard, so any
    # character followed by 'py' matched; escape the dot and anchor at the
    # end so only the '.py' extension is rewritten.
    file_name = re.sub(r'\.py$', '_correct.out', file_name)
    # Result unused; kept in case the search call has side effects -- TODO confirm.
    result_hep = perform_request_search(p=SEARCH, cc=SUBFILE)
    result = search_unit(p="*D0-PRELIMINARY-NOTE*", m='a', f='980*')
    result = result[:5]
    # 'with' guarantees the file is closed even if print_record() raises.
    with open(file_name, 'w') as output:
        output.write('<collection>')
        for recid in result:
            info = print_record(recid, ot=['001', '100', '700', '980'], format='hm')
            info = re.sub(r'code="e">FERMILAB-TEV-', r'', info)
            info = re.sub(r'<\/?pre[^\>]*>', r'', info)
            info += '\n\n'
            output.write(info)
        output.write('</collection>')
def main():
    """Write selected D0 preliminary-note records as a MARC collection file."""
    out_name = re.sub('.py', '_correct.out', 'tmp_' + __file__)
    out_file = open(out_name, 'w')
    out_file.write('<collection>')
    result_hep = perform_request_search(p=SEARCH, cc=SUBFILE)
    recids = search_unit(p="*D0-PRELIMINARY-NOTE*", m='a', f='980*')
    recids = recids[:5]
    for recid in recids:
        record_text = print_record(recid, ot=['001', '100', '700', '980'], format='hm')
        # Strip the FERMILAB-TEV- prefix and any surrounding <pre> markup.
        record_text = re.sub(r'code="e">FERMILAB-TEV-', r'', record_text)
        record_text = re.sub(r'<\/?pre[^\>]*>', r'', record_text)
        out_file.write(record_text + '\n\n')
    out_file.write('</collection>')
    out_file.close()
def format_element(bfo, reference_prefix, reference_suffix):
    """ Prints the references of this record

    Each 999C5 reference field becomes one table row.  The reference is
    resolved to an INSPIRE record via report number, journal pubnote,
    DOI ($a) or explicit recid ($0); when exactly one record matches,
    its brief 'hs' format is shown instead of the raw subfields.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=0, repeatable_subfields_p=True)
    out = ""
    last_o = ""
    if not references:
        return out
    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')
        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # $o is the reference label; suppress it when it repeats the
        # previous reference's label so grouped references share one.
        if reference.has_key('o') and not reference['o'][0] == last_o:
            temp_ref = reference['o'][0].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")
        if reference_prefix:
            ref_out.append(reference_prefix)
        if reference.has_key('s'):
            # $s: journal pubnote string.
            display_journal = reference['s'][0]
            clean_journal = reference['s'][0]
        if reference.has_key('r'):
            # $r: report number; strip a trailing "[...]" qualifier.
            if "[" in reference['r'][0] and "]" in reference['r'][0]:
                breaknum = reference['r'][0].find('[')
                newreference = reference['r'][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r'][0]
                clean_report = reference['r'][0]
        # Resolution cascade: each step runs only while we do not yet
        # have a unique hit.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits) != 1:
            hits = search_unit(p=reference['a'][0])
        if reference.has_key('0') and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference['0'][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            # Unique match: show the record's brief format instead.
            ref_out.append('<small>' + format_record(list(hits)[0], 'hs') + '</small>')
        else:
            # No unique match: render the raw subfields.
            if reference.has_key('h'):
                ref_out.append("<small> " + reference['h'][0] + ".</small>")
            if reference.has_key('t'):
                ref_out.append("<small> " + reference['t'][0] + "</small> -")
            if reference.has_key('y'):
                ref_out.append("<small> " + reference['y'][0] + ".</small>")
            if reference.has_key('p'):
                ref_out.append("<small> " + reference['p'][0] + ".</small>")
            if reference.has_key('m'):
                ref_out.append("<small> " + reference['m'][0].replace(']]', ']') + ".</small>")
            if reference.has_key('a'):
                ref_out.append("<small> <a href=\"http://dx.doi.org/" + \
                    reference['a'][0] + "\">" + reference['a'][0] + "</a></small>")
            if reference.has_key('u'):
                ref_out.append("<small> <a href=" + reference['u'][0] + ">" + \
                    reference['u'][0] + "</a></small>")
            if reference.has_key('i'):
                for r in reference['i']:
                    ref_out.append("<small> <a href=\"/search?ln=en&p=020__a%3A" + r + "\">" + r + "</a></small>")
            ref_out.append('<small>')
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(' ' + display_report)
            ref_out.append("</small>")
        if reference_suffix:
            ref_out.append(reference_suffix)
        ref_out.append("</td></tr>")
        out += ' '.join(ref_out)
    return out + "</table>"
def format_element(bfo, reference_prefix, reference_suffix):
    """ Prints the references of this record

    Variant of the references formatter: identical cascade, except the
    journal lookup uses search_pattern(..., ap=1) instead of search_unit.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=0, repeatable_subfields_p=True)
    out = ""
    last_o = ""
    if not references:
        return out
    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')
        display_journal = ""
        display_report = ""
        clean_report = ""
        clean_journal = ""
        hits = []
        # $o is the reference label; suppress repeats of the previous label.
        if reference.has_key("o") and not reference["o"][0] == last_o:
            temp_ref = reference["o"][0].replace(".", "")
            if "[" in temp_ref and "]" in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")
        if reference_prefix:
            ref_out.append(reference_prefix)
        if reference.has_key("s"):
            # $s: journal pubnote string.
            display_journal = reference["s"][0]
            clean_journal = reference["s"][0]
        if reference.has_key("r"):
            # $r: report number; strip a trailing "[...]" qualifier.
            if "[" in reference["r"][0] and "]" in reference["r"][0]:
                breaknum = reference["r"][0].find("[")
                newreference = reference["r"][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference["r"][0]
                clean_report = reference["r"][0]
        # Resolution cascade: report number, journal, DOI ($a), recid ($0);
        # each step runs only while we do not yet have a unique hit.
        if clean_report:
            hits = search_unit(f="reportnumber", p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_pattern(f="journal", p=clean_journal, ap=1)
        if reference.has_key("a") and len(hits) != 1:
            hits = search_unit(p=reference["a"][0])
        if reference.has_key("0") and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference["0"][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            # Unique match: show the record's brief format instead.
            ref_out.append("<small>" + format_record(list(hits)[0], "hs") + "</small>")
        else:
            # No unique match: render the raw subfields.
            if reference.has_key("h"):
                ref_out.append("<small> " + reference["h"][0] + ".</small>")
            if reference.has_key("t"):
                ref_out.append("<small> " + reference["t"][0] + "</small> -")
            if reference.has_key("y"):
                ref_out.append("<small> " + reference["y"][0] + ".</small>")
            if reference.has_key("p"):
                ref_out.append("<small> " + reference["p"][0] + ".</small>")
            if reference.has_key("m"):
                ref_out.append("<small> " + reference["m"][0].replace("]]", "]") + ".</small>")
            if reference.has_key("a"):
                ref_out.append(
                    '<small> <a href="http://dx.doi.org/'
                    + reference["a"][0]
                    + '">'
                    + reference["a"][0]
                    + "</a></small>"
                )
            if reference.has_key("u"):
                ref_out.append("<small> <a href=" + reference["u"][0] + ">" + reference["u"][0] + "</a></small>")
            if reference.has_key("i"):
                for r in reference["i"]:
                    ref_out.append('<small> <a href="/search?ln=en&p=020__a%3A' + r + '">' + r + "</a></small>")
            ref_out.append("<small>")
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(" " + display_report)
            ref_out.append("</small>")
        if reference_suffix:
            ref_out.append(reference_suffix)
        ref_out.append("</td></tr>")
        out += " ".join(ref_out)
    return out + "</table>"
data_set = generate_data(data_set) try: with open(filename, "wb") as file_handle: pickle.dump(data_set, file_handle) except pickle.PicklingError: print "Problem creating:", filename except pickle.UnpicklingError: print "Pickle problem for", filename return data_set INSPIRE_JOURNALS = get_data('INSPIRE_JOURNALS')[0] (INSPIRE_EPRINTS, INSPIRE_BIBCODES) = get_data('INSPIRE_EPRINTS') INSPIRE_DOIS = get_data('INSPIRE_DOIS')[0] INSPIRE_EPRINT_RECIDS = search_unit('arxiv', f='037__9', m='a') INSPIRE_DOI_RECIDS = search_unit('doi', f='0247_2', m='a') INSPIRE_IDENTIFIER_RECID_DICT = {} DELETED = search_unit(p='DELETED', m='a', f='980*') print 'Eprints', len(INSPIRE_EPRINTS), random.sample(INSPIRE_EPRINTS, 1) print 'Bibcodes', len(INSPIRE_BIBCODES), random.sample(INSPIRE_BIBCODES, 1) print 'DOIs', len(INSPIRE_DOIS), random.sample(INSPIRE_DOIS, 1) print 'Journals', len(INSPIRE_JOURNALS), random.sample(INSPIRE_JOURNALS, 1) def check_doi_eprint(identifier): "Check to see if we already have a DOI eprint pairing." if identifier.startswith('10.'): result = search_unit(identifier, f='0247_a', m='a')
def ref_analyzer(citation_informations, initialresult, initial_citationlist, initial_referencelist,config, updated_rec_list ): """Analyze the citation informations and calculate the citation weight and cited by list dictionary. """ function = "" try: function = config.get("rank_method", "function") except: register_exception(prefix="cfg section [rank_method] has no attr function", alert_admin=True) return {} pubrefntag = "" try: pubrefntag = config.get(function, "reference_via_report_number") except: register_exception(prefix="cfg section "+function+" has no attr reference_via_report_number", alert_admin=True) return {} pubreftag = "" try: pubreftag = config.get(function, "reference_via_pubinfo") except: register_exception(prefix="cfg section "+function+" has no attr reference_via_pubinfo", alert_admin=True) return {} #pubrefntag is often 999C5r, pubreftag 999C5s if task_get_task_param('verbose') >= 9: write_message("pubrefntag "+pubrefntag) write_message("pubreftag "+pubreftag) citation_list = initial_citationlist reference_list = initial_referencelist result = initialresult d_reports_numbers = citation_informations[0] #dict of recid -> institute_give_publ_id d_references_report_numbers = citation_informations[1] #dict of recid -> ['astro-ph/xyz'..] 
d_references_s = citation_informations[2] #dict of recid -> publication_infos_in_its_bibliography d_records_s = citation_informations[3] #recid -> its publication inf t1 = os.times()[4] write_message("Phase 0: temporarily remove changed records from citation dictionaries; they will be filled later") for somerecid in updated_rec_list: try: del citation_list[somerecid] except KeyError: pass try: del reference_list[somerecid] except KeyError: pass write_message("Phase 1: d_references_report_numbers") #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768]) #meaning: rec 8 contains these in bibliography done = 0 numrecs = len(d_references_report_numbers) for thisrecid, refnumbers in d_references_report_numbers.iteritems(): if (done % 1000 == 0): mesg = "d_references_report_numbers done "+str(done)+" of "+str(numrecs) write_message(mesg) task_update_progress(mesg) task_sleep_now_if_required() done = done+1 for refnumber in refnumbers: if refnumber: p = refnumber f = 'reportnumber' #sanitise p p.replace("\n",'') #search for "hep-th/5644654 or such" in existing records rec_ids = get_recids_matching_query(p, f) if rec_ids and rec_ids[0]: write_citer_cited(thisrecid, rec_ids[0]) remove_from_missing(p) if not result.has_key(rec_ids[0]): result[rec_ids[0]] = 0 # Citation list should have rec_ids[0] but check anyway if not citation_list.has_key(rec_ids[0]): citation_list[rec_ids[0]] = [] #append unless this key already has the item if not thisrecid in citation_list[rec_ids[0]]: citation_list[rec_ids[0]].append(thisrecid) #and update result result[rec_ids[0]] += 1 if not reference_list.has_key(thisrecid): reference_list[thisrecid] = [] if not rec_ids[0] in reference_list[thisrecid]: reference_list[thisrecid].append(rec_ids[0]) else: #the reference we wanted was not found among our records. #put the reference in the "missing".. however, it will look #bad.. 
gfhgf/1254312, so get the corresponding 999C5s (full ref) too #This should really be done in the next loop d_references_s #but the 999C5s fields are not yet normalized #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1]) rectext = "" # print_record() call disabled to speed things up lines = rectext.split("\n") rpart = p #to be used.. for l in lines: if (l.find(p) > 0): #the gfhgf/1254312 was found.. get the s-part of it st = l.find('$s') if (st > 0): end = l.find('$', st) if (end == st): end = len(l) rpart = l[st+2:end] insert_into_missing(thisrecid, rpart) mesg = "d_references_report_numbers done fully" write_message(mesg) task_update_progress(mesg) t2 = os.times()[4] #try to find references based on 999C5s, like Phys.Rev.Lett. 53 (1986) 2285 write_message("Phase 2: d_references_s") done = 0 numrecs = len(d_references_s) for thisrecid, refss in d_references_s.iteritems(): if (done % 1000 == 0): mesg = "d_references_s done "+str(done)+" of "+str(numrecs) write_message(mesg) task_update_progress(mesg) task_sleep_now_if_required() done = done+1 for refs in refss: if refs: p = refs #remove the latter page number if it is like 67-74 matches = re.compile("(.*)(-\d+$)").findall(p) if matches and matches[0]: p = matches[0][0] rec_id = None try: rec_ids = list(search_unit(p, 'journal') - INTBITSET_OF_DELETED_RECORDS) except: rec_ids = None write_message("These match searching "+p+" in journal: "+str(rec_id), verbose=9) if rec_ids and rec_ids[0]: #the refered publication is in our collection, remove #from missing remove_from_missing(p) else: #it was not found so add in missing insert_into_missing(thisrecid, p) #check citation and reference for this.. 
if rec_ids and rec_ids[0]: #the above should always hold if not result.has_key(rec_ids[0]): result[rec_ids[0]] = 0 if not citation_list.has_key(rec_ids[0]): citation_list[rec_ids[0]] = [] if not thisrecid in citation_list[rec_ids[0]]: citation_list[rec_ids[0]].append(thisrecid) #append actual list result[rec_ids[0]] += 1 #add count for this.. #update reference_list accordingly if not reference_list.has_key(thisrecid): reference_list[thisrecid] = [] if not rec_ids[0] in reference_list[thisrecid]: reference_list[thisrecid].append(rec_ids[0]) mesg = "d_references_s done fully" write_message(mesg) task_update_progress(mesg) t3 = os.times()[4] done = 0 numrecs = len(d_reports_numbers) write_message("Phase 3: d_reports_numbers") #search for stuff like CERN-TH-4859/87 in list of refs for thisrecid, reportcodes in d_reports_numbers.iteritems(): if (done % 1000 == 0): mesg = "d_report_numbers done "+str(done)+" of "+str(numrecs) write_message(mesg) task_update_progress(mesg) done = done+1 for reportcode in reportcodes: if reportcode: rec_ids = [] try: rec_ids = get_recids_matching_query(reportcode, pubrefntag) except: rec_ids = [] if rec_ids: for recid in rec_ids: #normal checks.. 
if not citation_list.has_key(thisrecid): citation_list[thisrecid] = [] if not reference_list.has_key(recid): reference_list[recid] = [] if not result.has_key(thisrecid): result[thisrecid] = 0 #normal updates if not recid in citation_list[thisrecid]: result[thisrecid] += 1 citation_list[thisrecid].append(recid) if not thisrecid in reference_list[recid]: reference_list[recid].append(thisrecid) mesg = "d_report_numbers done fully" write_message(mesg) task_update_progress(mesg) #find this record's pubinfo in other records' bibliography write_message("Phase 4: d_records_s") done = 0 numrecs = len(d_records_s) t4 = os.times()[4] for thisrecid, recs in d_records_s.iteritems(): if (done % 1000 == 0): mesg = "d_records_s done "+str(done)+" of "+str(numrecs) write_message(mesg) task_update_progress(mesg) done = done+1 p = recs.replace("\"","") #search the publication string like Phys. Lett., B 482 (2000) 417 in 999C5s rec_ids = list(search_unit(f=pubreftag, p=p, m='a') - INTBITSET_OF_DELETED_RECORDS) write_message("These records match "+p+" in "+pubreftag+" : "+str(rec_ids), verbose=9) if rec_ids: for rec_id in rec_ids: #normal checks if not result.has_key(thisrecid): result[thisrecid] = 0 if not citation_list.has_key(thisrecid): citation_list[thisrecid] = [] if not reference_list.has_key(rec_id): reference_list[rec_id] = [] if not rec_id in citation_list[thisrecid]: result[thisrecid] += 1 citation_list[thisrecid].append(rec_id) if not thisrecid in reference_list[rec_id]: reference_list[rec_id].append(thisrecid) mesg = "d_records_s done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 5: reverse lists") #remove empty lists in citation and reference keys = citation_list.keys() for k in keys: if not citation_list[k]: del citation_list[k] keys = reference_list.keys() for k in keys: if not reference_list[k]: del reference_list[k] write_message("Phase 6: self-citations") selfdic = {} #get the initial self citation dict initial_self_dict = 
get_cit_dict("selfcitdict") selfdic = initial_self_dict #add new records to selfdic acit = task_get_option("author-citations") if not acit: write_message("Self cite processing disabled. Use -A option to enable it.") else: write_message("self cite and author citations enabled") selfdic = get_self_citations(updated_rec_list, citation_list, initial_self_dict, config) #selfdic consists of #key k -> list of values [v1,v2,..] #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2.. #create a reverse "x cited by y" self cit dict selfcitedbydic = {} for k in selfdic.keys(): vlist = selfdic[k] for v in vlist: if selfcitedbydic.has_key(v): tmplist = selfcitedbydic[v] if not k in tmplist: tmplist.append(k) else: tmplist = [k] selfcitedbydic[v] = tmplist write_message("Getting author citations") #get author citations for records in updated_rec_list initial_author_dict = get_initial_author_dict() authorcitdic = initial_author_dict acit = task_get_option("author-citations") if not acit: print "Author cites disabled. Use -A option to enable it." 
else: write_message("author citations enabled") authorcitdic = get_author_citations(updated_rec_list, citation_list, initial_author_dict, config) if task_get_task_param('verbose') >= 3: #print only X first to prevent flood tmpdict = {} tmp = citation_list.keys()[0:10] for t in tmp: tmpdict[t] = citation_list[t] write_message("citation_list (x is cited by y): "+str(tmpdict)) write_message("size: "+str(len(citation_list.keys()))) tmp = reference_list.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = reference_list[t] write_message("reference_list (x cites y): "+str(tmpdict)) write_message("size: "+str(len(reference_list.keys()))) tmp = selfcitedbydic.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = selfcitedbydic[t] mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):" mesg += str(tmpdict) write_message(mesg) write_message("size: "+str(len(selfcitedbydic.keys()))) tmp = selfdic.keys()[0:100] tmpdict = {} for t in tmp: tmpdict[t] = selfdic[t] mesg = "selfdic (x cites y and one of the authors of x same as y's): "+str(tmpdict) write_message(mesg) write_message("size: "+str(len(selfdic.keys()))) tmp = authorcitdic.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = authorcitdic[t] write_message("authorcitdic (author is cited in recs): "+str(tmpdict)) write_message("size: "+str(len(authorcitdic.keys()))) insert_cit_ref_list_intodb(citation_list, reference_list, selfcitedbydic, selfdic, authorcitdic) t5 = os.times()[4] write_message("Execution time for analyzing the citation information generating the dictionary:") write_message("... checking ref number: %.2f sec" % (t2-t1)) write_message("... checking ref ypvt: %.2f sec" % (t3-t2)) write_message("... checking rec number: %.2f sec" % (t4-t3)) write_message("... checking rec ypvt: %.2f sec" % (t5-t4)) write_message("... total time of ref_analyze: %.2f sec" % (t5-t1)) return result
def format(bfo, reference_prefix, reference_suffix):
    """ Prints the references of this record

    References are emitted as <li> items (closed lazily when the next
    $o label appears and once more at the end).  Each reference is
    resolved by report number, then journal; a unique hit is shown in
    the record's brief 'hs' format, otherwise the raw journal/report
    strings are shown with a "(not in Inspire)" marker.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    from invenio.search_engine import search_unit
    from invenio.bibformat import format_record
    references = bfo.fields("999C5", escape=1)
    out = ""
    for reference in references:
        ref_out = ''
        # $o marks a new reference: close the previous <li> (if any)
        # and open a new one labelled with $o.
        if reference.has_key('o'):
            if out != "":
                ref_out = '</li>'
            ref_out += '<li><small>' + \
                reference['o'] + "</small> "
        # LEAVE out full ref while we have spires import which does not store
        # useful things here
        # if reference.has_key('m'):
        #     ref_out += "<small>"+ reference['m']+ "</small> "
        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        if reference.has_key('s'):
            # $s: journal pubnote string.
            display_journal = reference['s']
            clean_journal = reference['s']
        if reference.has_key('r'):
            # $r: report number.
            display_report = reference['r']
            clean_report = reference['r']
        # Resolve: report number first, journal only if still not unique.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_unit(f='journal', p=clean_journal)
        if len(hits) == 1:
            ref_out += '<small>' + \
                format_record(list(hits)[0], 'hs') + '</small>'
            # Silly stuff that can be used if there are a lot of multiple hits
            #
            # elif len(hits)>1:
            #     if display_journal:
            #         ref_out += '<small><a href="'+CFG_SITE_URL+\
            #             '/search?f=journal&p='+ \
            #             reference['s']+ \
            #             '&ln=' + bfo.lang + \
            #             '">'+display_journal+"</a></small>"
            #     if display_report:
            #         ref_out += ' <small><a href="'+CFG_SITE_URL+\
            #             '/search?f=reportnumber&p='+ \
            #             reference['r']+ \
            #             '&ln=' + bfo.lang + \
            #             '">'+display_report+"</a></small>"
        else:
            # NOTE(review): this assignment discards the '<li>'/label built
            # above for this reference -- possibly 'ref_out +=' was
            # intended; confirm before changing.
            ref_out = '<small>'
            if display_journal:
                ref_out += display_journal
            if display_report:
                ref_out += ' ' + display_report
            ref_out += ' (not in Inspire)</small>'
        ref_out += "<br />"
        if reference_prefix is not None and ref_out != '':
            ref_out = reference_prefix + ref_out
        if reference_suffix is not None and ref_out != '':
            ref_out += reference_suffix
        out += ref_out
    # Close the last open <li>, if anything was emitted at all.
    if out != '':
        out += '</li>'
    return out
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record as an HTML table, preceded by a
    link to update the references.

    Each 999C5 field becomes one table row: the $o label goes in the
    first cell; the second cell shows either the matched INSPIRE record
    (rendered with the 'hs' format) or the raw subfields of the
    unresolved reference.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=1)
    out = "<div id='referenceinp_link_box'><span id='referenceinp_link_span'><a id='referenceinp_link' href='"+CFG_SITE_URL+'/record/'+str(bfo.recID)+'/export/hrf'+"'>Update these references</a></span></div>"
    last_o = ""
    if not references:
        return out
    out += "<table>"
    for reference in references:
        ref_out = '<tr><td valign="top">'
        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # Show the $o label only when it differs from the previous row's
        # label (references split over several fields share one label).
        if reference.has_key('o') and not reference['o'] == last_o:
            temp_ref = reference['o'].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out += "<small>" + temp_ref + "</small> "
            else:
                ref_out += "<small>[" + temp_ref + "] </small> "
            last_o = temp_ref
        ref_out += "</td><td>"
        if reference.has_key('s'):
            display_journal = reference['s']
            clean_journal = reference['s']
        if reference.has_key('r'):
            # Strip a trailing "[...]" qualifier from the report number
            # before searching.
            if "[" in reference['r'] and "]" in reference['r']:
                breaknum = reference['r'].find('[')
                newreference = reference['r'][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r']
                clean_report = reference['r']
        # Resolve the reference: report number, then journal pubinfo,
        # then DOI; only a unique (len == 1) hit is trusted.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits)!=1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits)!=1:
            hits = search_unit(f='doi', p=reference['a'])
        if len(hits) == 1:
            ref_out += '<small>' +\
                       format_record(list(hits)[0],'hs') + '</small>'
        else:
            # Unresolved: show the raw subfields we have.
            if reference.has_key('h'):
                ref_out += "<small> " + reference['h'] + ".</small> "
            if reference.has_key('m'):
                ref_out += "<small>"+ reference['m'].replace(']]', ']') + ".</small> "
            if reference.has_key('a'):
                ref_out += " <small><a href=\"http://dx.doi.org/" + \
                           reference['a'] + "\">" + reference['a']+ "</a></small> "
            if reference.has_key('u'):
                ref_out += " <small><a href=" + reference['u'] + ">" + \
                           reference['u']+ "</a></small> "
        ref_out += ' <small>'
        if display_journal:
            ref_out += display_journal
        if display_report:
            ref_out += ' ' + display_report
        ref_out += "</small></td></tr>"
        if reference_prefix is not None and ref_out != '':
            ref_out = reference_prefix + ref_out
        if reference_suffix is not None and ref_out != '':
            ref_out += reference_suffix
        out += ref_out
    # BUGFIX: the original appended a stray '</li>' here (copy-paste
    # remnant from the <li>-based variant of this element) even though
    # no <li> is ever opened, producing invalid HTML inside the table.
    return out + "</table>"
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes', skip_collections=''):
    """Synchronize records to either 'afs' or 'redis'.

    Records modified since the last run (tracked in a per-method lastrun
    file under CFG_TMPSHAREDDIR) are re-exported; on the first run (no
    lastrun file) everything is exported.

    @param method: 'afs' or 'redis'; forced to 'afs' when
        CFG_REDIS_HOST_LABS is not configured
    @param with_citations: 'yes'/'no', also re-export records that gained
        citations since the last run
    @param with_claims: 'yes'/'no', also re-export records involved in
        new author claims since the last run
    @param skip_collections: comma-separated 980 values whose records are
        ignored, e.g. 'HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'
    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        # Close the handle deterministically instead of leaking it.
        with open(lastrun_path) as lastrun_file:
            last_run = lastrun_file.read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(
                run_sql("SELECT id FROM bibrec WHERE modification_date>=%s",
                        (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            # NOTE(review): hard-coded upper bound — this 005 range query
            # stops matching after 2025-01-01; confirm intent.
            notimechangerecs = search_unit("%s->20250101000000" % compacttime,
                                           f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql(
                        "SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s",
                        (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s",
                        (last_run, )))
                modified_records |= intbitset(
                    run_sql(
                        "SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d"
                        " ON p.personid = d.personid WHERE d.last_updated>=%s",
                        (last_run, )))
    except IOError:
        # First run: no lastrun file, default to exporting everything.
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    # BUGFIX: the original did skip_collections.split(',') followed by
    # skip_collections.remove(''), which raises ValueError for any
    # non-empty list without a trailing comma (e.g. 'HEP,HEPNAMES').
    # Filtering empty entries handles '' and 'A,B' alike.
    for collection in skip_collections.split(','):
        if collection:
            modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        # Record this run's start time only after a successful sync.
        with open(lastrun_path, "w") as lastrun_file:
            lastrun_file.write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            with open(lastrun_path, "w") as lastrun_file:
                lastrun_file.write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
from invenio.bibindex_engine import get_field_tags from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK class memoise: def __init__(self, function): self.memo = {} self.function = function def __call__(self, *args): if args not in self.memo: self.memo[args] = self.function(*args) return self.memo[args] INTBITSET_OF_DELETED_RECORDS = search_unit(p="DELETED", f="980", m="a") re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) def get_recids_matching_query(p, f, m="e"): """Return set of recIDs matching query for pattern p in field f.""" return search_pattern(p=p, f=f, m=m) - INTBITSET_OF_DELETED_RECORDS def get_citation_weight(rank_method_code, config, chunk_size=20000): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ begin_time = time.time()
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record as an HTML table.

    Each 999C5 field becomes one table row; subfields are fetched with
    repeatable_subfields_p=True, so every subfield value is a list and
    the first occurrence ([0]) is used.  A reference that resolves to
    exactly one INSPIRE record (by report number, journal pubinfo, free
    pattern on $a, or an explicit recid in $0) is rendered with the 'hs'
    format; otherwise the raw subfields are shown.

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    references = bfo.fields("999C5", escape=1, repeatable_subfields_p=True)
    out = ""
    last_o = ""
    if not references:
        return out
    out += "<table>"
    for reference in references:
        ref_out = []
        ref_out.append('<tr><td valign="top">')
        display_journal = ''
        display_report = ''
        clean_report = ''
        clean_journal = ''
        hits = []
        # Show the $o label only when it differs from the previous row's
        # label (references split over several fields share one label).
        if reference.has_key('o') and not reference['o'][0] == last_o:
            temp_ref = reference['o'][0].replace('.', '')
            if '[' in temp_ref and ']' in temp_ref:
                ref_out.append("<small>" + temp_ref + "</small> ")
            else:
                ref_out.append("<small>[" + temp_ref + "] </small> ")
            last_o = temp_ref
        ref_out.append("</td><td>")

        if reference_prefix:
            ref_out.append(reference_prefix)
        if reference.has_key('s'):
            display_journal = reference['s'][0]
            clean_journal = reference['s'][0]
        if reference.has_key('r'):
            # Strip a trailing "[...]" qualifier from the report number
            # before searching.
            if "[" in reference['r'][0] and "]" in reference['r'][0]:
                breaknum = reference['r'][0].find('[')
                newreference = reference['r'][0][:breaknum].strip()
                display_report = newreference
                clean_report = newreference
            else:
                display_report = reference['r'][0]
                clean_report = reference['r'][0]
        # Resolve the reference; only a unique (len == 1) hit is trusted.
        if clean_report:
            hits = search_unit(f='reportnumber', p=clean_report)
        if clean_journal and len(hits) != 1:
            hits = search_unit(f='journal', p=clean_journal)
        if reference.has_key('a') and len(hits) != 1:
            # NOTE(review): unlike the other lookups this one passes no
            # field, i.e. $a is searched as a free pattern — confirm.
            hits = search_unit(p=reference['a'][0])
        if reference.has_key('0') and len(hits) != 1:
            # check if the record exists in the database
            try:
                recID = int(reference['0'][0])
                if get_record(recID):
                    # since we already have a recID, we can assign it directly
                    # to the "hits" variable, so it will be handled in the last if statement
                    hits = [recID]
            except ValueError:
                pass
        if len(hits) == 1:
            ref_out.append('<small>' + format_record(list(hits)[0], 'hs') + '</small>')
        else:
            # Unresolved: show the raw subfields we have.
            if reference.has_key('h'):
                ref_out.append("<small> " + reference['h'][0] + ".</small>")
            if reference.has_key('t'):
                ref_out.append("<small> " + reference['t'][0] + "</small> -")
            if reference.has_key('y'):
                ref_out.append("<small> " + reference['y'][0] + ".</small>")
            if reference.has_key('p'):
                ref_out.append("<small> " + reference['p'][0] + ".</small>")
            if reference.has_key('m'):
                ref_out.append("<small> " + reference['m'][0].replace(']]', ']') + ".</small>")
            if reference.has_key('a'):
                ref_out.append("<small> <a href=\"http://dx.doi.org/" + \
                               reference['a'][0] + "\">" + reference['a'][0]+ "</a></small>")
            if reference.has_key('u'):
                ref_out.append("<small> <a href=" + reference['u'][0] + ">" + \
                               reference['u'][0]+ "</a></small>")
            if reference.has_key('i'):
                # $i holds ISBNs (020__a search links) and may repeat.
                for r in reference['i']:
                    ref_out.append("<small> <a href=\"/search?ln=en&p=020__a%3A" + r + "\">" + r + "</a></small>")

            ref_out.append('<small>')
            if display_journal:
                ref_out.append(display_journal)
            if display_report:
                ref_out.append(' ' + display_report)
            ref_out.append("</small>")
        if reference_suffix:
            ref_out.append(reference_suffix)
        ref_out.append("</td></tr>")
        out += ' '.join(ref_out)

    return out + "</table>"
from hep_ads_xml_input import ARXIV_REGEX, ARXIV_REGEX_NEW from hep_compare_arxiv_inspire_input import IGNORE_EPRINTS, UK_TO_US LOGFILE = 'tmp_' + __file__ LOGFILE = re.sub('.py', '.log', LOGFILE) logging.basicConfig(filename=LOGFILE, filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO) INPUT_FILE = 'tmp_hep_ads_xml_missing_eprint.in' MAX_COUNT = 10 URL_BASE = 'http://export.arxiv.org/api/query?id_list=' DOI_REGEX = re.compile(r'^10.\d{4,9}/\S+$') DELETED = search_unit(p='DELETED', m='a', f='980*') def clean_eprint(eprint): '''Remove possible prefix from eprint.''' regex = re.compile('^arxiv:', re.I) return regex.sub('', eprint) def create_xml(recid, input_dict): '''Create marcxml file from.''' record = {} record_add_field(record, '001', controlfield_value=str(recid))
def examine(field_search): field = field_search[0] search = field_search[1] collection = field_search[2] core = perform_request_search(p='980:CORE', cc='HEP') search_theory = 'find fc p or fc t or fc l or fc n or fc g 980:core' search_core = '980:core' core = perform_request_search(p=search_core, cc='HEP') if re.search(r'541.*', field): result = search_unit(p = search, m = 'a', f = field) #result = result & get_collection_reclist('HEP') result = result & intbitset(core) else: if not re.search(r'\:', search): search = field + ':' + search result = perform_request_search(p = search, cc = collection) if collection == 'HEP': result = intbitset(result) & intbitset(core) if VERBOSE: print 'VERBOSE', field, search, collection, len(result) already_seen_field_values = [] for recid in result: recid_print = "" field_values = get_fieldvalues(recid, field) for field_value in field_values: bad_id = False if field_value in already_seen_field_values: continue if re.search(r'INSPIRE', field_value): inspire_form = r'^INSPIRE-\d{8}$' if not re.match(inspire_form, field_value): print 'Bad INSPIRE ID: ', field_value bad_id = True elif re.search(r'^0000-', field_value): orcid_form = r'^0000-\d{4}-\d{4}-\d{3}[\dX]$' if not re.match(orcid_form, field_value): print 'Bad ORCID ID: ', field_value bad_id = True search_dup = '{0}:"{1}"'.format(field, field_value) if field == '371__m' or field == '541__b': search_dup = email_search(field_value) if re.search(r"\'", field_value): field_value_mod = \ re.sub(r"\'", r".", field_value) search_dup = email_search(field_value_mod) elif collection == 'HEP': #field == '541__a' or field == '100__j' \ # or field == '700__j': ignore = r'(CCID|JACoW|uid|arxiv)' if re.search(ignore, field_value): continue field_value = re.sub(r'^\w+:', r'', field_value) if not field_value: continue search_dup = '035__a:' + field_value #collection = 'HepNames' if collection == 'HEP': #field == '541__a' or field == '541__b': recid_print = "http://inspirehep.net/record/" \ + 
str(recid) + "/export/xm" #print search_dup if field_value in already_seen_field_values: continue result_dup = perform_request_search(p = search_dup, \ cc = 'HepNames') if len(result_dup) != 1 or bad_id: if field == '100__a': for recid_dup in result_dup: author_id = \ find_inspire_id_from_record(recid_dup) print '{0:11d} {1:40s} {2:20s}'.\ format(recid_dup, field_value, author_id) else: if len(result_dup) == 0: print_field_value = field_value if collection == 'HEP' and \ re.search(r'^0000-', field_value): print_field_value = 'http://orcid.org/' + \ field_value print '{0:40s} {1:30s}'. \ format(print_field_value, recid_print) else: print search_dup, recid_print, result_dup, bad_id if field == '035__a' and field_value: author_search = r'100__a:"{0}" or 700__a:"{0}"' search_hep = author_search.format(field_value) result_hep = perform_request_search(p = search_hep, \ cc = 'HEP') if len(result_hep) > 0: print 'Bad ID in HEP', search_hep, \ len(result_hep) already_seen_field_values.append(field_value)
from invenio.dbquery import run_sql, serialize_via_marshal, \ deserialize_via_marshal from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM from invenio.search_engine import search_pattern, search_unit from invenio.search_engine_utils import get_fieldvalues from invenio.bibformat_utils import parse_tag from invenio.bibknowledge import get_kb_mappings from invenio.bibtask import write_message, task_get_option, \ task_update_progress, task_sleep_now_if_required, \ task_get_task_param from invenio.errorlib import register_exception from invenio.bibindex_engine import get_field_tags from invenio.bibindex_engine import CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a') re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = re.compile(CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK) def get_recids_matching_query(p, f, m='e'): """Return set of recIDs matching query for pattern p in field f.""" return search_pattern(p=p, f=f, m=m) - INTBITSET_OF_DELETED_RECORDS def get_citation_weight(rank_method_code, config, chunk_size=20000): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ begin_time = time.time() quick = task_get_option("quick") != "no"
def format_element(bfo, reference_prefix, reference_suffix):
    """
    Prints the references of this record

    @param reference_prefix a prefix displayed before each reference
    @param reference_suffix a suffix displayed after each reference
    """
    from invenio.search_engine import search_unit
    from invenio.bibformat import format_record

    rendered = []
    for ref in bfo.fields("999C5", escape=1):
        journal = ref['s'] if 's' in ref else ''
        report = ref['r'] if 'r' in ref else ''

        # Try to resolve the reference to a unique INSPIRE record:
        # report number first, then journal pubinfo, then DOI.
        matches = []
        if report:
            matches = search_unit(f='reportnumber', p=report)
        if journal and len(matches) != 1:
            matches = search_unit(f='journal', p=journal)
        if 'a' in ref and len(matches) != 1:
            matches = search_unit(f='doi', p=ref['a'])

        pieces = []
        if len(matches) == 1:
            # Unique hit: render the matched record in the 'hs' format.
            pieces.append('<small>' +
                          format_record(list(matches)[0], 'hs') + '</small>')
        else:
            # Unresolved: show whatever raw subfields are available.
            if 'h' in ref:
                pieces.append("<small> " + ref['h'] + ".</small> ")
            if 'm' in ref:
                pieces.append("<small>" + ref['m'] + ".</small> ")
            if 'a' in ref:
                pieces.append(" <small><a href=\"http://dx.doi.org/" +
                              ref['a'] + "\">" + ref['a'] + "</a></small> ")
            pieces.append(' <small>')
            if journal:
                pieces.append(journal)
            if report:
                pieces.append(' ' + report)
            pieces.append('<br /> <em>(not extracted or not in INSPIRE)</em></small>')
        pieces.append("<br />")

        entry = ''.join(pieces)
        if reference_prefix is not None and entry != '':
            entry = reference_prefix + entry
        if reference_suffix is not None and entry != '':
            entry += reference_suffix
        rendered.append(entry)

    out = ''.join(rendered)
    if out != '':
        out += '</li>'
    return out
task_get_task_param from invenio.errorlib import register_exception from invenio.intbitset import intbitset class memoise: def __init__(self, function): self.memo = {} self.function = function def __call__(self, *args): if self.memo.has_key(args): return self.memo[args] else: object = self.memo[args] = self.function(*args) return object INTBITSET_OF_DELETED_RECORDS = search_unit(p='DELETED', f='980', m='a') def get_recids_matching_query(pvalue, fvalue): """Return list of recIDs matching query for PVALUE and FVALUE.""" rec_id = list(search_pattern(p=pvalue, f=fvalue, m='e') - INTBITSET_OF_DELETED_RECORDS) return rec_id get_recids_matching_query = memoise(get_recids_matching_query) def get_citation_weight(rank_method_code, config): """return a dictionary which is used by bibrank daemon for generating the index of sorted research results by citation information """ begin_time = time.time() last_update_time = get_bibrankmethod_lastupdate(rank_method_code) if task_get_option("quick") == "no":
def ref_analyzer(citation_informations, dicts, updated_recids, tags,
                 do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    Runs seven phases over the extracted reference data, mutating the
    dictionaries in ``dicts`` in place and returning them:

    - Phases 1-3 resolve this batch's outgoing references (999C5 report
      numbers, journal pubinfos, DOIs) to existing records.
    - Phases 4-6 do the reverse "catchup": find other records whose
      bibliographies cite this batch's report numbers / pubinfos / DOIs.
    - Phase 7 prunes empty lists.

    @param citation_informations: 2-tuple (records_info, references_info),
        each a dict keyed by 'report-numbers', 'journals', 'doi' mapping
        recid -> list of values
    @param dicts: dict with keys 'cites_weight', 'cites', 'refs',
        'selfcites', 'selfrefs', 'authorcites' (mutated in place)
    @param updated_recids: recids updated in this run; their cached
        citation/reference entries are dropped and rebuilt
    @param tags: dict with the reference field tags 'refs_report_number',
        'refs_journal', 'refs_doi'
    @param do_catchup: when False, skip dropping cached *citations* of
        updated records
    @return: (citations_weight, citations, references, selfcites,
        selfrefs, authorcites)
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Periodic housekeeping: yield to the scheduler every 30 records,
        # report progress every 1000.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record one citer -> cited edge in all three dicts (idempotent).
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    # NOTE(review): references are dropped even when do_catchup is False,
    # while citations are only dropped when it is True — confirm this
    # asymmetry is intended.
    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue # skip this ill-formed value

            # NOTE(review): unlike phases 1 and 3 this uses search_unit
            # directly rather than get_recids_matching_query — confirm.
            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'
            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]: # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv IDs may carry an optional "[category]" suffix in
                # references, so match with a regexp.
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search for this record's DOI in 999C5a of other records.
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))

        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))

        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))

        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2 - t1))
    write_message("... checking ref journals: %.2f sec" % (t3 - t2))
    write_message("... checking ref DOI: %.2f sec" % (t4 - t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5 - t4))
    write_message("... checking rec journals: %.2f sec" % (t6 - t5))
    write_message("... checking rec DOI: %.2f sec" % (t7 - t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7 - t1))

    return citations_weight, citations, references, selfcites, \
           selfrefs, authorcites
def ref_analyzer(citation_informations, initialresult, initial_citationlist, initial_referencelist, config, updated_rec_list): """Analyze the citation informations and calculate the citation weight and cited by list dictionary. """ function = "" try: function = config.get("rank_method", "function") except: register_exception( prefix="cfg section [rank_method] has no attr function", alert_admin=True) return {} pubrefntag = "" try: pubrefntag = config.get(function, "reference_via_report_number") except: register_exception(prefix="cfg section " + function + " has no attr reference_via_report_number", alert_admin=True) return {} pubreftag = "" try: pubreftag = config.get(function, "reference_via_pubinfo") except: register_exception(prefix="cfg section " + function + " has no attr reference_via_pubinfo", alert_admin=True) return {} #pubrefntag is often 999C5r, pubreftag 999C5s if task_get_task_param('verbose') >= 9: write_message("pubrefntag " + pubrefntag) write_message("pubreftag " + pubreftag) citation_list = initial_citationlist reference_list = initial_referencelist result = initialresult d_reports_numbers = citation_informations[ 0] #dict of recid -> institute_give_publ_id d_references_report_numbers = citation_informations[ 1] #dict of recid -> ['astro-ph/xyz'..] d_references_s = citation_informations[2] #dict of recid -> publication_infos_in_its_bibliography d_records_s = citation_informations[3] #recid -> its publication inf t1 = os.times()[4] write_message("Phase 1: d_references_report_numbers") #d_references_report_numbers: e.g 8 -> ([astro-ph/9889],[hep-ph/768]) #meaning: rec 8 contains these in bibliography done = 0 numrecs = len(d_references_report_numbers) for thisrecid, refnumbers in d_references_report_numbers.iteritems(): if (done % 1000 == 0): mesg = "d_references_report_numbers done " + str( done) + " of " + str(numrecs) write_message(mesg) task_update_progress(mesg) #write to db! 
insert_into_cit_db(reference_list, "reversedict") insert_into_cit_db(citation_list, "citationdict") #it's ok to sleep too, we got something done task_sleep_now_if_required() done = done + 1 for refnumber in refnumbers: if refnumber: p = refnumber f = 'reportnumber' #sanitise p p.replace("\n", '') #search for "hep-th/5644654 or such" in existing records rec_ids = get_recids_matching_query(p, f) if rec_ids and rec_ids[0]: write_citer_cited(thisrecid, rec_ids[0]) remove_from_missing(p) if not result.has_key(rec_ids[0]): result[rec_ids[0]] = 0 # Citation list should have rec_ids[0] but check anyway if not citation_list.has_key(rec_ids[0]): citation_list[rec_ids[0]] = [] #append unless this key already has the item if not thisrecid in citation_list[rec_ids[0]]: citation_list[rec_ids[0]].append(thisrecid) #and update result result[rec_ids[0]] += 1 if not reference_list.has_key(thisrecid): reference_list[thisrecid] = [] if not rec_ids[0] in reference_list[thisrecid]: reference_list[thisrecid].append(rec_ids[0]) else: #the reference we wanted was not found among our records. #put the reference in the "missing".. however, it will look #bad.. gfhgf/1254312, so get the corresponding 999C5s (full ref) too #This should really be done in the next loop d_references_s #but the 999C5s fields are not yet normalized #rectext = print_record(thisrecid, format='hm', ot=pubreftag[:-1]) rectext = "" # print_record() call disabled to speed things up lines = rectext.split("\n") rpart = p #to be used.. for l in lines: if ( l.find(p) > 0 ): #the gfhgf/1254312 was found.. get the s-part of it st = l.find('$s') if (st > 0): end = l.find('$', st) if (end == st): end = len(l) rpart = l[st + 2:end] insert_into_missing(thisrecid, rpart) mesg = "d_references_report_numbers done fully" write_message(mesg) task_update_progress(mesg) t2 = os.times()[4] #try to find references based on 999C5s, like Phys.Rev.Lett. 
53 (1986) 2285 write_message("Phase 2: d_references_s") done = 0 numrecs = len(d_references_s) for thisrecid, refss in d_references_s.iteritems(): if (done % 1000 == 0): mesg = "d_references_s done " + str(done) + " of " + str(numrecs) write_message(mesg) task_update_progress(mesg) #write to db! insert_into_cit_db(reference_list, "reversedict") insert_into_cit_db(citation_list, "citationdict") task_sleep_now_if_required() done = done + 1 for refs in refss: if refs: p = refs #remove the latter page number if it is like 67-74 matches = re.compile("(.*)(-\d+$)").findall(p) if matches and matches[0]: p = matches[0][0] rec_id = None try: rec_ids = list(search_unit(p, 'journal')) except: rec_ids = None write_message("These match searching " + p + " in journal: " + str(rec_id), verbose=9) if rec_ids and rec_ids[0]: #the refered publication is in our collection, remove #from missing remove_from_missing(p) else: #it was not found so add in missing insert_into_missing(thisrecid, p) #check citation and reference for this.. if rec_ids and rec_ids[0]: #the above should always hold if not result.has_key(rec_ids[0]): result[rec_ids[0]] = 0 if not citation_list.has_key(rec_ids[0]): citation_list[rec_ids[0]] = [] if not thisrecid in citation_list[rec_ids[0]]: citation_list[rec_ids[0]].append( thisrecid) #append actual list result[rec_ids[0]] += 1 #add count for this.. 
#update reference_list accordingly if not reference_list.has_key(thisrecid): reference_list[thisrecid] = [] if not rec_ids[0] in reference_list[thisrecid]: reference_list[thisrecid].append(rec_ids[0]) mesg = "d_references_s done fully" write_message(mesg) task_update_progress(mesg) t3 = os.times()[4] done = 0 numrecs = len(d_reports_numbers) write_message("Phase 3: d_reports_numbers") #search for stuff like CERN-TH-4859/87 in list of refs for thisrecid, reportcodes in d_reports_numbers.iteritems(): if (done % 1000 == 0): mesg = "d_report_numbers done " + str(done) + " of " + str(numrecs) write_message(mesg) task_update_progress(mesg) done = done + 1 for reportcode in reportcodes: if reportcode: rec_ids = [] try: rec_ids = get_recids_matching_query(reportcode, pubrefntag) except: rec_ids = [] if rec_ids: for recid in rec_ids: #normal checks.. if not citation_list.has_key(thisrecid): citation_list[thisrecid] = [] if not reference_list.has_key(recid): reference_list[recid] = [] if not result.has_key(thisrecid): result[thisrecid] = 0 #normal updates if not recid in citation_list[thisrecid]: result[thisrecid] += 1 citation_list[thisrecid].append(recid) if not thisrecid in reference_list[recid]: reference_list[recid].append(thisrecid) mesg = "d_report_numbers done fully" write_message(mesg) task_update_progress(mesg) #find this record's pubinfo in other records' bibliography write_message("Phase 4: d_records_s") done = 0 numrecs = len(d_records_s) t4 = os.times()[4] for thisrecid, recs in d_records_s.iteritems(): if (done % 1000 == 0): mesg = "d_records_s done " + str(done) + " of " + str(numrecs) write_message(mesg) task_update_progress(mesg) done = done + 1 p = recs.replace("\"", "") #search the publication string like Phys. 
Lett., B 482 (2000) 417 in 999C5s rec_ids = list(search_unit(f=pubreftag, p=p, m='a')) write_message("These records match " + p + " in " + pubreftag + " : " + str(rec_ids), verbose=9) if rec_ids: for rec_id in rec_ids: #normal checks if not result.has_key(thisrecid): result[thisrecid] = 0 if not citation_list.has_key(thisrecid): citation_list[thisrecid] = [] if not reference_list.has_key(rec_id): reference_list[rec_id] = [] if not rec_id in citation_list[thisrecid]: result[thisrecid] += 1 citation_list[thisrecid].append(rec_id) if not thisrecid in reference_list[rec_id]: reference_list[rec_id].append(thisrecid) mesg = "d_records_s done fully" write_message(mesg) task_update_progress(mesg) write_message("Phase 5: reverse lists") #remove empty lists in citation and reference keys = citation_list.keys() for k in keys: if not citation_list[k]: del citation_list[k] keys = reference_list.keys() for k in keys: if not reference_list[k]: del reference_list[k] write_message("Phase 6: self-citations") selfdic = {} #get the initial self citation dict initial_self_dict = get_cit_dict("selfcitdict") selfdic = initial_self_dict #add new records to selfdic acit = task_get_option("author-citations") if not acit: write_message( "Self cite processing disabled. Use -A option to enable it.") else: write_message("self cite and author citations enabled") selfdic = get_self_citations(updated_rec_list, citation_list, initial_self_dict, config) #selfdic consists of #key k -> list of values [v1,v2,..] #where k is a record with author A and k cites v1,v2.. and A appears in v1,v2.. 
#create a reverse "x cited by y" self cit dict selfcitedbydic = {} for k in selfdic.keys(): vlist = selfdic[k] for v in vlist: if selfcitedbydic.has_key(v): tmplist = selfcitedbydic[v] if not k in tmplist: tmplist.append(k) else: tmplist = [k] selfcitedbydic[v] = tmplist write_message("Getting author citations") #get author citations for records in updated_rec_list initial_author_dict = get_initial_author_dict() authorcitdic = initial_author_dict acit = task_get_option("author-citations") if not acit: print "Author cites disabled. Use -A option to enable it." else: write_message("author citations enabled") authorcitdic = get_author_citations(updated_rec_list, citation_list, initial_author_dict, config) if task_get_task_param('verbose') >= 3: #print only X first to prevent flood tmpdict = {} tmp = citation_list.keys()[0:10] for t in tmp: tmpdict[t] = citation_list[t] write_message("citation_list (x is cited by y): " + str(tmpdict)) write_message("size: " + str(len(citation_list.keys()))) tmp = reference_list.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = reference_list[t] write_message("reference_list (x cites y): " + str(tmpdict)) write_message("size: " + str(len(reference_list.keys()))) tmp = selfcitedbydic.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = selfcitedbydic[t] mesg = "selfcitedbydic (x is cited by y and one of the authors of x same as y's):" mesg += str(tmpdict) write_message(mesg) write_message("size: " + str(len(selfcitedbydic.keys()))) tmp = selfdic.keys()[0:100] tmpdict = {} for t in tmp: tmpdict[t] = selfdic[t] mesg = "selfdic (x cites y and one of the authors of x same as y's): " + str( tmpdict) write_message(mesg) write_message("size: " + str(len(selfdic.keys()))) tmp = authorcitdic.keys()[0:10] tmpdict = {} for t in tmp: tmpdict[t] = authorcitdic[t] write_message("authorcitdic (author is cited in recs): " + str(tmpdict)) write_message("size: " + str(len(authorcitdic.keys()))) insert_cit_ref_list_intodb(citation_list, reference_list, 
selfcitedbydic, selfdic, authorcitdic) t5 = os.times()[4] write_message( "Execution time for analyzing the citation information generating the dictionary:" ) write_message("... checking ref number: %.2f sec" % (t2 - t1)) write_message("... checking ref ypvt: %.2f sec" % (t3 - t2)) write_message("... checking rec number: %.2f sec" % (t4 - t3)) write_message("... checking rec ypvt: %.2f sec" % (t5 - t4)) write_message("... total time of ref_analyze: %.2f sec" % (t5 - t1)) return result
def deleted_recids_cache(cache={}):
    """Return the set of recids marked DELETED, computed at most once.

    The mutable default argument is used deliberately as a per-process
    memoization store: the first call performs the search, later calls
    return the cached result.
    """
    try:
        return cache['deleted_records']
    except KeyError:
        cache['deleted_records'] = search_unit(p='DELETED', f='980', m='a')
        return cache['deleted_records']
def ref_analyzer(citation_informations, dicts, updated_recids, tags, do_catchup=True):
    """Analyze the citation informations and calculate the citation weight
    and cited by list dictionary.

    Runs in 7 phases:
      0. drop the updated records from the existing citation/reference dicts
      1-3. match each updated record's references (report numbers, journal
           pubinfos, DOIs) against existing records -> "record X cites Y"
      4-6. match each updated record's own identifiers against other
           records' reference fields (catchup) -> "record X is cited by Y"
      7. prune empty lists and (verbosely) dump samples of the dicts

    The dicts in `dicts` are mutated in place and also returned.
    Self-citation and author-citation dicts are passed through untouched
    here (they are filled elsewhere).
    """
    citations_weight = dicts['cites_weight']
    citations = dicts['cites']
    references = dicts['refs']
    selfcites = dicts['selfcites']
    selfrefs = dicts['selfrefs']
    authorcites = dicts['authorcites']

    def step(msg_prefix, recid, done, total):
        # Periodic housekeeping while looping over many records:
        # sleep if the task scheduler asks for it, report progress.
        if done % 30 == 0:
            task_sleep_now_if_required()

        if done % 1000 == 0:
            mesg = "%s done %s of %s" % (msg_prefix, done, total)
            write_message(mesg)
            task_update_progress(mesg)

        write_message("Processing: %s" % recid, verbose=9)

    def add_to_dicts(citer, cited):
        # Record one "citer cites cited" edge in all three dicts.
        # Make sure we don't add ourselves
        # Workaround till we know why we are adding ourselves.
        if citer == cited:
            return
        if cited not in citations_weight:
            citations_weight[cited] = 0
        # Citations and citations weight
        # (weight is only bumped when the edge is new, so re-running
        # over the same pair is idempotent)
        if citer not in citations.setdefault(cited, []):
            citations[cited].append(citer)
            citations_weight[cited] += 1
        # References
        if cited not in references.setdefault(citer, []):
            references[citer].append(cited)

    # dict of recid -> institute_give_publ_id
    records_info, references_info = citation_informations

    # os.times()[4] is elapsed wall-clock time; used for the per-phase
    # timing report at the end.
    t1 = os.times()[4]

    write_message("Phase 0: temporarily remove changed records from " \
                  "citation dictionaries; they will be filled later")
    if do_catchup:
        for somerecid in updated_recids:
            try:
                del citations[somerecid]
            except KeyError:
                pass

    # NOTE(review): unlike the citations cleanup just above, this loop
    # runs even when do_catchup is False -- confirm the asymmetry is
    # intentional (references of an updated record are always rebuilt).
    for somerecid in updated_recids:
        try:
            del references[somerecid]
        except KeyError:
            pass

    # Try to find references based on 999C5r
    # e.g 8 -> ([astro-ph/9889],[hep-ph/768])
    # meaning: rec 8 contains these in bibliography
    write_message("Phase 1: Report numbers references")
    done = 0
    for thisrecid, refnumbers in references_info['report-numbers'].iteritems():
        step("Report numbers references", thisrecid, done,
             len(references_info['report-numbers']))
        done += 1

        # Skip empty/None entries up front.
        for refnumber in (r for r in refnumbers if r):
            field = 'reportnumber'
            refnumber = standardize_report_number(refnumber)
            # Search for "hep-th/5644654 or such" in existing records
            recids = get_recids_matching_query(p=refnumber, f=field)
            write_message("These match searching %s in %s: %s" % \
                          (refnumber, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, refnumber)
            else:
                remove_from_missing(refnumber)

            if len(recids) > 1:
                # Ambiguous match: warn, then fall through and keep only
                # the first hit below.
                store_citation_warning('multiple-matches', refnumber)
                msg = "Whoops: record '%d' report number value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, refnumber, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t2 = os.times()[4]

    # Try to find references based on 999C5s
    # e.g. Phys.Rev.Lett. 53 (1986) 2285
    write_message("Phase 2: Journal references")
    done = 0
    for thisrecid, refs in references_info['journals'].iteritems():
        step("Journal references", thisrecid, done,
             len(references_info['journals']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'journal'

            # check reference value to see whether it is well formed:
            if not re_CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK.match(p):
                store_citation_warning('not-well-formed', p)
                msg = "Whoops, record '%d' reference value '%s' " \
                      "is not well formed; skipping it." % (thisrecid, p)
                write_message(msg, stream=sys.stderr)
                continue  # skip this ill-formed value

            # Unlike phases 1 and 3, this phase queries search_unit
            # directly and filters deleted records itself.
            recids = search_unit(p, field) - INTBITSET_OF_DELETED_RECORDS
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' reference value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t3 = os.times()[4]

    # Try to find references based on 999C5a
    # e.g. 10.1007/BF03170733
    write_message("Phase 3: DOI references")
    done = 0
    for thisrecid, refs in references_info['doi'].iteritems():
        step("DOI references", thisrecid, done, len(references_info['doi']))
        done += 1

        for reference in (r for r in refs if r):
            p = reference
            field = 'doi'

            recids = get_recids_matching_query(p, field)
            write_message("These match searching %s in %s: %s" \
                          % (reference, field, list(recids)), verbose=9)

            if not recids:
                insert_into_missing(thisrecid, p)
            else:
                remove_from_missing(p)

            if len(recids) > 1:
                store_citation_warning('multiple-matches', p)
                msg = "Whoops: record '%d' DOI value '%s' " \
                      "matches many records; taking only the first one. %s" % \
                      (thisrecid, p, repr(recids))
                write_message(msg, stream=sys.stderr)

            for recid in list(recids)[:1]:  # take only the first one
                add_to_dicts(thisrecid, recid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    t4 = os.times()[4]

    # Catchup phases: now the edge direction flips -- other records cite
    # the updated record, so add_to_dicts(recid, thisrecid).

    # Search for stuff like CERN-TH-4859/87 in list of refs
    write_message("Phase 4: report numbers catchup")
    done = 0
    for thisrecid, reportcodes in records_info['report-numbers'].iteritems():
        step("Report numbers catchup", thisrecid, done,
             len(records_info['report-numbers']))
        done += 1

        for reportcode in (r for r in reportcodes if r):
            if reportcode.startswith('arXiv'):
                # arXiv IDs may carry an optional "[category]" suffix in
                # references, so match with a regexp ('r' mode); plain
                # report numbers use exact matching ('e' mode).
                std_reportcode = standardize_report_number(reportcode)
                report_pattern = r'^%s( *\[[a-zA-Z.-]*\])?' % \
                                 re.escape(std_reportcode)
                recids = get_recids_matching_query(report_pattern,
                                                   tags['refs_report_number'],
                                                   'r')
            else:
                recids = get_recids_matching_query(reportcode,
                                                   tags['refs_report_number'],
                                                   'e')
            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    # Find this record's pubinfo in other records' bibliography
    write_message("Phase 5: journals catchup")
    done = 0
    t5 = os.times()[4]
    for thisrecid, rec_journals in records_info['journals'].iteritems():
        step("Journals catchup", thisrecid, done,
             len(records_info['journals']))
        done += 1

        for journal in rec_journals:
            journal = journal.replace("\"", "")
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5s
            recids = search_unit(p=journal, f=tags['refs_journal'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (journal, tags['refs_journal'], list(recids)),
                          verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 6: DOI catchup")
    done = 0
    t6 = os.times()[4]
    for thisrecid, dois in records_info['doi'].iteritems():
        step("DOI catchup", thisrecid, done, len(records_info['doi']))
        done += 1

        for doi in dois:
            # Search the publication string like
            # Phys. Lett., B 482 (2000) 417 in 999C5a
            recids = search_unit(p=doi, f=tags['refs_doi'], m='a') \
                     - INTBITSET_OF_DELETED_RECORDS
            write_message("These records match %s in %s: %s" \
                          % (doi, tags['refs_doi'], list(recids)), verbose=9)

            for recid in recids:
                add_to_dicts(recid, thisrecid)

    mesg = "done fully"
    write_message(mesg)
    task_update_progress(mesg)

    write_message("Phase 7: remove empty lists from dicts")

    # Remove empty lists in citation and reference
    # (keys() snapshot is taken first because we delete while iterating)
    keys = citations.keys()
    for k in keys:
        if not citations[k]:
            del citations[k]

    keys = references.keys()
    for k in keys:
        if not references[k]:
            del references[k]

    if task_get_task_param('verbose') >= 3:
        # Print only X first to prevent flood
        write_message("citation_list (x is cited by y):")
        write_message(dict(islice(citations.iteritems(), 10)))
        write_message("size: %s" % len(citations))

        write_message("reference_list (x cites y):")
        write_message(dict(islice(references.iteritems(), 10)))
        write_message("size: %s" % len(references))

        write_message("selfcitedbydic (x is cited by y and one of the " \
                      "authors of x same as y's):")
        write_message(dict(islice(selfcites.iteritems(), 10)))
        write_message("size: %s" % len(selfcites))

        write_message("selfdic (x cites y and one of the authors of x " \
                      "same as y's):")
        write_message(dict(islice(selfrefs.iteritems(), 10)))
        write_message("size: %s" % len(selfrefs))

        write_message("authorcitdic (author is cited in recs):")
        write_message(dict(islice(authorcites.iteritems(), 10)))
        write_message("size: %s" % len(authorcites))

    t7 = os.times()[4]

    write_message("Execution time for analyzing the citation information " \
                  "generating the dictionary:")
    write_message("... checking ref report numbers: %.2f sec" % (t2-t1))
    write_message("... checking ref journals: %.2f sec" % (t3-t2))
    write_message("... checking ref DOI: %.2f sec" % (t4-t3))
    write_message("... checking rec report numbers: %.2f sec" % (t5-t4))
    write_message("... checking rec journals: %.2f sec" % (t6-t5))
    write_message("... checking rec DOI: %.2f sec" % (t7-t6))
    write_message("... total time of ref_analyze: %.2f sec" % (t7-t1))

    return citations_weight, citations, references, selfcites, \
           selfrefs, authorcites