def rank_by_citations(hitset, verbose):
    """Rank by the amount of citations.

    Calculate the cited-by values for all the members of the hitset.

    Returns: ((recordid, weight), prefix, postfix, message)
    """
    voutput = ""

    if len(hitset) > CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Hitset too large to count on the fly: filter the precomputed
        # (recid, count) list down to the members of the hitset.
        cites_counts = get_citation_dict("citations_counts")
        ret = [(recid, weight) for recid, weight in cites_counts
               if recid in hitset]
        # Records that never appear in the citation index get weight 0.
        recids_without_cites = hitset - get_citation_dict("citations_keys")
        ret.extend((recid, 0) for recid in recids_without_cites)
        # The precomputed list is ordered by descending weight with the
        # zero-weight records appended; reverse in place so the result is in
        # ascending weight order, matching the sort() in the other branch.
        ret.reverse()
    else:
        ret = get_cited_by_weight(hitset)
        ret.sort(key=itemgetter(1))

    if verbose > 0:
        voutput += "\nhitset %s\nrank_by_citations ret %s" % (hitset, ret)

    if ret:
        return ret, "(", ")", voutput
    else:
        return [], "", "", voutput
def rank_by_citations(hitset, verbose):
    """Rank by the amount of citations.

    Calculate the cited-by values for all the members of the hitset
    Returns: ((recordid,weight),prefix,postfix,message)
    """
    voutput = ""
    if len(hitset) > CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Large hitset: walk the precomputed counts rather than querying
        # each record individually.
        all_counts = get_citation_dict('citations_counts')
        cited = hitset & get_citation_dict('citations_keys')
        weights = []
        for pair in all_counts:
            if pair[0] in cited:
                weights.append(pair)
        # Everything in the hitset that carries no citations at all is
        # given an explicit zero weight.
        for recid in hitset - cited:
            weights.append((recid, 0))
        # Flip to ascending weight order, mirroring the small-set branch.
        weights.reverse()
        ret = weights
    else:
        ret = get_cited_by_weight(hitset)
        ret.sort(key=itemgetter(1))
    if verbose > 0:
        voutput += "\nhitset %s\nrank_by_citations ret %s" % (hitset, ret)
    if not ret:
        return [], "", "", voutput
    return ret, "(", ")", voutput
def get_citation_dict(message):
    # RPC bridge handler: read the requested dictionary name out of the
    # message (a Java string, hence the sj.String cast), look the cached
    # dictionary up, and marshal it into a Java HashMap<String, int[]>.
    # NOTE(review): assumes the dict maps string keys to int sequences —
    # confirm against bcs.get_citation_dict.
    dictname = sj.String.cast_(message.getParam('dictname'))
    cd = bcs.get_citation_dict(dictname)
    if cd:
        hm = sj.HashMap().of_(sj.String, sj.JArray_int)
        for k,v in cd.items():
            j_array = sj.JArray_int(v)
            hm.put(k, j_array)
        # The 'result' entry is only attached when a dictionary was found;
        # callers see no result otherwise.
        message.put('result', hm)
def compute_citations_counts(recids, dict_name):
    """Compute the number of cites for each recid.

    Input:
    - recids: iterable of record ids, e.g. [1, 2, 3, 5, 8]
    - dict_name: name of the cached citation dictionary to read,
      e.g. 'citations_weights'
    Output:
    - list of (recid, count) tuples sorted by descending count,
      e.g. [(3, 23), (1, 10), (2, 5), (5, 0), (8, 0)]
    """
    cites_count = get_citation_dict(dict_name)
    # Records absent from the dictionary count as 0 cites.
    counts = [(recid, cites_count.get(recid, 0)) for recid in recids]
    counts.sort(key=itemgetter(1), reverse=True)
    return counts
def calculate_citations(recids):
    """Break *recids* down into citation-count classes.

    The classes are the (low, high, name) bands declared in
    CFG_CITESUMMARY_FAME_THRESHOLDS.

    Returns (total_cites, recids_breakdown) where recids_breakdown maps a
    band name to the list of recids whose citation count falls in that band.
    """
    total_cites = 0
    recids_breakdown = {}

    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Small set: count on the fly; this includes (recid, 0) entries.
        cites_counts = compute_citations_counts(recids, 'citations_weights')
    else:
        # Large set: reuse the precomputed counts (cited records only).
        cites_counts = get_citation_dict('citations_counts')

    for recid, numcites in cites_counts:
        if recid in recids:
            total_cites += numcites
            for low, high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
                if low <= numcites <= high:
                    recids_breakdown.setdefault(name, []).append(recid)

    # The precomputed counts omit never-cited records entirely, so add them
    # to the low == 0 band in a single pass here — outside the scan loop,
    # so they cannot be appended once per scanned record — and skip any
    # recid the counts already reported with a 0 entry (small-set branch)
    # to avoid listing it twice.
    non_cited = recids - get_citation_dict("citations_keys")
    for low, dummy_high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
        if low == 0:
            band = recids_breakdown.setdefault(name, [])
            already_counted = set(band)
            band.extend(recid for recid in non_cited
                        if recid not in already_counted)

    return total_cites, recids_breakdown
def render_extended_citation_summary(req, ln, recids, collections,
                                     searchpattern, searchfield):
    """Write the extended citation summary page to *req*.

    For every incoming collection this renders three variants: the plain
    collection, one excluding self-citations and (on INSPIRE only) one
    excluding RPP records.
    """
    title = websearch_templates.tmpl_citesummary2_title(searchpattern, ln)
    req.write(title)

    initial_collections = collections
    collections_recids = get_recids(recids, collections)

    # Derived collection labels; the <br /> is rendered in the HTML table.
    def coll_self_cites(name):
        return name + '<br />excluding self cites'

    def coll_not_rpp(name):
        return name + '<br />excluding RPP'

    # Add self cites sets and "-title:rpp" sets
    if CFG_INSPIRE_SITE:
        notrpp_searchpattern = searchpattern + ' -title:rpp'
        notrpp_recids = intbitset(perform_request_search(p=notrpp_searchpattern))
    for coll, coll_recids in collections_recids.items():
        # Self-cite variant covers the same records; only the counts differ.
        collections_recids[coll_self_cites(coll)] = coll_recids
        if CFG_INSPIRE_SITE:
            collections_recids[coll_not_rpp(coll)] = notrpp_recids & coll_recids

    # Add self cites collections
    collections = []
    search_patterns = {}
    for coll, query in initial_collections:
        search_patterns[coll] = searchpattern
        search_patterns[coll_self_cites(coll)] = searchpattern
        if CFG_INSPIRE_SITE:
            search_patterns[coll_not_rpp(coll)] = notrpp_searchpattern
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
                (coll_not_rpp(coll), query),
            ]
        else:
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
            ]

    cites_counts = get_cites_counts(recids)
    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        selfcites_counts = compute_citations_counts(recids, 'selfcites_weights')
    else:
        selfcites_counts = get_citation_dict('selfcites_counts')

    citers_counts = {}
    for coll, dummy in initial_collections:
        citers_counts[coll] = cites_counts
        citers_counts[coll_self_cites(coll)] = selfcites_counts
        # NOTE(review): the not-RPP entry is set even when CFG_INSPIRE_SITE
        # is false; harmless, since that key is then absent from collections.
        citers_counts[coll_not_rpp(coll)] = cites_counts

    stats = {}
    for coll, dummy in collections:
        stats[coll] = compute_citation_stats(collections_recids[coll],
                                             citers_counts[coll])

    render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, collections_recids)
    render_citesummary_overview(req, ln, collections, stats)
    render_citations_breakdown(req, ln, collections, stats, search_patterns,
                               searchfield)
    render_h_index(req, ln, collections, stats)

    # 6) hcs epilogue:
    eplilogue = websearch_templates.tmpl_citesummary_epilogue()
    req.write(eplilogue)
    back_link = websearch_templates.tmpl_citesummary_back_link(searchpattern, ln)
    req.write(back_link)
def get_cites_counts(recids):
    """Return (recid, citation count) pairs covering *recids*.

    Counts on the fly for small sets; falls back to the precomputed
    counts for large ones.
    """
    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        return compute_citations_counts(recids, 'citations_weights')
    return get_citation_dict('citations_counts')
import os import cPickle from invenio import bibrank_citation_searcher as bcs from invenio import intbitset '''Utility to dump cached citation dictionary into a filesystem''' basedir = '/opt/rchyla/citdicts' cit_names = ['citationdict', 'reversedict', 'selfcitdict', 'selfcitedbydict'] for dname in cit_names: print 'loading: %s' % dname cd = bcs.get_citation_dict(dname) # load the dictionary f = os.path.join(basedir, dname) # dump it out fo = open(f, 'wb') print 'dumping of %s entries started' % len(cd) if isinstance(cd, intbitset): cPickle.dump(cd.fastdump(), fo) else: cPickle.dump(cd, fo) fo.close() print 'dumped %s into %s' % (dname, f)
use_refrec = lambda x: x use_ref = itemgetter(0, 1) use_rec = itemgetter(2) except: #python 2.4 compatibility, a bit slower than itemgetter use_refrec = lambda x: x use_ref = lambda x: x[0:2] use_rec = lambda x: x[2] # At first glance this may look silly. # However, if we load the dictionaries # uncoditionally there will be only # one instance of them in the memory after # fork cit_dict = get_citation_dict("citationdict") recit_dict = get_citation_dict("reversedict") caches = [] def create_new_cache(): ret = {} caches.append(ret) return ret def clear_all_caches(): for c in caches: c.clear()
def get_citation_dict(dictname):
    """Fetch the named cached citation dictionary via bibrank_citation_searcher."""
    citation_dict = bcs.get_citation_dict(dictname)
    return citation_dict
use_refrec = lambda x : x use_ref = itemgetter(0, 1) use_rec = itemgetter(2) except: #python 2.4 compatibility, a bit slower than itemgetter use_refrec = lambda x: x use_ref = lambda x: x[0:2] use_rec = lambda x: x[2] # At first glance this may look silly. # However, if we load the dictionaries # uncoditionally there will be only # one instance of them in the memory after # fork cit_dict = get_citation_dict("citationdict") recit_dict = get_citation_dict("reversedict") caches = [] def create_new_cache(): ret = {} caches.append(ret) return ret def clear_all_caches(): for c in caches: c.clear() _replacer = re.compile("[^a-zA-Z]")
def render_extended_citation_summary(req, ln, recids, collections,
                                     searchpattern, searchfield):
    """Write the extended citation summary page to *req*.

    For every incoming collection this renders three variants: the plain
    collection, one excluding self-citations and (on INSPIRE only) one
    excluding RPP records.
    """
    title = websearch_templates.tmpl_citesummary2_title(searchpattern, ln)
    req.write(title)

    initial_collections = collections
    collections_recids = get_recids(recids, collections)

    # Derived collection labels; the <br /> is rendered in the HTML table.
    def coll_self_cites(name):
        return name + '<br />excluding self cites'

    def coll_not_rpp(name):
        return name + '<br />excluding RPP'

    # Add self cites sets and "-title:rpp" sets
    if CFG_INSPIRE_SITE:
        notrpp_searchpattern = searchpattern + ' -title:rpp'
        notrpp_recids = intbitset(
            perform_request_search(p=notrpp_searchpattern))
    for coll, coll_recids in collections_recids.items():
        # Self-cite variant covers the same records; only the counts differ.
        collections_recids[coll_self_cites(coll)] = coll_recids
        if CFG_INSPIRE_SITE:
            collections_recids[coll_not_rpp(
                coll)] = notrpp_recids & coll_recids

    # Add self cites collections
    collections = []
    search_patterns = {}
    for coll, query in initial_collections:
        search_patterns[coll] = searchpattern
        search_patterns[coll_self_cites(coll)] = searchpattern
        if CFG_INSPIRE_SITE:
            search_patterns[coll_not_rpp(coll)] = notrpp_searchpattern
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
                (coll_not_rpp(coll), query),
            ]
        else:
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
            ]

    cites_counts = get_cites_counts(recids)
    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        selfcites_counts = compute_citations_counts(recids,
                                                    'selfcites_weights')
    else:
        selfcites_counts = get_citation_dict('selfcites_counts')

    citers_counts = {}
    for coll, dummy in initial_collections:
        citers_counts[coll] = cites_counts
        citers_counts[coll_self_cites(coll)] = selfcites_counts
        # NOTE(review): the not-RPP entry is set even when CFG_INSPIRE_SITE
        # is false; harmless, since that key is then absent from collections.
        citers_counts[coll_not_rpp(coll)] = cites_counts

    stats = {}
    for coll, dummy in collections:
        stats[coll] = compute_citation_stats(collections_recids[coll],
                                             citers_counts[coll])

    render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, collections_recids)
    render_citesummary_overview(req, ln, collections, stats)
    render_citations_breakdown(req, ln, collections, stats, search_patterns,
                               searchfield)
    render_h_index(req, ln, collections, stats)

    # 6) hcs epilogue:
    eplilogue = websearch_templates.tmpl_citesummary_epilogue()
    req.write(eplilogue)
    back_link = websearch_templates.tmpl_citesummary_back_link(
        searchpattern, ln)
    req.write(back_link)
num_proc = multiprocessing.cpu_count() except: num_proc = default POOL = multiprocessing.Pool(processes=num_proc) else: POOL = multiprocessing.Pool(processes=num_proc) # ====================================================== # Some code to execute on lazy-initialization # ====================================================== from invenio import bibrank_citation_searcher as bcs, \ search_engine_summarizer as ses # initialize citation dictionaries in parent (so that forks have them shared) bcs.get_citation_dict("citationdict") bcs.get_citation_dict("reversedict") # ====================================================== # Dispatching code # ====================================================== def dispatch(func_name, *args, **kwargs): """Dispatches the call to the remote worker""" g = globals() func_name_pre = '%s_local_pre' % func_name func_name_post = '%s_local_post' % func_name if func_name_pre in g: args = list(args)