예제 #1
0
def rank_by_citations(hitset, verbose):
    """Rank records by their number of citations.

    Calculate the cited-by values for all the members of the hitset.
    Returns: ((recordid, weight), prefix, postfix, message)
    """
    voutput = ""

    if len(hitset) > CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Big hitsets: scan the precomputed global counts instead of
        # computing per-record weights.
        cites_counts = get_citation_dict("citations_counts")
        # Intersect once so the membership tests below run against the
        # cited subset only (matches the sibling implementation).
        citedhitset = hitset & get_citation_dict("citations_keys")
        ret = [(recid, weight) for recid, weight in cites_counts
               if recid in citedhitset]
        # Records that were never cited are absent from the counts dict;
        # report them with weight 0.
        ret.extend([(recid, 0) for recid in hitset - citedhitset])
        # citations_counts is sorted by decreasing weight (see
        # compute_citations_counts); the ranker expects increasing weight.
        ret = list(reversed(ret))
    else:
        ret = get_cited_by_weight(hitset)
        ret.sort(key=itemgetter(1))

    if verbose > 0:
        voutput += "\nhitset %s\nrank_by_citations ret %s" % (hitset, ret)

    if ret:
        return ret, "(", ")", voutput
    else:
        return [], "", "", voutput
예제 #2
0
def rank_by_citations(hitset, verbose):
    """Rank by the amount of citations.

    Calculate the cited-by values for all the members of the hitset.
    Returns: ((recordid, weight), prefix, postfix, message)
    """
    voutput = ""

    if len(hitset) > CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Large hitset: walk the precomputed counts rather than computing
        # weights record by record.
        all_counts = get_citation_dict('citations_counts')
        cited = hitset & get_citation_dict('citations_keys')
        weights = []
        for pair in all_counts:
            if pair[0] in cited:
                weights.append(pair)
        # Never-cited records do not appear in the counts; add them with 0.
        weights.extend((recid, 0) for recid in hitset - cited)
        # The precomputed list is ordered by decreasing count; flip it.
        weights.reverse()
        ret = weights
    else:
        ret = get_cited_by_weight(hitset)
        ret.sort(key=itemgetter(1))

    if verbose > 0:
        voutput += "\nhitset %s\nrank_by_citations ret %s" % (hitset, ret)

    if ret:
        return ret, "(", ")", voutput
    else:
        return [], "", "", voutput
예제 #3
0
def get_citation_dict(message):
    """Handle a citation-dictionary request message.

    Reads the 'dictname' parameter from the message, looks the
    dictionary up via bibrank_citation_searcher and, when found, stores
    it back into the message under 'result' as a Java HashMap of
    String -> int[].  Does nothing when the dictionary is missing/empty.
    """
    dictname = sj.String.cast_(message.getParam('dictname'))
    citation_map = bcs.get_citation_dict(dictname)
    if not citation_map:
        return
    result = sj.HashMap().of_(sj.String, sj.JArray_int)
    for key, recid_list in citation_map.items():
        result.put(key, sj.JArray_int(recid_list))
    message.put('result', result)
def compute_citations_counts(recids, dict_name):
    """Return (recid, #cites) pairs for every requested record.

    Records absent from the named citation dictionary get a count of 0.
    The result is ordered by citation count, highest first, e.g.
    [(3, 23), (1, 10), (2, 5), (5, 0), (8, 0)].
    """
    citation_counts = get_citation_dict(dict_name)
    pairs = [(recid, citation_counts.get(recid, 0)) for recid in recids]
    return sorted(pairs, key=itemgetter(1), reverse=True)
def compute_citations_counts(recids, dict_name):
    """Compute the number of citations of each record.

    :param recids: iterable of record ids
    :param dict_name: name of the cached citation dictionary to consult
    :return: list of (recid, count) tuples sorted by count, descending;
        records missing from the dictionary are reported with count 0
    """
    cites_count = get_citation_dict(dict_name)
    counts = []
    for recid in recids:
        counts.append((recid, cites_count.get(recid, 0)))
    counts.sort(key=itemgetter(1), reverse=True)
    return counts
def calculate_citations(recids):
    """Classify records into citation classes defined by thresholds.

    :param recids: intbitset of record ids
    :return: (total_cites, recids_breakdown) where total_cites is the
        summed citation count of all records in recids and
        recids_breakdown maps each CFG_CITESUMMARY_FAME_THRESHOLDS class
        name to the list of record ids falling into that class.
    """
    total_cites = 0
    recids_breakdown = {}

    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Small sets: compute counts directly (includes 0-cite records).
        cites_counts = compute_citations_counts(recids, 'citations_weights')
    else:
        # Large sets: scan the precomputed global counts (cited records only).
        cites_counts = get_citation_dict('citations_counts')

    for recid, numcites in cites_counts:
        if recid in recids:
            total_cites += numcites
            for low, high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
                if low <= numcites <= high:
                    recids_breakdown.setdefault(name, []).append(recid)

    # Never-cited records are absent from the precomputed counts dict, so
    # add them to the 0-cites class here.  Doing this once, after the
    # loop, fixes the original bug where the extension ran once per
    # matching record, duplicating the non-cited ids many times (and
    # recomputing the set difference each time).
    for low, dummy_high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
        if low == 0:
            non_cited = recids - get_citation_dict("citations_keys")
            bucket = recids_breakdown.setdefault(name, [])
            already_counted = set(bucket)
            bucket.extend([recid for recid in non_cited
                           if recid not in already_counted])

    return total_cites, recids_breakdown
def calculate_citations(recids):
    """Classify records into citation classes defined by thresholds.

    :param recids: intbitset of record ids
    :return: (total_cites, recids_breakdown) where total_cites is the
        summed citation count of all records in recids and
        recids_breakdown maps each CFG_CITESUMMARY_FAME_THRESHOLDS class
        name to the list of record ids falling into that class.
    """
    total_cites = 0
    recids_breakdown = {}

    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        # Small sets: compute counts directly (includes 0-cite records).
        cites_counts = compute_citations_counts(recids, 'citations_weights')
    else:
        # Large sets: scan the precomputed global counts (cited records only).
        cites_counts = get_citation_dict('citations_counts')

    for recid, numcites in cites_counts:
        if recid in recids:
            total_cites += numcites
            for low, high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
                if low <= numcites <= high:
                    recids_breakdown.setdefault(name, []).append(recid)

    # Never-cited records are absent from the precomputed counts dict, so
    # add them to the 0-cites class here.  Doing this once, after the
    # loop, fixes the original bug where the extension ran once per
    # matching record, duplicating the non-cited ids many times (and
    # recomputing the set difference each time).
    for low, dummy_high, name in CFG_CITESUMMARY_FAME_THRESHOLDS:
        if low == 0:
            non_cited = recids - get_citation_dict("citations_keys")
            bucket = recids_breakdown.setdefault(name, [])
            already_counted = set(bucket)
            bucket.extend([recid for recid in non_cited
                           if recid not in already_counted])

    return total_cites, recids_breakdown
def render_extended_citation_summary(req, ln, recids, collections,
                                     searchpattern, searchfield):
    """Write the extended citation summary to the request.

    For every collection the summary also includes an "excluding self
    cites" variant and, on INSPIRE, an "excluding RPP" variant
    (records matching '-title:rpp').

    :param req: request object; all output is emitted via req.write()
    :param ln: language code forwarded to the templates
    :param recids: intbitset of record ids to summarize
    :param collections: list of (name, query) pairs
    :param searchpattern: original search pattern string
    :param searchfield: searched field, forwarded to the templates
    """
    req.write(websearch_templates.tmpl_citesummary2_title(searchpattern, ln))

    initial_collections = collections
    collections_recids = get_recids(recids, collections)

    def coll_self_cites(name):
        # Display label of the self-cites-excluded variant.
        return name + '<br />excluding self cites'

    def coll_not_rpp(name):
        # Display label of the RPP-excluded variant.
        return name + '<br />excluding RPP'

    # Add self cites sets and "-title:rpp" sets
    if CFG_INSPIRE_SITE:
        notrpp_searchpattern = searchpattern + ' -title:rpp'
        notrpp_recids = intbitset(perform_request_search(p=notrpp_searchpattern))
    for coll, coll_recids in collections_recids.items():
        # The self-cites variant covers the same records; only counts differ.
        collections_recids[coll_self_cites(coll)] = coll_recids
        if CFG_INSPIRE_SITE:
            collections_recids[coll_not_rpp(coll)] = notrpp_recids & coll_recids

    # Rebuild the collections list with the extra variants appended.
    collections = []
    search_patterns = {}
    for coll, query in initial_collections:
        search_patterns[coll] = searchpattern
        search_patterns[coll_self_cites(coll)] = searchpattern
        if CFG_INSPIRE_SITE:
            search_patterns[coll_not_rpp(coll)] = notrpp_searchpattern
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
                (coll_not_rpp(coll), query),
            ]
        else:
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
            ]

    cites_counts = get_cites_counts(recids)

    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        selfcites_counts = compute_citations_counts(recids, 'selfcites_weights')
    else:
        selfcites_counts = get_citation_dict('selfcites_counts')

    citers_counts = {}
    for coll, dummy in initial_collections:
        citers_counts[coll] = cites_counts
        citers_counts[coll_self_cites(coll)] = selfcites_counts
        if CFG_INSPIRE_SITE:
            # Only register the not-RPP counts when that variant is
            # actually rendered (was assigned unconditionally before,
            # leaving a dead key on non-INSPIRE sites).
            citers_counts[coll_not_rpp(coll)] = cites_counts

    stats = {}
    for coll, dummy in collections:
        stats[coll] = compute_citation_stats(collections_recids[coll],
                                             citers_counts[coll])

    render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, collections_recids)
    render_citesummary_overview(req, ln, collections, stats)
    render_citations_breakdown(req, ln, collections, stats, search_patterns,
                               searchfield)
    render_h_index(req, ln, collections, stats)

    # 6) hcs epilogue:
    epilogue = websearch_templates.tmpl_citesummary_epilogue()
    req.write(epilogue)

    back_link = websearch_templates.tmpl_citesummary_back_link(searchpattern, ln)
    req.write(back_link)
def get_cites_counts(recids):
    """Return (recid, count) pairs for the given records.

    Small sets are computed directly; large sets fall back to the
    precomputed global counts dictionary.
    """
    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        return compute_citations_counts(recids, 'citations_weights')
    return get_citation_dict('citations_counts')
예제 #10
0
import os
import cPickle
from invenio import bibrank_citation_searcher as bcs
from invenio import intbitset

'''Utility to dump cached citation dictionary into a filesystem'''

basedir = '/opt/rchyla/citdicts'

cit_names = ['citationdict',
             'reversedict', 'selfcitdict', 'selfcitedbydict']
             
for dname in cit_names:
    print 'loading: %s' % dname
    cd = bcs.get_citation_dict(dname)  # load the dictionary
    f = os.path.join(basedir, dname)  # dump it out
    fo = open(f, 'wb')
    print 'dumping of %s entries started' % len(cd)
    if isinstance(cd, intbitset):
        cPickle.dump(cd.fastdump(), fo)
    else:
        cPickle.dump(cd, fo)
    fo.close()
    print 'dumped %s into %s' % (dname, f)
    
    

예제 #11
0
    use_refrec = lambda x: x
    use_ref = itemgetter(0, 1)
    use_rec = itemgetter(2)
except:
    #python 2.4 compatibility, a bit slower than itemgetter
    use_refrec = lambda x: x
    use_ref = lambda x: x[0:2]
    use_rec = lambda x: x[2]

# At first glance this may look silly.
# However, if we load the dictionaries
# unconditionally there will be only
# one instance of them in the memory after
# fork

# Loaded eagerly at module import so that forked worker processes share
# the parent's single in-memory copy of these dictionaries.
cit_dict = get_citation_dict("citationdict")
recit_dict = get_citation_dict("reversedict")

# Registry of every cache handed out by create_new_cache().
caches = []


def create_new_cache():
    """Create, register and return a fresh, empty cache dictionary."""
    cache = {}
    caches.append(cache)
    return cache


def clear_all_caches():
    """Wipe the contents of every registered cache, in place."""
    for registered in caches:
        registered.clear()
예제 #12
0
def get_citation_dict(dictname):
    """Return the named cached citation dictionary.

    Thin wrapper delegating to bibrank_citation_searcher (bcs),
    e.g. dictname = "citationdict" or "reversedict".
    """
    return bcs.get_citation_dict(dictname)
예제 #13
0
    use_refrec = lambda x : x
    use_ref = itemgetter(0, 1)
    use_rec = itemgetter(2)
except:
    #python 2.4 compatibility, a bit slower than itemgetter
    use_refrec = lambda x: x
    use_ref = lambda x: x[0:2]
    use_rec = lambda x: x[2]

# At first glance this may look silly.
# However, if we load the dictionaries
# unconditionally there will be only
# one instance of them in the memory after
# fork

# Load both dictionaries at import time: after fork() the workers then
# share one in-memory instance instead of each loading their own.
cit_dict = get_citation_dict("citationdict")
recit_dict = get_citation_dict("reversedict")

# All caches created below are tracked here so they can be cleared together.
caches = []
def create_new_cache():
    """Allocate an empty dict, record it in the registry, and return it."""
    fresh = {}
    caches.append(fresh)
    return fresh


def clear_all_caches():
    """Empty every cache created through create_new_cache()."""
    for cache_dict in caches:
        cache_dict.clear()


# Matches every character that is not an ASCII letter.
_replacer = re.compile("[^a-zA-Z]")
def render_extended_citation_summary(req, ln, recids, collections,
                                     searchpattern, searchfield):
    """Write the extended citation summary to the request.

    For every collection the summary also includes an "excluding self
    cites" variant and, on INSPIRE, an "excluding RPP" variant
    (records matching '-title:rpp').

    :param req: request object; all output is emitted via req.write()
    :param ln: language code forwarded to the templates
    :param recids: intbitset of record ids to summarize
    :param collections: list of (name, query) pairs
    :param searchpattern: original search pattern string
    :param searchfield: searched field, forwarded to the templates
    """
    req.write(websearch_templates.tmpl_citesummary2_title(searchpattern, ln))

    initial_collections = collections
    collections_recids = get_recids(recids, collections)

    def coll_self_cites(name):
        # Display label of the self-cites-excluded variant.
        return name + '<br />excluding self cites'

    def coll_not_rpp(name):
        # Display label of the RPP-excluded variant.
        return name + '<br />excluding RPP'

    # Add self cites sets and "-title:rpp" sets
    if CFG_INSPIRE_SITE:
        notrpp_searchpattern = searchpattern + ' -title:rpp'
        notrpp_recids = intbitset(
            perform_request_search(p=notrpp_searchpattern))
    for coll, coll_recids in collections_recids.items():
        # The self-cites variant covers the same records; only counts differ.
        collections_recids[coll_self_cites(coll)] = coll_recids
        if CFG_INSPIRE_SITE:
            collections_recids[coll_not_rpp(coll)] = (notrpp_recids
                                                      & coll_recids)

    # Rebuild the collections list with the extra variants appended.
    collections = []
    search_patterns = {}
    for coll, query in initial_collections:
        search_patterns[coll] = searchpattern
        search_patterns[coll_self_cites(coll)] = searchpattern
        if CFG_INSPIRE_SITE:
            search_patterns[coll_not_rpp(coll)] = notrpp_searchpattern
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
                (coll_not_rpp(coll), query),
            ]
        else:
            collections += [
                (coll, query),
                (coll_self_cites(coll), query),
            ]

    cites_counts = get_cites_counts(recids)

    if len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD:
        selfcites_counts = compute_citations_counts(recids,
                                                    'selfcites_weights')
    else:
        selfcites_counts = get_citation_dict('selfcites_counts')

    citers_counts = {}
    for coll, dummy in initial_collections:
        citers_counts[coll] = cites_counts
        citers_counts[coll_self_cites(coll)] = selfcites_counts
        if CFG_INSPIRE_SITE:
            # Only register the not-RPP counts when that variant is
            # actually rendered (was assigned unconditionally before,
            # leaving a dead key on non-INSPIRE sites).
            citers_counts[coll_not_rpp(coll)] = cites_counts

    stats = {}
    for coll, dummy in collections:
        stats[coll] = compute_citation_stats(collections_recids[coll],
                                             citers_counts[coll])

    render_citesummary_prologue(req, ln, recids, collections, search_patterns,
                                searchfield, collections_recids)
    render_citesummary_overview(req, ln, collections, stats)
    render_citations_breakdown(req, ln, collections, stats, search_patterns,
                               searchfield)
    render_h_index(req, ln, collections, stats)

    # 6) hcs epilogue:
    epilogue = websearch_templates.tmpl_citesummary_epilogue()
    req.write(epilogue)

    back_link = websearch_templates.tmpl_citesummary_back_link(
        searchpattern, ln)
    req.write(back_link)
def get_cites_counts(recids):
    """Return (recid, count) pairs for the given records.

    Small sets are computed on the fly; large sets reuse the
    precomputed global counts dictionary.
    """
    small_set = len(recids) < CFG_WEBSEARCH_CITESUMMARY_SCAN_THRESHOLD
    if small_set:
        cites_counts = compute_citations_counts(recids, 'citations_weights')
    else:
        cites_counts = get_citation_dict('citations_counts')
    return cites_counts
            num_proc = multiprocessing.cpu_count()
        except:
            num_proc = default
        POOL = multiprocessing.Pool(processes=num_proc)
    else:
        POOL = multiprocessing.Pool(processes=num_proc)

# ======================================================
#    Some code to execute on lazy-initialization
# ======================================================

from invenio import bibrank_citation_searcher as bcs, \
    search_engine_summarizer as ses

# Initialize the citation dictionaries in the parent process so that
# forked workers share a single in-memory copy instead of each loading
# their own.
bcs.get_citation_dict("citationdict")
bcs.get_citation_dict("reversedict")


# ======================================================
#    Dispatching code
# ======================================================

def dispatch(func_name, *args, **kwargs):
    """Dispatches the call to the remote worker"""
    g = globals()
    func_name_pre = '%s_local_pre' % func_name
    func_name_post = '%s_local_post' % func_name

    if func_name_pre in g:
        args = list(args)