def get_self_citations_count(recids, algorithm='simple',
                             precompute=CFG_BIBRANK_SELFCITES_PRECOMPUTE):
    """Sum, over `recids`, the citations that are not self-citations.

    For each record the number of self-citations is subtracted from the
    number of citing records, and the differences are added together.

    The self-citations come from one of two sources:
      * `precompute` false: they are computed on the fly by the function
        registered under `algorithm` in ALL_ALGORITHMS;
      * `precompute` true: their counts are read from the pre-computed
        list; a record missing from that list counts as having none.

    Args:
     - recids: iterable of record ids
     - algorithm: key into ALL_ALGORITHMS (only used without precompute)
     - precompute: whether to use the pre-computed self-citation counts

    Returns the accumulated count as an integer.
    """
    if precompute:
        # Rows are indexed as (recid, number of self-citations)
        selfcites_by_recid = dict(
            (row[0], row[1])
            for row in get_precomputed_self_cites_list(recids))
        count = 0
        for recid in recids:
            citing = get_cited_by(recid)
            nb_selfcites = selfcites_by_recid.get(recid, 0)
            count += len(citing) - nb_selfcites
        return count

    tags = get_authors_tags()
    compute_selfcites = ALL_ALGORITHMS[algorithm]
    count = 0
    for recid in recids:
        citing = get_cited_by(recid)
        own = compute_selfcites(recid, tags)
        count += len(citing) - len(own)
    return count
def find_citations(rank_method_code, recID, hitset, verbose):
    """Rank by the amount of citations.

    If `recID` can be converted to an integer, the records citing that
    record are weighted; otherwise the members of `hitset` are weighted.
    The weighted list is sorted ascending by its second member (the weight).

    @param rank_method_code: not used by this function
    @param recID: a record id, or any non-numeric value to rank `hitset`
    @param hitset: the records to weight when `recID` is not numeric
    @param verbose: when > 0, debugging details are stored in the
        module-level `voutput`
    @return: (ret, "(", ")", "") when the weighted list is non-empty,
        ((), "", "", "") otherwise
    """
    global voutput
    voutput = ""

    # Only the conversion errors are caught, so that KeyboardInterrupt and
    # SystemExit are no longer silently swallowed here.
    try:
        recidint = int(recID)
    except (TypeError, ValueError, OverflowError):
        recisint = False
    else:
        recisint = True

    if recisint:
        # get_cited_by() result is passed straight to the weighting function
        ret = get_cited_by_weight(get_cited_by(recidint))
    else:
        ret = get_cited_by_weight(hitset)

    # Stable ascending sort on the second member of each item. A key
    # function is used because the positional cmp argument and the cmp()
    # builtin do not exist on Python 3.
    ret.sort(key=lambda item: item[1])

    if verbose > 0:
        voutput = voutput+"\nrecID "+str(recID)+" is int: "+str(recisint)+" hitset "+str(hitset)+"\n"+"find_citations retlist "+str(ret)

    if ret:
        return (ret, "(", ")", "")
    return ((), "", "", "")
def related_records(recids, recids_processed):
    """Add citing records to `recids` and drop the already processed ones.

    `fmt` and `latest_bibrank_run` are not parameters: they are read from
    the surrounding scope.

    For the "HDREF" format with a non-empty `recids`, the records of
    `recids` whose formatted modification date compares lower than
    `latest_bibrank_run` are selected, and the records citing them are
    added to `recids`.

    In every case `recids_processed` is then removed from `recids`, and
    the remaining `recids` are added to `recids_processed`.

    Both arguments are updated with augmented assignments, so mutable
    set-like arguments are presumably modified in place -- confirm against
    the callers.

    Returns `recids`.
    """
    if fmt == "HDREF" and recids:
        # HDREF is the references tab: it has to be regenerated when one
        # of the citations changes, not only when the record itself does
        id_list = ','.join(str(r) for r in recids)
        sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % id_list
        older_recids = intbitset([
            row_recid for row_recid, modified in run_sql(sql)
            if modified.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
        ])
        for cited_recid in older_recids:
            recids |= intbitset(get_cited_by(cited_recid))

    # Do not hand out a record that was processed before...
    recids -= recids_processed
    # ...and remember the ones that are handed out now
    recids_processed += recids
    return recids
def calculate_citation_history_coordinates(recid):
    """Return a list of citation graph coordinates for RECID, sorted by year.

    The years come from calculate_citation_graphe_x_coordinates(); each
    one starts with a count of zero. Every record citing RECID whose year
    can be parsed and is one of those years adds one to that year's count.

    Returns a list of (year, count) tuples sorted by year, or an empty
    list when there are fewer than
    CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS years.
    """
    result = {}
    for year in calculate_citation_graphe_x_coordinates(recid):
        result[year] = 0

    if len(result) < CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS:
        # do not generate graphs that have less than X points
        return []

    # A separate loop name is used so the `recid` argument is not shadowed
    for citer_recid in get_cited_by(recid):
        rec_date = get_record_year(citer_recid)
        # Some records simply do not have these fields
        if not rec_date:
            continue
        try:
            d = strptime(rec_date[0][:4], '%Y')
        except ValueError:
            # The first four characters may contain a typo and not be a
            # valid year; such a citation is left out of the graph
            continue
        if d.year in result:
            result[d.year] += 1

    # items() instead of the Python 2-only iteritems(): sorted() returns
    # the same list either way
    return sorted(result.items())
def compute_self_citations(recid, tags, authors_fun):
    """Compute the self-citations of a record

    We return the set of citing record ids that are considered
    self-citations (an empty set when the record is not cited).

    When the record has no authors or more than 20 of them, and it has
    collaborations, a citing record is a self-citation if it shares a
    collaboration with the record. Otherwise a citing record is a
    self-citation if `authors_fun` returns, for it, at least one of the
    authors of the record.

    Args:
     - recid: record id
     - tags: the tag number for author, coauthors, collaborations,
             required since it depends on how the marc was defined
     - authors_fun: function called as authors_fun(citing_recid, tags),
                    returning the authors to match against
    """
    citing_recids = get_cited_by(recid)
    if not citing_recids:
        return set()

    record_authors = frozenset(get_authors_from_record(recid, tags))
    record_collabs = None
    if not record_authors or len(record_authors) > 20:
        record_collabs = frozenset(
            get_collaborations_from_record(recid, tags))

    if record_collabs:
        # Match on collaboration names
        return set(
            citer for citer in citing_recids
            if record_collabs & frozenset(
                get_collaborations_from_record(citer, tags)))

    # Match on author names
    found = set()
    for citer in citing_recids:
        citer_authors = get_authors_from_record(citer, tags)
        check_collabs = not record_authors or len(citer_authors) > 20
        if check_collabs and get_collaborations_from_record(citer, tags):
            # A record from a collaboration citing a record from an
            # author is not counted as a self-citation
            continue
        if record_authors & frozenset(authors_fun(citer, tags)):
            found.add(citer)
    return found
def compute_self_citations(recid, tags, authors_fun):
    """Compute the self-citations of a record

    The result is the set of record ids, among the records citing
    `recid`, that count as self-citations. It is empty when nothing cites
    the record.

    Two strategies are used:
     - collaborations: chosen when the record has no authors or more than
       20 authors, and has collaborations; a citing record matches when
       it has a collaboration in common with the record
     - authors: chosen otherwise; a citing record matches when
       `authors_fun` returns one of the authors of the record for it

    Args:
     - recid: record id
     - tags: the tag number for author, coauthors, collaborations,
             required since it depends on how the marc was defined
     - authors_fun: callable taking (citing_recid, tags) and returning
                    the authors to compare with
    """
    citers = get_cited_by(recid)
    if not citers:
        return set()

    own_authors = frozenset(get_authors_from_record(recid, tags))
    if not own_authors or len(own_authors) > 20:
        own_collabs = frozenset(get_collaborations_from_record(recid, tags))
    else:
        own_collabs = None

    selfcites = set()
    if own_collabs:
        # Collaborations strategy
        for citer in citers:
            citer_collabs = frozenset(
                get_collaborations_from_record(citer, tags))
            if own_collabs & citer_collabs:
                selfcites.add(citer)
        return selfcites

    # Authors strategy
    for citer in citers:
        citer_authors = get_authors_from_record(citer, tags)
        if not own_authors or len(citer_authors) > 20:
            if get_collaborations_from_record(citer, tags):
                # Collaboration record citing an author record: skipped
                continue
        citer_coauthors = frozenset(authors_fun(citer, tags))
        if own_authors & citer_coauthors:
            selfcites.add(citer)
    return selfcites
def citations_nb_counts():
    """Get the number of citations of the record named in the request.

    The record id is read from the 'recid' entry of `request.view_args`.

    Returns None when there is no such entry, 0 when
    CFG_BIBRANK_SHOW_CITATION_LINKS is false, and otherwise the count:
    on a CFG_INSPIRE_SITE only the citing records that are also returned
    by the 'citeable' collection search are counted, elsewhere
    get_cited_by_count() gives the number.
    """
    recid = request.view_args.get('recid')
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import (get_cited_by,
                                                          get_cited_by_count)

    if not CFG_BIBRANK_SHOW_CITATION_LINKS:
        return 0

    if not CFG_INSPIRE_SITE:
        return get_cited_by_count(recid)

    from invenio.legacy.search_engine import search_unit
    citing = intbitset(get_cited_by(recid))
    citeable = search_unit(p='citeable', f='collection')
    return len(citing & citeable)
def citations_nb_counts():
    """Return how many citations the requested record has.

    The record is the one whose id is stored under "recid" in
    `request.view_args`; None is returned when that entry is missing.

    With CFG_BIBRANK_SHOW_CITATION_LINKS disabled the result is 0.
    Otherwise, on a CFG_INSPIRE_SITE, the result is the number of citing
    records that also belong to the result of the "citeable" collection
    search; on other sites it is the value of get_cited_by_count().
    """
    recid = request.view_args.get("recid")
    if recid is None:
        return

    from intbitset import intbitset
    from invenio.legacy.bibrank.citation_searcher import get_cited_by, get_cited_by_count

    nb_citations = 0
    if CFG_BIBRANK_SHOW_CITATION_LINKS:
        if not CFG_INSPIRE_SITE:
            nb_citations = get_cited_by_count(recid)
        else:
            from invenio.legacy.search_engine import search_unit

            citers = intbitset(get_cited_by(recid))
            citeable = search_unit(p="citeable", f="collection")
            nb_citations = len(citers & citeable)
    return nb_citations
def related_records(recids, recids_processed):
    """Complete `recids` with citing records, minus those already done.

    `fmt` and `latest_bibrank_run` are taken from the surrounding scope,
    not from the arguments.

    When `fmt` is "HDREF" and `recids` is not empty, the records citing
    the members of `recids` whose formatted modification date compares
    lower than `latest_bibrank_run` are added to `recids`.

    Afterwards the members of `recids_processed` are removed from
    `recids`, and what is left of `recids` is added to
    `recids_processed`. Augmented assignments are used for both
    arguments, so mutable set-like arguments are presumably changed in
    place -- confirm against the callers.

    Returns `recids`.
    """
    if fmt == "HDREF" and recids:
        # The references tab (HDREF) depends on the citations too, so a
        # change in one of them requires the tab to be recomputed
        sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids)

        def is_older_than_bibrank(row):
            # row is (id, modification_date)
            return row[1].strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

        rel_recids = intbitset(
            [row[0] for row in run_sql(sql) if is_older_than_bibrank(row)])
        for rel_recid in rel_recids:
            recids |= intbitset(get_cited_by(rel_recid))

    # Avoid processing a record twice
    recids -= recids_processed
    # Record what is going to be processed
    recids_processed += recids
    return recids
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Selects the records to reformat, formats them, optionally stores the
    run date and writes the final statistics.

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records). Some of these queries will be
                picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records to
                      reformat; its 'field', 'collection', 'pattern' and
                      'matching' entries are read
    @param process_format: when true ('-without' parameter), the records
                           returned by without_fmt(sql) are selected as
                           well, as a second list
    @param process: when true, the selected records are formatted;
                    otherwise they are only counted in the messages
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=cds_query['collection'],
                                                   p=cds_query['pattern'],
                                                   f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        # The stored run date becomes the date of the latest citation run
        start_date = latest_bibrank_run
        sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run
        recIDs = intbitset([recid for recid, mod_date in run_sql(sql)
                            if check_date(mod_date)])
        # Iterate over a snapshot: recIDs grows inside the loop, and a set
        # must not be modified while it is being iterated. Only the
        # records citing the selected ones are added.
        for r in list(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
                                             # when migration from php to
                                             # python bibformat is done
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_old(recIDs,
                                                                         fmt)
        else:
            (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over_new(recIDs,
                                                                         fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:  # FIXME: remove this
                                             # when migration from php to
                                             # python bibformat is done
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_old(without_format,
                                                                         fmt)
        else:
            (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over_new(without_format,
                                                                         fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    # os.times()[4] is the elapsed real time
    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)