def get_self_citations_count(recids, algorithm='simple', precompute=CFG_BIBRANK_SELFCITES_PRECOMPUTE):
    """Count the citations of RECIDS with self-citations left out.

    Depending on the site configuration the self-citations are either
      * computed on the fly with the selected algorithm, or
      * fetched from the pre-computed table.

    @param recids: record ids to count citations for
    @param algorithm: key into ALL_ALGORITHMS; only used when PRECOMPUTE
        is false
    @param precompute: when true, read self-citation counts from the
        pre-computed table instead of computing them
    @return: sum over RECIDS of (number of citers - number of
        self-citations)
    """
    if precompute:
        # Each row is used as (record id, number of self-citations).
        precomputed = dict((row[0], row[1])
                           for row in get_precomputed_self_cites_list(recids))
        return sum(len(get_cited_by(recid)) - precomputed.get(recid, 0)
                   for recid in recids)

    tags = get_authors_tags()
    find_self_cites = ALL_ALGORITHMS[algorithm]
    return sum(len(get_cited_by(recid)) - len(find_self_cites(recid, tags))
               for recid in recids)
def test_db_adding_and_removing_records(self):
    """Store citation data for a record, read it back, then remove it."""
    from invenio.bibrank_citation_searcher import get_cited_by
    from invenio.bibrank_citation_indexer import store_dicts

    recid = 42222
    referenced_recid = 43333

    # Add: RECID references one record and is cited by two others.
    store_dicts([recid],
                refs={recid: set([referenced_recid])},
                cites={recid: set([40000, 40001])})
    citers_once_added = get_cited_by(recid)
    citers_of_referenced_once_added = get_cited_by(referenced_recid)

    # Remove everything again *before* asserting, so the stored data is
    # cleared even when one of the checks below fails.
    store_dicts([recid], refs={recid: set()}, cites={recid: set()})

    self.assertEqual(citers_once_added, set([40000, 40001]))
    self.assertEqual(citers_of_referenced_once_added, set([recid]))
    self.assertEqual(get_cited_by(recid), set())
    self.assertEqual(get_cited_by(referenced_recid), set())
def calculate_citation_history_coordinates(recid):
    """Return the citation graph coordinates of RECID, sorted by year.

    @param recid: id of the record whose citation history is wanted
    @return: sorted list of (year, number of citing records dated that
        year) pairs, or [] when there are fewer than
        CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS years on the x axis
    """
    # Every year of the x axis starts with zero citations.
    per_year = dict.fromkeys(calculate_citation_graphe_x_coordinates(recid), 0)

    # Do not generate graphs that have fewer than X points.
    if len(per_year) < CFG_BIBRANK_CITATION_HISTORY_MIN_X_POINTS:
        return []

    for citer in get_cited_by(recid):
        year_field = get_record_year(citer)
        # Some records simply do not have these fields.
        if not year_field:
            continue
        # The first four characters may contain a typo and fail to parse
        # as a year; such records are ignored.
        try:
            cited_on = strptime(year_field[0][:4], '%Y')
        except ValueError:
            continue
        # Only years that are part of the x axis are counted.
        if cited_on.year in per_year:
            per_year[cited_on.year] += 1

    return sorted(per_year.items())
def related_records(recids, recids_processed):
    """Extend RECIDS with related records and drop the ones already done.

    `fmt` and `latest_bibrank_run` are not parameters; they are read from
    the surrounding scope.

    @param recids: set of record ids to process; it is updated through
        augmented assignments and returned
    @param recids_processed: set of record ids handled so far; the
        returned ids are added to it
    @return: RECIDS, extended for the HDREF format with the citers of
        the records last modified before the latest bibrank run, minus
        everything in RECIDS_PROCESSED
    """
    if fmt == "HDREF" and recids:
        # HDREF represents the references tab: it needs to be recomputed
        # not only when the record changes but also when one of the
        # citations changes.
        id_list = ','.join(str(r) for r in recids)
        query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % id_list
        older_recids = intbitset([
            row_id for row_id, modified in run_sql(query)
            if modified.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
        ])
        for older_recid in older_recids:
            recids |= intbitset(get_cited_by(older_recid))

    # Do not process a record twice ...
    recids -= recids_processed
    # ... and remember what is about to be processed.
    recids_processed += recids

    return recids
def find_citations(rank_method_code, recID, hitset, verbose):
    """Rank by the amount of citations.

    If RECID can be converted to an integer, the records citing that
    record are ranked; otherwise the members of HITSET are ranked.

    @param rank_method_code: not used by this function
    @param recID: record id (anything accepted by int()), or a
        non-numeric value to rank HITSET instead
    @param hitset: records to rank when RECID is not numeric
    @param verbose: when greater than 0, a debug trace is stored in the
        module-level `voutput`
    @return: tuple (ranked, prefix, postfix, message).  `ranked` is the
        result of get_cited_by_weight() sorted in ascending order by the
        second member of each item; when it is empty the tuple is
        ((), "", "", "")
    """
    global voutput
    voutput = ""

    # Only conversion failures mean "not a record id"; anything else
    # (KeyboardInterrupt, programming errors, ...) must propagate.
    try:
        recidint = int(recID)
    except (TypeError, ValueError, OverflowError):
        recisint = False
    else:
        recisint = True

    if recisint:
        # Return only the records that cite RECID.
        ret = get_cited_by_weight(get_cited_by(recidint))
    else:
        ret = get_cited_by_weight(hitset)

    # Ascending by the second member of the tuples.  The sort is stable,
    # so ties keep the order they had with the former cmp-based sort.
    ret.sort(key=lambda item: item[1])

    if verbose > 0:
        voutput = voutput+"\nrecID "+str(recID)+" is int: "+str(recisint)+" hitset "+str(hitset)+"\n"+"find_citations retlist "+str(ret)

    if ret:
        return (ret, "(", ")", "")
    return ((), "", "", "")
def find_cites(author):
    """
    Find and return all necessary components for plotting the data set.

    The papers of AUTHOR are fetched first.  For every record citing one
    of them, the year found in its '269__C' field is counted in a
    dictionary.  Years between the first and the last citation year that
    have no citation are then filled in with zero.

    Returned values:
        1. year_dict: a dictionary keyed by year whose values are the
           number of citations that occurred in that year
        2. start_year: the year of the first citation, or None when the
           author has no dated citation (the previous code raised
           UnboundLocalError in that case)
        3. lifetime_cites: the total number of counted citations, as a
           float, used to scale the final data set
    """
    print("# Author: %s" % (author,))
    papers = get_realauthor_data(author, 'bibrec_id')
    year_dict = {}
    for paper in papers:
        # paper[1] is the value passed to get_cited_by() as a record id.
        for cite in get_cited_by(int(paper[1])):
            year_values = get_fieldvalues(cite, '269__C')
            if not year_values:
                continue
            year_match = year_re.search(year_values[0])
            if not year_match:
                continue
            year = int(year_match.group())
            year_dict[year] = year_dict.get(year, 0) + 1

    if not year_dict:
        print("# Author has no citations")
        return year_dict, None, 0.0

    start_year = min(year_dict)
    end_year = max(year_dict)
    # Fill the gaps so that every year of the range can be plotted.
    for year in range(start_year, end_year + 1):
        year_dict.setdefault(year, 0)
    lifetime_cites = sum(year_dict.values())
    return year_dict, start_year, float(lifetime_cites)
def print_rec_ids(rec_ids):
    """Print one 'id value citations' line for every key of REC_IDS.

    The citation figure is the number of records citing the key that are
    also returned by the 'year:2009->2010' search.

    @param rec_ids: mapping from record id to the number printed in the
        'Clicks' column
    """
    records_in_period = intbitset(perform_request_search(p='year:2009->2010'))

    print("Rec ID, Clicks, Citations:")

    for recid in rec_ids:
        citers = intbitset(get_cited_by(recid))
        citers_in_period = citers & records_in_period
        print("%d %d %d" % (recid, rec_ids[recid], len(citers_in_period)))
def compute_self_citations(recid, tags, authors_fun):
    """Find which of the records citing RECID are self-citations.

    When RECID has no authors or more than 20 of them and it has
    collaborations, a citer is a self-citation if it shares a
    collaboration with RECID.  Otherwise a citer is a self-citation if
    the names returned by AUTHORS_FUN for it intersect the authors of
    RECID; citers that look like collaboration papers are skipped.

    Args:
     - recid: record id
     - tags: the tag number for author, coauthors, collaborations,
             required since it depends on how the marc was defined
     - authors_fun: callable invoked as authors_fun(citing_recid, tags);
                    its result is compared with the authors of RECID

    Returns the set of citing record ids considered self-citations
    (empty when RECID has no citers).
    """
    citers = get_cited_by(recid)
    if not citers:
        return set()

    authors = frozenset(get_authors_from_record(recid, tags))

    collaborations = None
    if not authors or len(authors) > 20:
        collaborations = frozenset(
            get_collaborations_from_record(recid, tags))

    if collaborations:
        # Compare on collaboration names.
        return set(
            cit for cit in citers
            if collaborations.intersection(
                frozenset(get_collaborations_from_record(cit, tags))))

    # Compare on author names.
    self_citations = set()
    for cit in citers:
        cit_authors = get_authors_from_record(cit, tags)
        cited_by_collaboration = \
            (not authors or len(cit_authors) > 20) and \
            get_collaborations_from_record(cit, tags)
        if cited_by_collaboration:
            # Record from a collaboration that cites a record from an
            # author: not a self-citation.
            continue
        if authors.intersection(frozenset(authors_fun(cit, tags))):
            self_citations.add(cit)

    return self_citations
def _parse_year_month(value):
    """Return (year, month) read from a 'YYYY-MM[-...]' string, or None."""
    parts = value.split('-')
    try:
        year, month = int(parts[0]), int(parts[1])
        datetime.date(year, month, 1)
    except (IndexError, ValueError):
        return None
    return year, month


def _year_range_query(start, end):
    """Build the 'year:' search pattern running from START to END.

    Both bounds are rendered as YYYY-MM.  A range spanning several
    calendar years is split at each year boundary (search bug
    workaround).
    """
    first = start.strftime("%Y-%m")
    last = end.strftime("%Y-%m")
    if start.year == end.year:
        return 'year:' + first + '->' + last
    pieces = ['year:' + first + '->' + str(start.year) + '-12-31']
    for year in range(start.year + 1, end.year):
        pieces.append('year:%d-01-01->%d-12-31' % (year, year))
    pieces.append('year:' + str(end.year) + '-01-01->' + last)
    return ' or '.join(pieces)


def print_rec_ids(rec_ids, offset=365):
    """Print citation counts within OFFSET and OFFSET/2 days of each record.

    Only records having a '269__c' date, a '037__a' report number and a
    '037__c' category are reported; records whose date has no usable
    year and month are skipped instead of aborting the whole report.
    One CSV line is printed per record:
    id, clicks, date, report number, category, number of citers dated
    within OFFSET days, number of citers dated within OFFSET/2 days
    (both periods start on the first day of the record's month).

    @param rec_ids: mapping from record id to the number printed in the
        'Clicks' column
    @param offset: length of the long period, as a whole number of days
    """
    print("Rec ID, Clicks,date, arXiv, Citations(1yr), Citations(6mo):")

    output = []
    for key in rec_ids:
        dates = get_fieldvalues(key, '269__c')
        if not dates or _parse_year_month(dates[0]) is None:
            continue
        reps = get_fieldvalues(key, '037__a')
        if not reps:
            continue
        cats = get_fieldvalues(key, '037__c')
        if not cats:
            continue
        output.append([key, rec_ids[key], dates[0], reps[0], cats[0]])

    output.sort(key=lambda record: record[2])

    # The two searches are only repeated when the date string differs
    # from the YYYY-MM value they were last computed for.
    searched_month = ''
    for record in output:
        if record[2] != searched_month:
            year, month = _parse_year_month(record[2])
            start = datetime.date(year, month, 1)
            half_end = start + datetime.timedelta(offset // 2)
            full_end = start + datetime.timedelta(offset)
            searched_month = start.strftime("%Y-%m")
            print("%s %s %s" % (searched_month,
                                half_end.strftime("%Y-%m"),
                                full_end.strftime("%Y-%m")))
            # Each query gets its own year split: the long period can
            # cross a year boundary when the short one does not.
            half_period = intbitset(perform_request_search(
                p=_year_range_query(start, half_end)))
            full_period = intbitset(perform_request_search(
                p=_year_range_query(start, full_end)))

        citers = intbitset(get_cited_by(record[0]))
        print('%d,%d,%s,%s,%s,%d,%d' % (record[0], record[1], record[2],
                                        record[3], record[4],
                                        len(citers & full_period),
                                        len(citers & half_period)))
def related_records(recids, recids_processed):
    """Return RECIDS plus related records, without the processed ones.

    For the HDREF format the records citing any member of RECIDS that
    was last modified before the latest bibrank run are added.  `fmt`
    and `latest_bibrank_run` come from the surrounding scope, not from
    the parameters.

    @param recids: set of record ids; updated through augmented
        assignments and returned
    @param recids_processed: set of already processed record ids; the
        returned ids are added to it
    @return: RECIDS after the update
    """
    def modified_before_rank_run(stamp):
        # Dates are compared as 'YYYY-MM-DD HH:MM:SS' strings.
        return stamp.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

    if fmt == "HDREF" and recids:
        # HDREF represents the references tab; it has to be recomputed
        # when the record changes and also when one of its citations
        # changes.
        joined_ids = ','.join(str(r) for r in recids)
        date_query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % joined_ids
        rows = run_sql(date_query)
        candidates = intbitset([row_id for row_id, stamp in rows
                                if modified_before_rank_run(stamp)])
        for candidate in candidates:
            recids |= intbitset(get_cited_by(candidate))

    # To not process recids twice
    recids -= recids_processed
    # Adds to the set of processed recids
    recids_processed += recids

    return recids
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records). Some of these queries will be
                picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records to
                      reformat; read through its 'field', 'collection',
                      'pattern' and 'matching' keys
    @param process_format: when true ('-without' parameter), the records
                           without cache are selected and reported too
    @param process: when true, the selected records are formatted;
                    otherwise they are only counted
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None, of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        date_query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        # NOTE(review): the modification date is compared with
        # latest_bibrank_run as is; confirm both have the same type.
        recIDs = intbitset([recid for recid, mod_date in run_sql(date_query)
                            if mod_date < latest_bibrank_run])
        # Iterate over a snapshot: adding citers to the set that is being
        # iterated would make the records visited depend on their ids.
        for r in list(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    # FIXME: remove this when migration from php to python bibformat
    # is done
    if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
        iterate_over = iterate_over_old
    else:
        iterate_over = iterate_over_new

    ### Iterate over all records prepared in lists I (option)
    if process:
        (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over(without_format, fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def find_citesb(author):
    """
    Collect the citations of AUTHOR grouped by paper year and citation
    year.

    This definition plays the same role as the definition of a similar
    name, but it builds a different dictionary: it is keyed by the year
    in which a paper was published, and each value is another dictionary
    mapping the year of a citation to the number of citations.
    Ex: {paper year: {citation year: count}}

    Papers without a usable year in '269__C' are skipped, and so are
    citing records without one.

    Returned values:
        1. year_dict: the nested dictionary described above
        2. start_year: the earliest paper year in year_dict, or None
           when no citation was counted (the previous code raised
           UnboundLocalError in that case)
        3. end_year: the latest citation year seen, 0 when there is none
        4. lifetime_cites: the total number of counted citations, as a
           float
    """
    print("# Author: %s" % (author,))
    papers = get_realauthor_data(author, 'bibrec_id')
    year_dict = {}
    end_year = 0
    for paper in papers:
        # paper[1] is the value passed on as a record id.
        paper_id = int(paper[1])
        paper_yearlist = get_fieldvalues(paper_id, '269__C')
        if not paper_yearlist:
            continue
        paper_year_match = year_re.search(paper_yearlist[0])
        if not paper_year_match:
            continue
        paper_year = int(paper_year_match.group())

        for cite in get_cited_by(paper_id):
            cite_yearlist = get_fieldvalues(cite, '269__C')
            if not cite_yearlist:
                continue
            cite_year_match = year_re.search(cite_yearlist[0])
            if not cite_year_match:
                continue
            cite_year = int(cite_year_match.group())
            if cite_year > end_year:
                end_year = cite_year
            cites_per_year = year_dict.setdefault(paper_year, {})
            cites_per_year[cite_year] = cites_per_year.get(cite_year, 0) + 1

    if not year_dict:
        print("# Author has no citations")
        return year_dict, None, end_year, 0.0

    start_year = min(year_dict)
    lifetime_cites = sum(sum(cites_per_year.values())
                         for cites_per_year in year_dict.values())
    return year_dict, start_year, end_year, float(lifetime_cites)
def _find_citations(bib):
    """Return get_cited_by() for the third item of BIB.

    BIB is an indexable object; presumably bib[2] is a record id
    (to be confirmed against the callers).
    """
    recid = bib[2]
    return get_cited_by(recid)
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records). Some of these queries will be
                picked depending on the case
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records to
                      reformat; read through its 'field', 'collection',
                      'pattern' and 'matching' keys
    @param process_format: when true ('-without' parameter), the records
                           without cache are selected and reported too
    @param process: when true, the selected records are formatted;
                    otherwise they are only counted
    @param recids: a list of record IDs to reformat; it is copied, the
                   caller's object is left untouched
    @return: None
    """
    t1 = os.times()[4]

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    # Work on a copy: the selection below is built with in-place unions,
    # which would otherwise modify the caller's object (and fail when it
    # is a plain list).
    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=cds_query['collection'],
                                                   p=cds_query['pattern'],
                                                   f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    # An empty selection is skipped: it would produce "id in ()", which
    # is not a valid query.
    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        date_query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        # NOTE(review): the modification date is compared with
        # latest_bibrank_run as is; confirm both have the same type.
        recIDs = intbitset([recid for recid, mod_date in run_sql(date_query)
                            if mod_date < latest_bibrank_run])
        # Iterate over a snapshot: adding citers to the set that is being
        # iterated would make the records visited depend on their ids.
        for r in list(recIDs):
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    # FIXME: remove this when migration from php to python bibformat
    # is done
    if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
        iterate_over = iterate_over_old
    else:
        iterate_over = iterate_over_new

    ### Iterate over all records prepared in lists I (option)
    if process:
        (total_rec_1, tbibformat_1, tbibupload_1) = iterate_over(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Iterate over all records prepared in list II (no_format)
    if process_format and process:
        (total_rec_2, tbibformat_2, tbibupload_2) = iterate_over(without_format, fmt)
        total_rec += total_rec_2
        tbibformat += tbibformat_2
        tbibupload += tbibupload_2

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)