def eprints():
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%')
    date_range = ['1904', '1905', '1906']
    #date_range = range(1, 20)
    for yymm in date_range:
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \
                   yymm + '*"'
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f) / float(length) * 100.0
        except ZeroDivisionError:
            ratio = 0
        print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f,
                                                   ratio)
        total += length
    print "Total =", total
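# A minimal, self-contained sketch (not part of the original script) of the
# intersection idiom eprints() relies on: search results come back as plain
# lists of record IDs, and filtering them against a collection reclist is a
# set AND. Assumes only the standalone `intbitset` package; the IDs are
# invented for illustration.
from intbitset import intbitset

def demo_collection_filter():
    collection = intbitset([1, 2, 3, 5, 8, 13])   # e.g. a collection reclist
    hits = [2, 3, 4, 5, 21]                       # e.g. search output
    filtered = intbitset(hits) & collection
    assert list(filtered) == [2, 3, 5]
    return filtered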
def tokenize_for_phrases(self, recID):
    """Get the country names and country codes of the institutions
       affiliated with the authors of the publication
    """
    # Get the names of the affiliated institutions
    institution_names = []
    for tag in self.institution_tags:
        institution_names += get_fieldvalues(recID, tag)

    # Get the hitset of all the institutes
    institution_collection_hitset = intbitset([])
    for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        institution_collection_hitset += get_collection_reclist(collection)

    # Search for the institution name and get a list of institution ids
    institution_ids = intbitset([])
    for name in institution_names:
        if name.strip():
            result_hitset = search_pattern(p=name,
                                           f=self.institution_name_field)
            institution_hitset = result_hitset & institution_collection_hitset
            institution_ids += list(institution_hitset)

    # Get the country tokens
    tokens = []
    for instID in institution_ids:
        tokens += self._tokenize_from_country_name_tag(instID)
        tokens += self._tokenize_from_country_code_tag(instID)

    # Remove duplicates
    tokens = list(set(tokens))

    return tokens
def get_normalized_ranking_scores(response, hitset_filter=None, recids=[]):
    """
    Returns the ranked result with scores normalized to the interval [0, 100].

    hitset_filter - optional filter for the results
    recids - optional recids that shall remain in the result despite the filter
    """
    if not len(response.results):
        return ([], intbitset())

    # response.maxScore is not reliable once extra documents have been added
    # to the response, so take the score of the first (top-ranked) hit instead
    max_score = float(response.results[0]['score'])

    ranked_result = []
    matched_recs = intbitset()

    for hit in response.results:
        recid = int(hit['id'])
        if (not hitset_filter and hitset_filter != []) or recid in hitset_filter or recid in recids:
            normalised_score = 0
            if max_score > 0:
                normalised_score = int(100.0 / max_score * float(hit['score']))
            ranked_result.append((recid, normalised_score))
            matched_recs.add(recid)

    ranked_result.reverse()
    return (ranked_result, matched_recs)
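# A short runnable sketch (invented scores, not Solr output) of the
# normalisation rule above: each raw score maps to int(100 / max * score),
# so the best hit gets 100 and the rest scale linearly.
def demo_normalize(scores):
    max_score = float(scores[0])
    return [int(100.0 / max_score * float(s)) for s in scores]

assert demo_normalize([4.0, 2.0, 1.0]) == [100, 50, 25]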
def tmpl_citations_box(self, summarize_records, pubs, ln, add_box=True,
                       loading=False):
    _ = gettext_set_language(ln)

    if CFG_INSPIRE_SITE:
        addition = ' (from papers in INSPIRE)'
    else:
        addition = ''
    line1 = "<strong>" + _("Citations%s" % addition) + "</strong>"

    if not loading:
        summarize_records, rec_query = summarize_records
        for i in summarize_records[0].keys():
            summarize_records[0][i] = intbitset(summarize_records[0][i])
        str_buffer = StringIO()
        render_citation_summary(str_buffer, ln, intbitset(pubs),
                                stats=summarize_records,
                                searchpattern=rec_query,
                                searchfield="")
        str_buffer.write(websearch_templates.tmpl_citesummary_footer())
        line2 = str_buffer.getvalue()
        line2 = '<span style="white-space: nowrap;">' + line2 + "</span>"
    else:
        line2 = self.loading_html()

    if add_box:
        citations_box = self.tmpl_print_searchresultbox('citations',
                                                        line1, line2)
        return citations_box
    else:
        return line2
def generate_list_to_send(search):
    '''Generate a list to send to MSNET.'''

    filename = 'tmp_' + __file__
    filename = re.sub(r'\.py$', '_send.txt', filename)
    output = open(filename, 'w')
    recids_nomatch = find_recids_nomatch()
    print search
    result_m = perform_request_search(p=search, cc='HEP')
    print search, len(result_m)
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    search = "0247_2:doi"
    result_d = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) & intbitset(result_d) - intbitset(result_i)
    result = result - intbitset(recids_nomatch)
    for recid in result:
        try:
            doi = get_fieldvalues(recid, '0247_a')[0]
        except IndexError:
            # doi is unbound when the lookup fails, so do not print it here
            print 'Problem with:', recid
            break
        output.write(str(recid) + ',' + doi + '\n')
    output.close()
    print filename
def filter_out_based_on_date_range(recids, fromdate="", untildate="",
                                   set_spec=None):
    """ Filter out recids based on date range."""
    if fromdate:
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    if untildate:
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    if set_spec is not None: ## either it has a value or it is empty, thus meaning all records
        last_updated = get_set_last_update(set_spec)
        if last_updated is not None:
            last_updated = utc_to_localtime(last_updated)
            if last_updated > fromdate:
                fromdate = utc_to_localtime(get_earliest_datestamp())

    recids = intbitset(recids) ## Let's clone :-)

    if fromdate and untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s",
            (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date >= %s",
            (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date <= %s",
            (untildate, )))

    return recids - get_all_restricted_recids()
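# A tiny runnable sketch (standalone, intbitset only) of the "let's clone"
# idiom above: wrapping an intbitset in intbitset() copies it, so in-place
# operators such as &= leave the caller's set untouched.
from intbitset import intbitset

original = intbitset([1, 2, 3, 4])
clone = intbitset(original)
clone &= intbitset([2, 4, 6])
assert list(clone) == [2, 4]
assert list(original) == [1, 2, 3, 4]   # caller's set unchanged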
def test_get_ranked_larger_hitset(self):
    """solrutils - ranking larger hitset"""
    hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
    self.assertEqual(tuple(),
                     self._get_ranked_result_sequence(query='Willnotfind',
                                                      hitset=hitset))

    hitset = intbitset.intbitset([47, 56, 55, 56, 58, 68, 85, 89])
    self.assertEqual((55, 56),
                     self._get_ranked_result_sequence(query='"higgs boson"',
                                                      hitset=hitset))
def get_data_for_definition_marc(tags, recids):
    '''Having a list of tags and a list of recids, it returns a dictionary
    with the values corresponding to the tags'''
    #x = all_recids; [get_fieldvalues(recid, '037__a') for recid in x]
    #user: 140s, sys: 21s, total: 160s - cdsdev
    if isinstance(recids, (int, long)):
        recids = intbitset([recids, ])
    # for each recid we need only one value on which we sort,
    # so we can stop looking for a value as soon as we find one
    tag_index = 0
    field_data_dict = {}
    while len(recids) > 0 and tag_index < len(tags):
        write_message('%s records queried for values for tags %s.' \
                      % (len(recids), tags), verbose=5)
        res = _get_values_from_marc_tag(tags[tag_index], recids)
        res_dict = dict(res)
        #field_data_dict.update(res_dict)
        #we can not use this, because res_dict might contain recids that are
        #already in field_data_dict, and we should not overwrite their value
        field_data_dict = dict(res_dict, **field_data_dict)
        #there might be keys that we do not want (ex: using 'between'),
        #so we should remove them
        res_dict_keys = intbitset(res_dict.keys())
        recids_not_needed = res_dict_keys.difference(recids)
        for recid in recids_not_needed:
            del field_data_dict[recid]
        #update recids to contain only the recids that do not have values yet
        recids.difference_update(res_dict_keys)
        tag_index += 1
    return field_data_dict
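# A runnable sketch (plain Python, invented values) of the merge idiom above:
# dict(new, **old) starts from the new values and then re-applies the old
# ones, so keys already present in field_data_dict win over res_dict. Note
# that passing int keys through ** is a CPython 2 quirk this code relies on;
# Python 3 rejects non-string keyword names.
field_data_dict = {1: 'kept'}
res_dict = {1: 'ignored', 2: 'added'}
merged = dict(res_dict, **field_data_dict)
assert merged == {1: 'kept', 2: 'added'}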
def test_set_consistence(self):
    """intbitset - set consistence"""
    tests = (
        (
            (20, 30, 1000, 40),
            'x\x9cc`\x10p``d\x18\x18\x80d/\x00*\xb6\x00S',
            'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
        ),
        (
            (20, 30, 1000, 41),
            'x\x9cc`\x10p``b\x18\x18\xc0\x88`\x02\x00+9\x00T',
            'x\x9cc`\x10p`\x18(\xf0\x1f\x01\x00k\xe6\x0bF'
        ),
        (
            (20, 30, 1001, 41),
            'x\x9cc`\x10p``b\x18\x18\x80d/\x00+D\x00U',
            'x\x9cc`\x10p`\x18(\xf0\xef?\x1c\x00\x00k\xdb\x0bE'
        )
    )
    for original, dumped, dumped_trails in tests:
        intbitset1 = intbitset(original)
        intbitset2 = intbitset(original, trailing_bits=True)
        intbitset3 = intbitset(dumped)
        intbitset4 = intbitset(dumped_trails)
        self._helper_sanity_test(intbitset1)
        self._helper_sanity_test(intbitset2)
        self._helper_sanity_test(intbitset3)
        self._helper_sanity_test(intbitset4)
        self.assertEqual(intbitset1.fastdump(), dumped)
        self.assertEqual(intbitset1, intbitset3)
        self.assertEqual(intbitset2.fastdump(), dumped_trails)
        self.assertEqual(intbitset2, intbitset4)
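# A runnable round-trip sketch (intbitset only): fastdump() serialises a set
# to a compressed string and fastload() restores it, which is what the
# hard-coded dumps in the test above pin down across versions.
from intbitset import intbitset

original = intbitset([20, 30, 1000, 40])
blob = original.fastdump()
restored = intbitset()
restored.fastload(blob)
assert restored == original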
def test_pickling(self):
    """intbitset - pickling"""
    import cPickle
    for set1 in self.sets + [[]]:
        self.assertEqual(intbitset(set1),
                         cPickle.loads(cPickle.dumps(intbitset(set1), -1)))
    for set1 in self.sets + [[]]:
        self.assertEqual(intbitset(set1, trailing_bits=True),
                         cPickle.loads(cPickle.dumps(intbitset(set1, trailing_bits=True), -1)))
def query_records(params):
    """Produce record IDs from given query parameters.

    By passing the appropriate CLI options, we can query here for
    additional records.
    """
    write_message("Querying database (records query)...")
    res = intbitset()
    if params['field'] or params['collection'] or params['pattern']:
        if not params['collection']:
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=params['pattern'],
                                 f=params['field'],
                                 m=params['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None,
                                                   of='id',
                                                   c=params['collection'],
                                                   p=params['pattern'],
                                                   f=params['field']))
    return res
def process_affiliations(record_ids=None, all_records=False):
    name = 'affiliations'

    if all_records:
        records = intbitset(run_sql("SELECT id FROM bibrec"))
        start_time = datetime.now()
    elif record_ids:
        records = intbitset(record_ids)
        start_time = None
    else:
        dummy_last_recid, last_updated = fetch_last_updated(name)
        start_time = datetime.now()
        sql = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        records = intbitset(run_sql(sql, [last_updated.isoformat(),
                                          start_time.isoformat()]))

    records_iter = iter(records)
    processed_records_count = 0
    while True:
        task_sleep_now_if_required()
        chunk = list(islice(records_iter, CHUNK_SIZE))
        if not chunk:
            break
        process_and_store(chunk)
        processed_records_count += len(chunk)
        task_update_progress('processed %s out of %s records'
                             % (processed_records_count, len(records)))

    if start_time:
        store_last_updated(None, start_time, name)
def get_recids_for_set_spec(set_spec):
    """
    Returns the list (as intbitset) of recids belonging to 'set'

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    recids = intbitset()
    for set_def in get_set_definitions(set_spec):
        new_recids = perform_request_search(c=[coll.strip() for coll in
                                               set_def['c'].split(',')],
                                            p1=set_def['p1'],
                                            f1=set_def['f1'],
                                            m1=set_def['m1'],
                                            op1=set_def['op1'],
                                            p2=set_def['p2'],
                                            f2=set_def['f2'],
                                            m2=set_def['m2'],
                                            op2=set_def['op2'],
                                            p3=set_def['p3'],
                                            f3=set_def['f3'],
                                            m3=set_def['m3'],
                                            ap=0)
        recids |= intbitset(new_recids)

    return recids
def missing_caches(fmt, chunk_size=100000):
    """Produces record IDs to be formatted, because their fmt cache is missing

    @param fmt: format to query for
    @return: intbitset of record IDs without a pre-created format cache
    """
    write_message("Querying database for records without cache...")
    # https://mariadb.com/kb/en/library/subqueries-and-joins/
    # "select id from bibrec left join bibfmt on id=id_bibrec where
    # id_bibrec is NULL" is slow; the subquery is a lot faster here
    return intbitset(run_sql(
        'select id from bibrec where id not in (select id_bibrec from bibfmt where format=%s)',
        (fmt, )))

    # NOTE: the chunked scan below is unreachable because of the return
    # above; kept as written for reference
    all_recids = intbitset()
    max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0
    for start in xrange(1, max_id + 1, chunk_size):
        end = start + chunk_size
        sql = "SELECT id FROM bibrec WHERE id BETWEEN %s AND %s"
        recids = intbitset(run_sql(sql, (start, end)))
        sql = """SELECT id_bibrec FROM bibfmt
                 WHERE id_bibrec BETWEEN %s AND %s
                 AND format = %s"""
        # ids in this range that already have a cache entry for this format
        with_fmt = intbitset(run_sql(sql, (start, end, fmt)))
        all_recids += recids - with_fmt
    return all_recids
def setUp(self):
    self.sets = [
        [1024],
        [10, 20],
        [10, 40],
        [60, 70],
        [60, 80],
        [10, 20, 60, 70],
        [10, 40, 60, 80],
        [1000],
        [10000],
        [23, 45, 67, 89, 110, 130, 174, 1002, 2132, 23434],
        [700, 2000],
        range(1000, 1100),
        [30], [31], [32], [33],
        [62], [63], [64], [65],
        [126], [127], [128], [129]
    ]
    self.fncs_list = [
        (intbitset.__and__, set.__and__, int.__and__, False),
        (intbitset.__or__, set.__or__, int.__or__, False),
        (intbitset.__xor__, set.__xor__, int.__xor__, False),
        (intbitset.__sub__, set.__sub__, int.__sub__, False),
        (intbitset.__iand__, set.__iand__, int.__and__, True),
        (intbitset.__ior__, set.__ior__, int.__or__, True),
        (intbitset.__ixor__, set.__ixor__, int.__xor__, True),
        (intbitset.__isub__, set.__isub__, int.__sub__, True),
    ]
    self.cmp_list = [
        (intbitset.__eq__, set.__eq__, lambda x, y: cmp(x, y) == 0),
        (intbitset.__ge__, set.__ge__, lambda x, y: cmp(x, y) >= 0),
        (intbitset.__gt__, set.__gt__, lambda x, y: cmp(x, y) > 0),
        (intbitset.__le__, set.__le__, lambda x, y: cmp(x, y) <= 0),
        (intbitset.__lt__, set.__lt__, lambda x, y: cmp(x, y) < 0),
        (intbitset.__ne__, set.__ne__, lambda x, y: cmp(x, y) != 0),
    ]
    self.big_examples = [list(intbitset(CFG_INTBITSET_BIG_EXAMPLE))]
    self.corrupted_strdumps = [
        "ciao",
        intbitset([2, 6000000]).strbits(),
        "djflsdkfjsdljfsldkfjsldjlfk",
    ]
def __init__(self, name=""):
    "Creates collection instance by querying the DB configuration database about 'name'."
    self.calculate_reclist_run_already = 0 # to speed things up without much refactoring
    self.update_reclist_run_already = 0 # to speed things up without much refactoring
    self.reclist_with_nonpublic_subcolls = intbitset()
    # used to store the temporary result of the calculation of nbrecs of an external collection
    self.nbrecs_tmp = None
    if not name:
        self.name = CFG_SITE_NAME # by default we are working on the home page
        self.id = 1
        self.dbquery = None
        self.nbrecs = None
        self.reclist = intbitset()
    else:
        self.name = name
        try:
            res = run_sql("""SELECT id,name,dbquery,nbrecs,reclist FROM collection
                              WHERE name=%s""", (name,))
            if res:
                self.id = res[0][0]
                self.name = res[0][1]
                self.dbquery = res[0][2]
                self.nbrecs = res[0][3]
                try:
                    self.reclist = intbitset(res[0][4])
                except:
                    self.reclist = intbitset()
            else: # collection does not exist!
                self.id = None
                self.dbquery = None
                self.nbrecs = None
                self.reclist = intbitset()
        except Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            sys.exit(1)
def filter_out_based_on_date_range(recids, fromdate="", untildate=""):
    """ Filter out recids based on date range."""
    if fromdate != "":
        fromdate = normalize_date(fromdate, "T00:00:00Z")
    else:
        fromdate = get_earliest_datestamp()
    fromdate = utc_to_localtime(fromdate)

    if untildate != "":
        untildate = normalize_date(untildate, "T23:59:59Z")
    else:
        untildate = get_latest_datestamp()
    untildate = utc_to_localtime(untildate)

    recids = intbitset(recids) ## Let's clone :-)

    if fromdate and untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date BETWEEN %s AND %s",
            (fromdate, untildate)))
    elif fromdate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date >= %s",
            (fromdate, )))
    elif untildate:
        recids &= intbitset(run_sql(
            "SELECT id FROM bibrec WHERE modification_date <= %s",
            (untildate, )))

    return recids - get_all_restricted_recids()
def eprints():
    total = 0
    fermilab = get_collection_reclist('Fermilab')
    print '{0:4s} {1:3s} {2:3s} {3:3s}'.format('Date', 'All', 'FNA', '%')
    date_range = ['1901', '1902', '1903']
    date_range = range(1, 20)  # NB: this overrides the explicit list above
    for yymm in date_range:
        yymm = str(yymm)
        if len(yymm) == 1:
            yymm = '0' + yymm
        search_f = '037__a:fermilab* 037__c:physics.acc-ph 037__a:"arXiv:' + \
                   yymm + '*"'
        search = '037__c:physics.acc-ph 037__a:"arXiv:' + yymm + '*"'
        x = perform_request_search(p=search, cc='HEP')
        search = '037__c:acc-phys 037__a:"acc-phys/' + yymm + '*"'
        y = perform_request_search(p=search, cc='HEP')
        x_f = intbitset(x) & fermilab
        y_f = intbitset(y) & fermilab
        length = len(x) + len(y)
        length_f = len(x_f) + len(y_f)
        try:
            ratio = float(length_f) / float(length) * 100.0
        except ZeroDivisionError:
            ratio = 0
        print '{0:4s} {1:3d} {2:3d} {3:3f}'.format(yymm, length, length_f,
                                                   ratio)
        total += length
    print "Total =", total
def main(key, value, start, end):
    '''Add up all citations over a period.'''
    search = 'find {0} {1} and topcite 1+'.format(key, value)
    if key == 'exp':
        search = 'find {0} {1}* and topcite 1+'.format(key, value)
    entity_papers = intbitset(perform_request_search(p=search, cc='HEP'))
    citation_list = get_cited_by_list(entity_papers)
    citation_dict = dict((cite[0], intbitset(cite[1]))
                         for cite in citation_list)
    print 'The {0} papers of {1}'.format(len(entity_papers), value)
    all_papers = {}
    years = range(start, end)
    for year in years:
        search = 'earliestdate:' + str(year)
        all_papers[year] = intbitset(perform_request_search(p=search,
                                                            cc='HEP'))
    citations_year = {}
    total = 0
    for year in years:
        citations_year[year] = 0
        for entity_paper in entity_papers:
            citations_year[year] += len(citation_dict[entity_paper] &
                                        all_papers[year])
        total += citations_year[year]
        print '{0:6d}\t{1:6d}\t{2:6d}'.format(year, citations_year[year],
                                              total)
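# A runnable sketch (invented record IDs) of the per-year counting step
# above: with citations stored as intbitsets, "citations received in year Y"
# is len(cites & papers_published_in_Y) summed over the entity's papers.
from intbitset import intbitset

citation_dict = {101: intbitset([5, 6, 9]), 102: intbitset([6, 7])}
papers_2010 = intbitset([5, 6])
count_2010 = sum(len(cites & papers_2010)
                 for cites in citation_dict.itervalues())
assert count_2010 == 3   # 5 and 6 from paper 101, 6 from paper 102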
def outdated_caches(fmt, last_updated, chunk_size=2000000):
    sql = """SELECT br.id
             FROM bibrec AS br
             INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id
             WHERE br.modification_date >= %s
             AND bf.format = %s
             AND bf.last_updated < br.modification_date
             AND br.id BETWEEN %s AND %s"""

    # look back 4 hours most of the time; in roughly 2% of runs use a
    # year-long window, presumably as a safety net for anything a short
    # window would miss
    random.seed()
    if random.random() < 0.98:
        tdelta = timedelta(hours=4)
    else:
        tdelta = timedelta(days=365)
    last_updated_str = (last_updated - tdelta).strftime('%Y-%m-%d %H:%M:%S')

    write_message("Querying database for outdated cache since %s"
                  % last_updated_str)

    recids = intbitset()
    max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] or 0
    for start in xrange(1, max_id + 1, chunk_size):
        end = start + chunk_size
        recids += intbitset(run_sql(sql, (last_updated_str, fmt, start, end)))
    return recids
def test_no_segmentation_fault(self):
    """intbitset - test no segmentation fault with foreign data types"""
    for intbitset_fnc, set_fnc, dummy, dummy in self.fncs_list:
        self.assertRaises(TypeError, intbitset_fnc,
                          (intbitset([1, 2, 3]), set([1, 2, 3])))
        self.assertRaises(TypeError, set_fnc,
                          (set([1, 2, 3]), intbitset([1, 2, 3])))
def test_set_getitem(self):
    """intbitset - __getitem__"""
    for set1 in self.sets + [[]]:
        intbitset1 = intbitset(set1)
        pythonlist1 = list(set1)
        for i in xrange(-2 * len(set1) - 2, 2 * len(set1) + 2):
            try:
                res1 = pythonlist1[i]
            except IndexError:
                self.assertRaises(IndexError, intbitset1.__getitem__, i)
                continue
            res2 = intbitset1[i]
            self.assertEqual(res1, res2)
    for set1 in self.sets + [[]]:
        intbitset1 = intbitset(set1)
        pythonlist1 = list(set1)
        for start in xrange(-2 * len(set1) - 2, 2 * len(set1) + 2):
            for stop in xrange(-2 * len(set1) - 2, 2 * len(set1) + 2):
                for step in xrange(1, 3):
                    res1 = pythonlist1[start:stop:step]
                    res2 = intbitset1[start:stop:step]
                    self.assertEqual(res1, list(res2),
                                     "Failure with set %s, start %s, stop %s,"
                                     " step %s, found %s, expected %s,"
                                     " indices: %s"
                                     % (set1, start, stop, step, list(res2),
                                        res1,
                                        slice(start, stop, step).indices(len(pythonlist1))))
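# A runnable sketch (intbitset only): indexing and slicing an intbitset
# behaves like indexing the sorted list of its members, which is exactly the
# invariant the test above checks against a plain Python list.
from intbitset import intbitset

ibs = intbitset([10, 20, 30, 40, 50])
assert ibs[0] == 10
assert list(ibs[1:4:2]) == [20, 40]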
def related_records(recids, recids_processed):
    if fmt == "HDREF" and recids:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        sql = """SELECT id, modification_date FROM bibrec
                 WHERE id in (%s)""" % ','.join(str(r) for r in recids)

        def check_date(mod_date):
            return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

        rel_recids = intbitset([recid for recid, mod_date in run_sql(sql)
                                if check_date(mod_date)])
        for r in rel_recids:
            recids |= intbitset(get_cited_by(r))

    # To not process recids twice
    recids -= recids_processed
    # Adds to the set of processed recids
    recids_processed += recids

    return recids
def tokenize_for_phrases(self, recID):
    """Get the country names and country codes of the institutions
       affiliated with the authors of the publication
    """
    # Get the names of the affiliated institutions
    institution_names = []
    for tag in self.institution_tags:
        institution_names += get_fieldvalues(recID, tag)

    # Get the hitset of all the institutes
    institution_collection_hitset = intbitset([])
    for collection in CFG_WEBSEARCH_INSTITUTION_COLLECTIONS:
        institution_collection_hitset += get_collection_reclist(collection)

    # Search for the institution name and get a list of institution ids
    institution_ids = intbitset([])
    for name in institution_names:
        result_hitset = search_pattern(
            p=name,
            f=self.institution_name_field
        )
        institution_hitset = result_hitset & institution_collection_hitset
        institution_ids += list(institution_hitset)

    # Get the country tokens
    tokens = []
    for instID in institution_ids:
        tokens += self._tokenize_from_country_name_tag(instID)
        tokens += self._tokenize_from_country_code_tag(instID)

    # Remove duplicates
    tokens = list(set(tokens))

    return tokens
def get_records_with_num_cites(numstr, allrecs=intbitset([])):
    """Return an intbitset of record IDs that are cited X times,
       X defined in numstr.
       Warning: numstr is string and may not be numeric! It can
       be 10,0->100 etc
    """
    cache_cited_by_dictionary = get_citation_dict("citationdict")
    cache_cited_by_dictionary_keys = get_citation_dict("citationdict_keys")
    cache_cited_by_dictionary_keys_intbitset = get_citation_dict("citationdict_keys_intbitset")
    matches = intbitset([])
    #once again, check that the parameter is a string
    if not isinstance(numstr, str):
        return intbitset([])
    numstr = numstr.replace(" ", '')
    numstr = numstr.replace('"', '')

    num = 0
    #first, check if numstr is just a number
    singlenum = re.findall(r"(^\d+$)", numstr)
    if singlenum:
        num = int(singlenum[0])
        if num == 0:
            #we return recids that are not in keys
            return allrecs - cache_cited_by_dictionary_keys_intbitset
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) == num:
                matches.add(k)
        return matches

    #try to get 1->10 or such
    firstsec = re.findall(r"(\d+)->(\d+)", numstr)
    if firstsec:
        first = 0
        sec = -1
        try:
            first = int(firstsec[0][0])
            sec = int(firstsec[0][1])
        except ValueError:
            return intbitset([])
        if first == 0:
            #start with those that have no cites..
            matches = allrecs - cache_cited_by_dictionary_keys_intbitset
        if first <= sec:
            for k in cache_cited_by_dictionary_keys:
                li = cache_cited_by_dictionary[k]
                if len(li) >= first:
                    if len(li) <= sec:
                        matches.add(k)
        return matches

    firstsec = re.findall(r"(\d+)\+", numstr)
    if firstsec:
        first = firstsec[0]
        for k in cache_cited_by_dictionary_keys:
            li = cache_cited_by_dictionary[k]
            if len(li) > int(first):
                matches.add(k)
    return matches
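# A runnable sketch (plain Python) of the three numstr shapes parsed above:
# "7" (exact count), "1->10" (range) and "10+" (more than), using the same
# regular expressions.
import re

assert re.findall(r"(^\d+$)", "7") == ["7"]
assert re.findall(r"(\d+)->(\d+)", "1->10") == [("1", "10")]
assert re.findall(r"(\d+)\+", "10+") == ["10"]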
def get_citedby_hitset(ahitset, input_limit=None):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for search engine's citedby:author:ellis feature.

    The parameter 'input_limit' is the maximum number of records of 'ahitset'
    to consider. If it is None (the default value) all the records will be
    used.
    """
    out = intbitset()
    if ahitset:
        try:
            iter(ahitset)
        except OverflowError:
            # ignore attempt to iterate over infinite ahitset
            pass
        else:
            # We don't want to overwrite the input parameter
            if input_limit is not None:
                limited_ahitset = ahitset[:input_limit]
            else:
                limited_ahitset = ahitset
            in_sql = ','.join('%s' for dummy in limited_ahitset)
            rows = run_sql("""SELECT citee FROM rnkCITATIONDICT
                              WHERE citer IN (%s)""" % in_sql,
                           limited_ahitset)
            out = intbitset(rows)
    return out
def unlinked(req):
    """
    Return a page listing the 'useful' person IDs (those with external IDs
    or claimed papers) that are not yet linked to an INSPIREID.
    """
    from invenio.dbquery import run_sql
    from invenio.search_engine import get_fieldvalues, get_collection_reclist
    useful_personids1 = intbitset(run_sql("SELECT distinct personid FROM aidPERSONIDDATA WHERE tag LIKE 'extid:%'"))
    useful_personids2 = intbitset(run_sql("SELECT distinct personid from aidPERSONIDPAPERS where flag=2"))
    linked_personids = intbitset(run_sql("SELECT personid FROM aidPERSONIDDATA WHERE tag='extid:INSPIREID'"))
    names = dict(run_sql("SELECT personid, data FROM aidPERSONIDDATA WHERE tag='canonical_name'"))
    matched_names = [name.lower().strip()
                     for name in get_fieldvalues(get_collection_reclist('HepNames'), '035__a')]
    personid_to_match = (useful_personids1 | useful_personids2) - linked_personids

    body = ['<ol>']
    for personid in personid_to_match:
        name = names.get(personid, str(personid))
        if name.lower().strip() in matched_names:
            continue
        body.append('<li><a href="%(siteurl)s/author/profile/%(bai)s" target="_blank">%(bai)s</a></li>'
                    % {'siteurl': escape(CFG_SITE_SECURE_URL, True),
                       'bai': escape(name, True)})
    body.append('</ol>')
    body = '\n'.join(body)

    return page(req=req, body=body, title="Unlinked useful BAIs")
def tmpl_papers_box(self, req, pubs, bibauthorid_data, num_downloads, ln,
                    add_box=True, loading=False):
    _ = gettext_set_language(ln)
    if not loading and pubs:
        ib_pubs = intbitset(pubs)
        if bibauthorid_data["cid"]:
            baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["cid"])
        elif bibauthorid_data["pid"] > -1:
            baid_query = 'exactauthor:%s' % wrap_author_name_in_quotes_if_needed(bibauthorid_data["pid"])
        baid_query = baid_query + " "

        rec_query = baid_query
        searchstr = create_html_link(websearch_templates.build_search_url(p=rec_query),
                                     {},
                                     "<strong>" + "All papers (" + str(len(pubs)) + ")" + "</strong>",)

        line2 = searchstr

        if CFG_BIBRANK_SHOW_DOWNLOAD_STATS and num_downloads:
            line2 += " (" + _("downloaded") + " "
            line2 += str(num_downloads) + " " + _("times") + ")"

        if CFG_INSPIRE_SITE:
            CFG_COLLS = ['Book', 'ConferencePaper', 'Introductory',
                         'Lectures', 'Preprint', 'Published', 'Review',
                         'Thesis']
        else:
            CFG_COLLS = ['Article', 'Book', 'Preprint', ]

        collsd = {}
        for coll in CFG_COLLS:
            coll_papers = list(ib_pubs & intbitset(perform_request_search(rg=0, f="collection", p=coll)))
            if coll_papers:
                collsd[coll] = coll_papers
        colls = collsd.keys()
        colls.sort(lambda x, y: cmp(len(collsd[y]), len(collsd[x]))) # sort by number of papers
        for coll in colls:
            rec_query = baid_query + 'collection:' + wrap_author_name_in_quotes_if_needed(coll)
            line2 += "<br />" + create_html_link(websearch_templates.build_search_url(p=rec_query),
                                                 {},
                                                 coll + " (" + str(len(collsd[coll])) + ")",)

    elif not pubs and not loading:
        line2 = _("No Papers")

    elif loading:
        line2 = self.loading_html()

    else:
        line2 = 'This is a bug and should be corrected'

    if not add_box:
        return line2
    line1 = "<strong>" + _("Papers") + "</strong>"
    papers_box = self.tmpl_print_searchresultbox("papers", line1, line2)
    return papers_box
def bst_prodsync(method='afs', with_citations='yes', with_claims='yes',
                 skip_collections=''):
    """
    Synchronize to either 'afs' or 'redis'

    with_citations: yes/no, whether records that now match a record will
        need to be re-exported.
    with_claims: yes/no, whether records involved in some new claim need
        to be re-exported.
    skip_collections: comma-separated list of values for which records
        having 980:VALUE should be ignored,
        e.g. skip_collections='HEP,HEPNAMES,HEPHIDDEN'
    """
    if not CFG_REDIS_HOST_LABS:
        method = 'afs'

    write_message("Prodsync started using %s method" % method)
    now = datetime.datetime.now()
    future_lastrun = now.strftime('%Y-%m-%d %H:%M:%S')
    lastrun_path = os.path.join(CFG_TMPSHAREDDIR, 'prodsync_%s_lastrun.txt' % method)
    try:
        last_run = open(lastrun_path).read().strip()
        write_message("Syncing records modified since %s" % last_run)
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec WHERE modification_date>=%s", (last_run, )))
            compacttime = last_run.replace('-', '').replace(' ', '').replace(':', '')
            notimechangerecs = search_unit("%s->20250101000000" % compacttime, f='005', m='a')
            modified_records += notimechangerecs
            if with_citations.lower() == 'yes':
                for citee, citer in run_sql("SELECT citee, citer FROM rnkCITATIONDICT WHERE last_updated>=%s", (last_run, )):
                    modified_records.add(citer)
            if with_claims.lower() == 'yes':
                modified_records |= intbitset(run_sql("SELECT bibrec FROM aidPERSONIDPAPERS WHERE last_updated>=%s", (last_run, )))
                modified_records |= intbitset(run_sql('SELECT bibrec FROM aidPERSONIDPAPERS AS p JOIN aidPERSONIDDATA as d'
                                                      ' ON p.personid = d.personid WHERE d.tag = "canonical_name" and d.last_updated>=%s', (last_run, )))
    except IOError:
        # Default to everything
        with run_ro_on_slave_db():
            modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
        write_message("Syncing all records")

    skip_collections = skip_collections.split(',')
    if '' in skip_collections:
        skip_collections.remove('')
    for collection in skip_collections:
        modified_records -= search_pattern(p='980:%s' % collection)

    if not modified_records:
        write_message("Nothing to do")
        return True

    tot = len(modified_records)
    time_estimator = get_time_estimator(tot)
    write_message("Adding %s new or modified records" % tot)
    if method == 'afs':
        afs_sync(reversed(modified_records), time_estimator, tot, now)
        open(lastrun_path, "w").write(future_lastrun)
        write_message("DONE!")
    else:
        if redis_sync(reversed(modified_records), time_estimator, tot):
            open(lastrun_path, "w").write(future_lastrun)
            write_message("DONE!")
        else:
            write_message("Skipping prodsync: Redis queue is not yet empty")
def test_tuple_of_tuples(self):
    """intbitset - support tuple of tuples"""
    for set1 in self.sets + [[]]:
        tmp_tuple = tuple([(elem, ) for elem in set1])
        self.assertEqual(list(intbitset(set1)), list(intbitset(tmp_tuple)))
    for set1 in self.sets + [[]]:
        tmp_tuple = tuple([(elem, ) for elem in set1])
        self.assertEqual(intbitset(set1, trailing_bits=True),
                         intbitset(tmp_tuple, trailing_bits=True))
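# A runnable sketch (intbitset only) of why this matters in practice:
# database APIs such as run_sql() return rows as 1-tuples, and intbitset
# accepts that shape directly, which is why the snippets elsewhere in this
# corpus can write intbitset(run_sql("SELECT id FROM bibrec")).
from intbitset import intbitset

rows = ((10, ), (20, ), (30, ))   # shaped like a SQL result set
assert list(intbitset(rows)) == [10, 20, 30]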
def test_set_repr(self):
    """intbitset - Pythonic representation"""
    for set1 in self.sets + [[]]:
        intbitset1 = intbitset(set1)
        self.assertEqual(intbitset1, eval(repr(intbitset1)))
    for set1 in self.sets + [[]]:
        intbitset1 = intbitset(set1, trailing_bits=True)
        self.assertEqual(intbitset1, eval(repr(intbitset1)))
def get_institution_ids(text):
    # HACK: I know... I am sorry for that. It's for a good cause
    # FIXME: use redis
    global INSTITUTION_CACHE
    if text not in INSTITUTION_CACHE:
        INSTITUTION_CACHE[text] = intbitset(perform_request_search(cc='Institutions', p='110__u:"%s"' % text)) or \
                                  intbitset(perform_request_search(cc='Institutions', p='110__t:"%s"' % text))
    return INSTITUTION_CACHE[text]
def get_records_for_user(qid, uid):
    key = get_search_results_cache_key_from_qid(qid)
    data = search_results_cache.get(key)
    if data is None:
        return intbitset([])
    cc = search_results_cache.get(key + '::cc')
    return get_records_that_can_be_displayed(current_user,
                                             intbitset().fastload(data),
                                             cc)
def test_get_ranked_smaller_hitset(self):
    """solrutils - ranking smaller hitset"""
    hitset = intbitset.intbitset([47, 56, 58, 68, 85, 89])
    self.assertEqual((47, 56, 58, 68, 89, 85),
                     self._get_ranked_result_sequence(query='higgs',
                                                      hitset=hitset))

    hitset = intbitset.intbitset([45, 50, 61, 74, 94])
    self.assertEqual((50, 61, 74, 45, 94),
                     self._get_ranked_result_sequence(query='of',
                                                      hitset=hitset))
    self.assertEqual((74, 45, 94),
                     self._get_ranked_result_sequence(query='of',
                                                      hitset=hitset, rows=3))
def find_records():
    '''Looks for candidate records.'''
    search = "find fc g not fc m not fc t and tc p and jy " + str(YEAR)
    result_m = perform_request_search(p=search, cc='HEP')
    search = "035__9:msnet"
    result_i = perform_request_search(p=search, cc='HEP')
    result = intbitset(result_m) - intbitset(result_i)
    return result
def test_set_repr(self):
    """intbitset - Pythonic representation"""
    for set1 in self.sets + [[]] + self.big_examples:
        intbitset1 = intbitset(set1)
        self.assertEqual(intbitset1, eval(repr(intbitset1)))
    for set1 in self.sets + [[]] + self.big_examples:
        intbitset1 = intbitset(set1, trailing_bits=True)
        self.assertEqual(intbitset1, eval(repr(intbitset1)))
def calculate_reclist(self):
    """Calculate, set and return the (reclist,
       reclist_with_nonpublic_subcolls) tuple for the given collection."""

    if self.calculate_reclist_run_already or str(self.dbquery).startswith("hostedcollection:"):
        # do we have to recalculate?
        return (self.reclist, self.reclist_with_nonpublic_subcolls)

    write_message("... calculating reclist of %s" % self.name, verbose=6)

    reclist = intbitset() # will hold results for public sons only; good for storing into DB
    reclist_with_nonpublic_subcolls = intbitset() # will hold results for both public and nonpublic sons;
                                                  # good for deducing total number of documents

    if not self.dbquery:
        # A - collection does not have dbquery, so query recursively all its sons
        #     that are either non-restricted or that have the same restriction rules
        for coll in self.get_sons():
            coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
            if ((coll.restricted_p() is None) or
                (coll.restricted_p() == self.restricted_p())):
                # add this reclist ``for real'' only if it is public
                reclist.union_update(coll_reclist)
            reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)
    elif self.dbquery and self.get_sons():
        # B - collection has both a dbquery and sons: first query recursively
        #     all its sons that are either non-restricted or that have the
        #     same restriction rules
        for coll in self.get_sons():
            coll_reclist, coll_reclist_with_nonpublic_subcolls = coll.calculate_reclist()
            if ((coll.restricted_p() is None) or
                (coll.restricted_p() == self.restricted_p())):
                # add this reclist ``for real'' only if it is public
                reclist.union_update(coll_reclist)
            reclist_with_nonpublic_subcolls.union_update(coll_reclist_with_nonpublic_subcolls)
        # then compute the collection's own dbquery:
        # (note: explicitly remove DELETED records)
        reclist_self = None
        if CFG_CERN_SITE:
            reclist_self = search_pattern_parenthesised(None, self.dbquery + \
                                            ' -980__:"DELETED" -980__:"DUMMY"')
        else:
            reclist_self = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"')
        reclist.union_update(reclist_self)
        self_reclist_with_nonpublic_subcolls = copy.deepcopy(reclist_self)
        reclist_with_nonpublic_subcolls.union_update(self_reclist_with_nonpublic_subcolls)
    else:
        # C - collection has a dbquery but no sons, so compute it:
        #     (note: explicitly remove DELETED records)
        if CFG_CERN_SITE:
            reclist = search_pattern_parenthesised(None, self.dbquery + \
                                    ' -980__:"DELETED" -980__:"DUMMY"')
        else:
            reclist = search_pattern_parenthesised(None, self.dbquery + ' -980__:"DELETED"')
        reclist_with_nonpublic_subcolls = copy.deepcopy(reclist)

    # store the results:
    self.nbrecs = len(reclist_with_nonpublic_subcolls)
    self.reclist = reclist
    self.reclist_with_nonpublic_subcolls = reclist_with_nonpublic_subcolls

    # last but not least, update the speed-up flag:
    self.calculate_reclist_run_already = 1

    # return the two sets:
    return (self.reclist, self.reclist_with_nonpublic_subcolls)
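# A runnable sketch (intbitset only, invented reclists) of the accumulation
# pattern above: union_update() is the in-place union used to merge each
# son's reclist into the parent's.
from intbitset import intbitset

parent = intbitset([1, 2])
for son_reclist in (intbitset([2, 3]), intbitset([5])):
    parent.union_update(son_reclist)
assert list(parent) == [1, 2, 3, 5]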
def test_record_sorter(self):
    """bibrank record sorter - sorting records"""
    hitset = intbitset()
    hitset += (1, 2, 5)
    hitset2 = intbitset()
    hitset2.add(5)
    rec_termcount = {1: 1, 2: 1, 5: 1}
    (res1, res2) = bibrank_word_searcher.sort_record_relevance(
        {1: 50, 2: 30, 3: 70, 4: 10}, rec_termcount, hitset, 50, 0)
    self.assertEqual(([(1, 71), (3, 100)], list(hitset2)),
                     (res1, list(res2)))
def test_set_clear(self):
    """intbitset - clearing"""
    for set1 in self.sets + [[]]:
        intbitset1 = intbitset(set1)
        intbitset1.clear()
        self.assertEqual(list(intbitset1), [])

        intbitset1 = intbitset(set1, trailing_bits=True)
        intbitset1.clear()
        self.assertEqual(list(intbitset1), [])
def citation(rank_method_code, related_to, hitset, rank_limit_relevance,
             verbose):
    """Sort records by number of citations"""
    if related_to:
        from invenio.search_engine import search_pattern
        hits = intbitset()
        for pattern in related_to:
            hits |= hitset & intbitset(search_pattern(p='refersto:%s' % pattern))
    else:
        hits = hitset
    return rank_by_citations(hits, verbose)
def test_marshalling(self):
    """intbitset - marshalling"""
    for set1 in self.sets + [[]]:
        self.assertEqual(intbitset(set1),
                         intbitset().fastload(intbitset(set1).fastdump()))
    for set1 in self.sets + [[]]:
        self.assertEqual(intbitset(set1, trailing_bits=True),
                         intbitset().fastload(intbitset(set1, trailing_bits=True).fastdump()))
def get_citedby_hitset(ahitset):
    """
    Return a hitset of records that are cited by records in the given
    ahitset. Useful for search engine's citedby:author:ellis feature.
    """
    cache_cited_by_dictionary = get_citation_dict("reversedict")
    out = intbitset()
    if ahitset:
        for recid in ahitset:
            out = out | intbitset(cache_cited_by_dictionary.get(recid, []))
    return out
def print_rec_ids(rec_ids):
    complete_paper_list = intbitset(perform_request_search(p='year:2009->2010'))
    print "Rec ID, Clicks, Citations:"
    for key in rec_ids:
        paper_citation_list = intbitset(get_cited_by(key))
        narrowed_citation_count = len(paper_citation_list & complete_paper_list)
        print "%d %d %d" % (key, rec_ids[key], narrowed_citation_count)