def fetch_index_update():
    """Return the date up to which the citation data can be trusted.

    The starting point is the last update of the 'citation' rank method.
    When CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID is enabled, the last
    bibauthorid update is taken into account as well and the earlier of
    the two dates is returned.
    """
    citation_date = get_bibrankmethod_lastupdate('citation')
    if not CFG_BIBRANK_SELFCITES_USE_BIBAUTHORID:
        return citation_date
    # Both sources must have caught up, hence the minimum of the two
    return min(citation_date, fetch_bibauthorid_last_update())
def fetch_concerned_records(name, ids_param):
    """Select the records to work on and the end date of the selection.

    @param name: rank method name whose last update starts the date window
    @param ids_param: optional iterable of (first, last) record id pairs;
        each pair is an inclusive range
    @return: tuple (recids, end_date); end_date is None when the records
        were given explicitly through ids_param
    """
    if not ids_param:
        # No explicit ids: use the window between the last run of `name`
        # and the latest index update
        window_start = get_bibrankmethod_lastupdate(name)
        window_end = fetch_index_update()
        return fetch_records(window_start, window_end), window_end

    selected = intbitset()
    for lower, upper in ids_param:
        selected += range(lower, upper + 1)
    return selected, None
def fetch_concerned_records(name):
    """Return what fetch_records() yields for the pending date window.

    The window starts at the last update of the rank method `name` and
    ends at the date reported by fetch_index_update().
    """
    return fetch_records(get_bibrankmethod_lastupdate(name),
                         fetch_index_update())
def bibreformat_task(fmt, recids, without_fmt, process):
    """Run one BibReformat pass for a single output format.

    @param fmt: output format to use
    @param recids: iterable of record IDs to reformat
    @param without_fmt: collection of records lacking a cache; only used
        here to report how many there are
    @param process: when false, records are selected and counted but not
        formatted
    @return: None
    """
    write_message("Processing format %s" % fmt)

    clock_start = os.times()[4]
    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def with_citing_records(batch, seen):
        """Extend `batch` in place for HDREF and drop what was already seen."""
        if fmt == "HDREF" and batch:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in batch)
            older_than_bibrank = intbitset([
                recid for recid, mod_date in run_sql(query)
                if mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            ])
            for cited in older_than_bibrank:
                batch |= intbitset(get_cited_by(cited))

        # Never hand out a record twice ...
        batch -= seen
        # ... and remember everything that is handed out now
        seen += batch
        return batch

    def iter_in_batches(all_recids):
        """Yield the record ids to format, expanded batch by batch."""
        seen = intbitset()
        batch = intbitset()
        for recid in all_recids:
            if len(batch) == 5000:
                for expanded in with_citing_records(batch, seen):
                    yield expanded
                seen += batch
                batch = intbitset()

            if recid not in seen:
                batch.add(recid)

        if batch:
            for expanded in with_citing_records(batch, seen):
                yield expanded

    recIDs = list(iter_in_batches(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" % len(without_fmt))

    ### Initialize main loop
    total_rec = 0      # Total number of records
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        done, format_time, upload_time = iterate_over_new(recIDs, fmt)
        total_rec += done
        tbibformat += format_time
        tbibupload += upload_time

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    elapsed = os.times()[4] - clock_start

    write_message("total records processed: %d" % total_rec)
    write_message("total processing time: %2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %2f sec" % tbibformat)
    write_message(" bibupload: %2f sec" % tbibupload)
def bibreformat_task(fmt, recids, without_fmt, process):
    """
    BibReformat main task

    Expands the given record IDs (for the "HDREF" format, with the records
    returned by get_cited_by()), formats them with iterate_over_new() and
    writes timing statistics through write_message().

    @param fmt: output format to use
    @param recids: an iterable of record IDs to reformat
    @param without_fmt: records without existing cache; in this function
                        only its truthiness and length are used, for the
                        progress message
    @param process: when false, records are selected and counted but
                    iterate_over_new() is not called
    @return: None
    """
    write_message("Processing format %s" % fmt)

    # Reference point for the "total processing time" statistic
    t1 = os.times()[4]
    # Taken before any work starts; stored as the last-run date at the end
    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        # Mutates both arguments in place: `recids` is extended and then
        # stripped of already processed ids, `recids_processed` grows by
        # whatever is returned.
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                # String comparison of two "YYYY-MM-DD HH:MM:SS" timestamps
                # NOTE(review): assumes get_bibrankmethod_lastupdate()
                # returns a string in that same layout - confirm
                return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

            rel_recids = intbitset([recid for recid, mod_date in run_sql(sql)
                                    if check_date(mod_date)])
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        # Generator: feeds the ids to related_records() in chunks of 5000
        # and yields the resulting ids one by one
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            # Skip ids already yielded through an earlier chunk
            if recid not in recids_processed:
                chunk.add(recid)

        # Flush the last, possibly partial, chunk
        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records
    if without_fmt:
        write_message("Records to be processed: %d" % len(recIDs))
        write_message("Out of it records without existing cache: %d" % len(without_fmt))
    else:
        write_message("Records to be processed: %d" % len(recIDs))

    ### Initialize main loop
    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    # NOTE(review): "%2f" sets a minimum field width of 2 and keeps the
    # default six decimals; "%.2f" may have been intended - confirm
    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Collects the records to reformat from `recids`, the CDS query and the
    SQL queries, formats them and writes timing statistics through
    write_message().

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records); handed to without_fmt() when
                process_format is set
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records
                      to reformat; a dictionary read through the keys
                      'field', 'collection', 'pattern' and 'matching'
    @param process_format: when true, records without cache are looked up
                           and (if process is set) formatted as well
    @param process: when false, records are selected and counted but not
                    formatted
    @param recids: record IDs to reformat; never modified by this function
    @return: None
    """
    t1 = os.times()[4]

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format: # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    # Work on a private copy: the in-place unions below must not leak
    # into the caller's collection
    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=cds_query['collection'],
                                                   p=cds_query['pattern'],
                                                   f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    # An empty selection would produce the invalid SQL "id in ()"
    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        date_query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            # Compare like with like: when the bibrank date is not a
            # datetime it is taken to be a "YYYY-MM-DD HH:MM:SS" string,
            # which a datetime cannot be ordered against directly
            if hasattr(latest_bibrank_run, 'strftime'):
                return mod_date < latest_bibrank_run
            return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

        older_than_bibrank = intbitset([recid for recid, mod_date in run_sql(date_query)
                                        if check_date(mod_date)])
        # Iterate over a fixed snapshot; growing the set that is being
        # iterated makes the outcome depend on the iteration position
        recIDs = intbitset(older_than_bibrank)
        for r in older_than_bibrank:
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        ### List I (option) first, then list II (no_format) when requested
        batches = [recIDs]
        if process_format:
            batches.append(without_format)

        for batch in batches:
            (batch_rec, batch_tbibformat, batch_tbibupload) = iterate(batch, fmt)
            total_rec += batch_rec
            tbibformat += batch_tbibformat
            tbibupload += batch_tbibupload

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Collects the records to reformat from `recids`, the CDS query and the
    SQL queries, formats them, optionally stores the run date and writes
    timing statistics through write_message().

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records); handed to without_fmt() when
                process_format is set
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records
                      to reformat; a dictionary read through the keys
                      'field', 'collection', 'pattern' and 'matching'
    @param process_format: when true, records without cache are looked up
                           and (if process is set) formatted as well
    @param process: when false, records are selected and counted but not
                    formatted
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    # Stored as the last-run date when the "last" task option is set;
    # replaced by the bibrank date below for a non-empty HDREF selection
    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format: # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None, of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        date_query = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            # Compare like with like: when the bibrank date is not a
            # datetime it is taken to be a "YYYY-MM-DD HH:MM:SS" string,
            # which a datetime cannot be ordered against directly
            if hasattr(latest_bibrank_run, 'strftime'):
                return mod_date < latest_bibrank_run
            return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run

        older_than_bibrank = intbitset([recid for recid, mod_date in run_sql(date_query)
                                        if check_date(mod_date)])
        # Iterate over a fixed snapshot; growing the set that is being
        # iterated makes the outcome depend on the iteration position
        recIDs = intbitset(older_than_bibrank)
        for r in older_than_bibrank:
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs) \
                                               + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    if process:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT: # FIXME: remove this
                                            # when migration from php to
                                            # python bibformat is done
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        ### List I (option) first, then list II (no_format) when requested
        batches = [recIDs]
        if process_format:
            batches.append(without_format)

        for batch in batches:
            (batch_rec, batch_tbibformat, batch_tbibupload) = iterate(batch, fmt)
            total_rec += batch_rec
            tbibformat += batch_tbibformat
            tbibupload += batch_tbibupload

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    message = "total records processed: %d" % total_rec
    write_message(message)

    message = "total processing time: %2f sec" % elapsed
    write_message(message)

    message = "Time spent on external call (os.system):"
    write_message(message)

    message = " bibformat: %2f sec" % tbibformat
    write_message(message)

    message = " bibupload: %2f sec" % tbibupload
    write_message(message)
def wait_for_task(task_id): sql = 'SELECT status FROM schTASK WHERE id = %s' while run_sql(sql, [task_id])[0][0] != 'DONE': time.sleep(5) def submit(recids): print 'submitting %s' % str(recids) task_id = task_low_level_submission('bibreformat', 'catchup-doi', '-o', FORMAT, '-P', '5', '-i', ','.join(str(r) for r in recids)) wait_for_task(task_id) max_id = run_sql("SELECT max(id) FROM bibrec")[0][0] latest_bibrank_run = get_bibrankmethod_lastupdate('citation') recids = xrange(1, max_id + 1) to_update = [] for recid in recids: if recid % 50 == 0: print '%s of %s' % (recid, max_id) ret = run_sql('SELECT id FROM bibrec WHERE id = %s', [recid]) if not ret: continue ret = run_sql('SELECT id_bibrec FROM bibfmt WHERE format = %s AND id_bibrec = %s', [FORMAT, recid]) if not ret: