def fill_self_cites_tables(config):
    """Fill the self-cites tables with data.

    The purpose of this function is to fill these tables on a website
    that never ran the self-cites daemon.
    """
    algorithm = config['algorithm']
    tags = get_authors_tags()
    all_ids = [r[0] for r in run_sql('SELECT id FROM bibrec ORDER BY id')]
    citations_fun = get_citations_fun(algorithm)
    write_message('using %s' % citations_fun.__name__)
    if algorithm == 'friends':
        # We only need this table for the friends algorithm or assimilated
        # Fill intermediary tables
        for index, recid in enumerate(all_ids):
            if index % 1000 == 0:
                msg = 'intermediate %d/%d' % (index, len(all_ids))
                task_update_progress(msg)
                write_message(msg)
                task_sleep_now_if_required()
            update_self_cites_tables(recid, config, tags)
    # Fill self-cites table
    for index, recid in enumerate(all_ids):
        if index % 1000 == 0:
            msg = 'final %d/%d' % (index, len(all_ids))
            task_update_progress(msg)
            write_message(msg)
            task_sleep_now_if_required()
        compute_and_store_self_citations(recid, tags, citations_fun)
def bst_fibonacci(n=30):
    """
    Small tasklet that prints the Fibonacci sequence for n.

    @param n: how many Fibonacci numbers to print.
    @type n: int
    """
    ## Since it's a tasklet, the parameter might be passed as a string.
    ## It should then be converted to an int.
    n = int(n)
    write_message("Printing %d Fibonacci numbers." % n, verbose=9)
    for i in range(0, n):
        if i > 0 and i % 4 == 0:
            write_message("Error: water in the CPU. Ignoring and continuing.",
                          sys.stderr, verbose=3)
        elif i > 0 and i % 5 == 0:
            write_message("Error: floppy drive dropped on the floor. "
                          "Ignoring and continuing.", sys.stderr)
        write_message("fib(%d)=%d" % (i, fib(i)))
        task_update_progress("Done %d out of %d." % (i, n))
        task_sleep_now_if_required(can_stop_too=True)
        time.sleep(1)
    task_update_progress("Done %d out of %d." % (n, n))
    return 1
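# The tasklet above assumes a fib() helper in scope; it is not shown in the
# snippet. A minimal iterative sketch (an assumption, with fib(0) = 0):
def fib(n):
    """Return the n-th Fibonacci number."""
    a, b = 0, 1
    for _ in range(n):
        a, b = b, a + b
    return a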
def match_remote_ids(remote_ids):
    """Match remote IDs to local records; IDs that cannot be matched
    are returned as a list."""
    per_last = -1

    def percent_update(index, percent_last):
        """Calculate completion percentage and update task progress."""
        per = 100 * float(index) / float(len(remote_ids))
        if per > (percent_last + 0.5):
            percent_last = per
            task_update_progress("Local matching %.1f%% (%d/%d)"
                                 % (per, index, len(remote_ids)))
        return percent_last

    missing = []
    for i, recid in enumerate(remote_ids):
        task_sleep_now_if_required(can_stop_too=True)
        per_last = percent_update(i, per_last)
        term = "035__9:%s and 035__a:%d" % (REMOTE_INSTANCE, recid)
        result = perform_request_search(p=term)
        if not result:
            missing.append(recid)
    _print("Of %d record IDs, %d were matched, %d are missing"
           % (len(remote_ids), len(remote_ids) - len(missing), len(missing)))
    return missing
def task_run_core():
    """Run the indexing task. The row argument is the BibSched task
    queue row, containing id, arguments, etc.
    Return 1 in case of success and 0 in case of failure.
    """
    if not task_get_option("run"):
        task_set_option("run",
                        [name[0] for name in run_sql("SELECT name from rnkMETHOD")])
    for key in task_get_option("run"):
        task_sleep_now_if_required(can_stop_too=True)
        write_message("")
        filename = CFG_ETCDIR + "/bibrank/" + key + ".cfg"
        write_message("Getting configuration from file: %s" % filename,
                      verbose=9)
        config = ConfigParser.ConfigParser()
        try:
            config.readfp(open(filename))
        except StandardError:
            write_message("Cannot find configuration file: %s. "
                          "The rankmethod may also not be registered using "
                          "the BibRank Admin Interface." % filename, sys.stderr)
            raise
        # Use the function variable to call the function related to the
        # rank method
        cfg_function = config.get("rank_method", "function")
        func_object = globals().get(cfg_function)
        if func_object:
            func_object(key)
        else:
            write_message("Cannot run method '%s', no function to call" % key)
    return True
def iterate_over_new(list, fmt):
    """
    Iterate over list of IDs

    @param list: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records,
                    time taken to format,
                    time taken to insert)
    """
    global total_rec

    formatted_records = ''  # (string-)List of formatted records of an iteration
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    start_date = task_get_task_param('task_starting_time')  # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        run_sql('REPLACE LOW_PRIORITY INTO bibfmt (id_bibrec, format, last_updated, value) VALUES (%s, %s, %s, %s)',
                (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def solr_commit_if_necessary(next_commit_counter, final_commit=False, recid=None):
    # Counter full or final commit if counter set
    if next_commit_counter == task_get_option("flush") - 1 or \
            (final_commit and next_commit_counter > 0):
        recid_info = ''
        if recid:
            recid_info = ' for recid=%s' % recid
        status_msg = 'Solr ranking indexer COMMITTING' + recid_info
        write_message(status_msg)
        task_update_progress(status_msg)
        try:
            # Commits might cause an exception, most likely a
            # timeout while hitting a background merge
            # Changes will then be committed later by the
            # calling (periodical) task
            # Also, autocommits can be used in the solrconfig
            SOLR_CONNECTION.commit()
        except:
            register_exception(alert_admin=True)
        next_commit_counter = 0
        task_sleep_now_if_required(can_stop_too=True)
    else:
        next_commit_counter = next_commit_counter + 1
    return next_commit_counter
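# A minimal sketch of how the counter above is typically threaded through an
# indexing loop; solr_add_record() and the recids iterable are assumptions,
# not part of the original snippet:
next_commit_counter = 0
for recid in recids:                      # hypothetical iterable of record IDs
    solr_add_record(recid)                # hypothetical per-record add
    next_commit_counter = solr_commit_if_necessary(next_commit_counter,
                                                   recid=recid)
# flush whatever is left once the loop ends
solr_commit_if_necessary(next_commit_counter, final_commit=True)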
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files, converts them to
    MARC XML format and stores them in the same directory with the
    name "upload.xml"."""
    if exists(folder):
        for subfolder in listdir(folder):
            subfolder = join(folder, subfolder).lstrip()
            if isfile(subfolder):
                if not subfolder.endswith('upload.xml'):
                    folders = subfolder.split('/')
                    folders[-1] = 'upload.xml'
                    file_loc = "/".join(folders)
                    if not exists(file_loc):
                        xmlFile = open(subfolder, "r")
                        xmlString = xmlFile.read()
                        xmlFile.close()
                        dom_xml = xml.dom.minidom.parseString(xmlString)
                        doi = els.get_publication_information(dom_xml)[-1]
                        write_message("DOI in record: %s" % (doi,))
                        res = perform_request_search(p="doi:%s" % (doi,), of="id")
                        if not res:
                            write_message("DOI not found")
                            doctype = els.get_doctype(dom_xml).lower()
                            # ignore index pages
                            if doctype in INTERESTING_DOCTYPES:
                                marcfile = open(file_loc, 'w')
                                marcfile.write(els.get_record(subfolder))
                                marcfile.close()
                                new_files.append(file_loc)
                                task_sleep_now_if_required(can_stop_too=False)
                        else:
                            write_message("DOI found: %s" % (res,))
            else:
                fetch_xml_files(subfolder, els, new_files)
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS."""
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for i, recid in enumerate(modified_records):
        with run_ro_on_slave_db():
            record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            print >> r, record
        if shall_sleep(recid, i, tot, time_estimator):
            r.flush()
            task_sleep_now_if_required()
    print >> r, '</collection>'
    r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    prodsync_tar.add(prodsyncname)
    prodsync_tar.close()
    os.remove(prodsyncname)
def generate_sitemaps(sitemap_index_writer, records, output_directory, sitemap_name):
    """
    Generate sitemaps themselves.

    @param sitemap_index_writer: the instance of SitemapIndexWriter that will refer to these sitemaps
    @param records: the list of (recid, modification_date) tuples to process
    @param output_directory: directory where to store the sitemaps
    @param sitemap_name: the name (prefix) of the sitemap file(s)
    """
    sitemap_id = 1
    writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
    sitemap_index_writer.add_url(writer.get_sitemap_url())
    nb_urls = 0
    write_message("... Getting sitemap '%s'..." % sitemap_name)
    write_message("... Generating urls for %s records..." % len(records))
    task_sleep_now_if_required(can_stop_too=True)
    for i, (recid, lastmod) in enumerate(records):
        if nb_urls % 100 == 0 and \
                (writer.get_size() >= MAX_SIZE or nb_urls >= MAX_RECORDS):
            sitemap_id += 1
            writer = SitemapWriter(sitemap_id, output_directory, sitemap_name)
            sitemap_index_writer.add_url(writer.get_sitemap_url())
        nb_urls = writer.add_url(CFG_SITE_URL + '/%s/%s' % (CFG_SITE_RECORD, recid),
                                 lastmod=lastmod,
                                 changefreq=DEFAULT_CHANGEFREQ_RECORDS,
                                 priority=DEFAULT_PRIORITY_RECORDS)
        if i % 100 == 0:
            task_update_progress("Google Scholar sitemap '%s' for recid %s/%s"
                                 % (sitemap_name, i + 1, len(records)))
            task_sleep_now_if_required(can_stop_too=True)
def fetch_xml_files(folder, els, new_files):
    """Recursively gets the downloaded xml files, converts them to
    MARC XML format and stores them in the same directory with the
    name "upload.xml"."""
    for path, folders, files in walk(folder):
        for fl in files:
            if fl != "upload.xml":
                file_loc = join(path, "upload.xml")
                if not exists(file_loc):
                    record_path = join(path, fl)
                    dom_xml = parse(record_path)
                    doi = els.get_publication_information(dom_xml)[-1]
                    res = None
                    if doi:
                        write_message("DOI in record: %s" % (doi,))
                        res = perform_request_search(p="doi:%s" % (doi,), of="id")
                    if not res:
                        write_message("DOI not found in record: \n%s"
                                      % (join(path, fl),))
                        doctype = els.get_doctype(dom_xml).lower()
                        # ignore index pages
                        if doctype in INTERESTING_DOCTYPES:
                            marcfile = open(file_loc, "w")
                            marcfile.write(els.get_record(record_path))
                            marcfile.close()
                            new_files.append(file_loc)
                            task_sleep_now_if_required(can_stop_too=False)
                    else:
                        write_message("DOI found: %s" % (res,))
def iterate_over_new(recIDs, fmt):
    """Iterate over list of IDs.

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records,
                    time taken to format,
                    time taken to insert)
    """
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call

    tot = len(recIDs)
    reformat_function = _CFG_BIBFORMAT_UPDATE_FORMAT_FUNCTIONS.get(
        fmt.lower(), _update_format)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        reformat_function(recID, fmt)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)

    if tot % 100 != 0:
        write_message(" ... formatted %s records out of %s" % (tot, tot))

    return tot, tbibformat, tbibupload
def iterate_over_new(list, fmt):
    "Iterate over list of IDs"
    global total_rec

    formatted_records = ''  # (string-)List of formatted records of an iteration
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    start_date = task_get_task_param('task_starting_time')  # Time at which the record was formatted

    tot = len(list)
    count = 0
    for recID in list:
        t1 = os.times()[4]
        start_date = time.strftime('%Y-%m-%d %H:%M:%S')
        formatted_record = zlib.compress(format_record(recID, fmt, on_the_fly=True))
        if run_sql('SELECT id FROM bibfmt WHERE id_bibrec=%s AND format=%s',
                   (recID, fmt)):
            run_sql('UPDATE bibfmt SET last_updated=%s, value=%s WHERE id_bibrec=%s AND format=%s',
                    (start_date, formatted_record, recID, fmt))
        else:
            run_sql('INSERT INTO bibfmt(id_bibrec, format, last_updated, value) VALUES(%s, %s, %s, %s)',
                    (recID, fmt, start_date, formatted_record))
        t2 = os.times()[4]
        tbibformat += (t2 - t1)
        count += 1
        if (count % 100) == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if (tot % 100) != 0:
        write_message(" ... formatted %s records out of %s" % (count, tot))
    return (tot, tbibformat, tbibupload)
def iterate_over_new(recIDs, fmt):
    """
    Iterate over list of IDs

    @param recIDs: the list of record IDs to format
    @param fmt: the output format to use
    @return: tuple (total number of records,
                    time taken to format,
                    time taken to insert)
    """
    tbibformat = 0  # time taken up by external call
    tbibupload = 0  # time taken up by external call
    tot = len(recIDs)
    for count, recID in enumerate(recIDs):
        t1 = os.times()[4]
        formatted_record, needs_2nd_pass = format_record_1st_pass(recID=recID,
                                                                  of=fmt,
                                                                  on_the_fly=True,
                                                                  save_missing=False)
        save_preformatted_record(recID=recID,
                                 of=fmt,
                                 res=formatted_record,
                                 needs_2nd_pass=needs_2nd_pass,
                                 low_priority=True)
        t2 = os.times()[4]
        tbibformat += t2 - t1
        if count % 100 == 0:
            write_message(" ... formatted %s records out of %s" % (count, tot))
            task_update_progress('Formatted %s out of %s' % (count, tot))
            task_sleep_now_if_required(can_stop_too=True)
    if tot % 100 != 0:
        write_message(" ... formatted %s records out of %s" % (tot, tot))
    return tot, tbibformat, tbibupload
def convert_files(xml_files, els, prefix="", threshold_date=None):
    """Convert the list of publisher XML to MARCXML using given instance."""
    results = {}
    for xml_file in xml_files:
        task_sleep_now_if_required()
        full_xml_filepath = join(prefix, xml_file)
        dom_xml = parse(full_xml_filepath)
        date = els.get_publication_information(dom_xml)[-2]
        if threshold_date and date < threshold_date:
            continue
        doctype = els.get_doctype(dom_xml).lower()
        if doctype in INTERESTING_DOCTYPES:
            new_full_xml_filepath = join(dirname(full_xml_filepath),
                                         "upload.xml")
            try:
                converted_xml = els.get_record(
                    full_xml_filepath, refextract_callback=refextract)
            except Exception as e:
                _errors_detected.append(e)
                error_trace = traceback.format_exc()
                # Some error happened, let's gracefully quit
                results[full_xml_filepath] = (StatusCodes.CONVERSION_ERROR,
                                              error_trace)
                write_message('Error converting:'
                              ' \n {0}'.format(error_trace))
                continue
            with open(new_full_xml_filepath, "w") as marcfile:
                marcfile.write(converted_xml)
            results[full_xml_filepath] = (StatusCodes.OK,
                                          new_full_xml_filepath)
        else:
            results[full_xml_filepath] = (StatusCodes.DOCTYPE_WRONG,
                                          doctype)
            write_message("Doctype not interesting: {0}".format(doctype))
    return results
def process_affiliations(record_ids=None, all_records=False):
    name = 'affiliations'

    if all_records:
        records = intbitset(run_sql("SELECT id FROM bibrec"))
        start_time = datetime.now()
    elif record_ids:
        records = intbitset(record_ids)
        start_time = None
    else:
        dummy_last_recid, last_updated = fetch_last_updated(name)
        start_time = datetime.now()
        sql = """SELECT `id` FROM `bibrec`
                 WHERE `modification_date` >= %s
                 AND `modification_date` <= %s
                 ORDER BY `modification_date`"""
        records = intbitset(run_sql(sql, [last_updated.isoformat(),
                                          start_time.isoformat()]))

    records_iter = iter(records)
    processed_records_count = 0
    while True:
        task_sleep_now_if_required()
        chunk = list(islice(records_iter, CHUNK_SIZE))
        if not chunk:
            break
        process_and_store(chunk)
        processed_records_count += len(chunk)
        task_update_progress('processed %s out of %s records'
                             % (processed_records_count, len(records)))
    if start_time:
        store_last_updated(None, start_time, name)
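# The islice()-based loop above is a general chunking idiom; a standalone
# sketch of the same pattern (this helper is illustrative, not part of the
# original):
from itertools import islice

def chunked(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            break
        yield chunk

# e.g. list(chunked(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]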
def process_record_batch(batch):
    """Split the remote matching job into parts; this function does the
    matching of remote records to local IDs."""
    _print("Processing batch, recid #%d to #%d" % (batch[0], batch[-1]), 4)
    # Local ID: Remote ID
    appends = {}
    problems = []
    for recid in batch:
        task_sleep_now_if_required(can_stop_too=True)
        # Here we are taking a rest
        time.sleep(0.5)
        _print("Processing recid %d" % recid, 9)
        record = get_remote_record(recid)
        if record is None:
            _print("Error: Could not fetch remote record %s" % (str(recid),), 5)
            continue
        else:
            local_id = extract_035_id(record)
            if not local_record_exists(local_id):
                _print("Local record does not exist", 5)
                problems.append(recid)
                continue
            else:
                _print("Matching remote id %d to local record %s"
                       % (recid, local_id), 5)
                appends[local_id] = recid
    _print("Batch matching done: %d IDs matched, %d IDs not matched"
           % (len(appends), len(problems)), 4)
    return appends, problems
def solr_add_range(lower_recid, upper_recid):
    """
    Adds the regarding field values of all records from the lower recid
    to the upper one to Solr. It preserves the fulltext information.
    """
    for recid in range(lower_recid, upper_recid + 1):
        if record_exists(recid):
            try:
                abstract = unicode(remove_control_characters(
                    get_fieldvalues(recid, CFG_MARC_ABSTRACT)[0]), 'utf-8')
            except:
                abstract = ""
            try:
                first_author = remove_control_characters(
                    get_fieldvalues(recid, CFG_MARC_AUTHOR_NAME)[0])
                additional_authors = remove_control_characters(
                    reduce(lambda x, y: x + " " + y,
                           get_fieldvalues(recid, CFG_MARC_ADDITIONAL_AUTHOR_NAME),
                           ''))
                author = unicode(first_author + " " + additional_authors, 'utf-8')
            except:
                author = ""
            try:
                bibrecdocs = BibRecDocs(recid)
                fulltext = unicode(remove_control_characters(
                    bibrecdocs.get_text()), 'utf-8')
            except:
                fulltext = ""
            try:
                keyword = unicode(remove_control_characters(
                    reduce(lambda x, y: x + " " + y,
                           get_fieldvalues(recid, CFG_MARC_KEYWORD),
                           '')), 'utf-8')
            except:
                keyword = ""
            try:
                title = unicode(remove_control_characters(
                    get_fieldvalues(recid, CFG_MARC_TITLE)[0]), 'utf-8')
            except:
                title = ""
            solr_add(recid, abstract, author, fulltext, keyword, title)

    SOLR_CONNECTION.commit()
    task_sleep_now_if_required(can_stop_too=True)
def afs_sync(modified_records, time_estimator, tot, now):
    """Sync to AFS."""
    write_message("Appending output to %s" % CFG_OUTPUT_PATH)
    prodsyncname = CFG_OUTPUT_PATH + now.strftime("%Y%m%d%H%M%S") + '.xml.gz'
    r = gzip.open(prodsyncname, "w")
    print >> r, '<collection xmlns="http://www.loc.gov/MARC21/slim">'
    for i, recid in enumerate(modified_records):
        record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            print >> r, record
        if shall_sleep(recid, i, tot, time_estimator):
            r.flush()
            task_sleep_now_if_required()
    print >> r, '</collection>'
    r.close()
    prodsync_tarname = CFG_OUTPUT_PATH + '.tar'
    write_message("Adding %s to %s" % (prodsyncname, prodsync_tarname))
    prodsync_tar = tarfile.open(prodsync_tarname, 'a')
    prodsync_tar.add(prodsyncname)
    prodsync_tar.close()
    os.remove(prodsyncname)
def match_missing_ids(remote_ids, batch_size):
    """
    For ID pairings that are missing, this function splits the missing
    IDs into batches. The records are pulled from remote, the 035 field
    read and then the remote ID appended to the local record.

    Parameters:
     remote_ids - a list of missing remote rec-ids
     batch_size - How many records to match at a time
    Returns:
     count_appends - number of records being appended
     count_problems - number of records which could not be matched at all
    """
    count_appends = 0
    count_problems = 0

    batches = [remote_ids[x:x + batch_size] for x in
               xrange(0, len(remote_ids), batch_size)]
    _print("Identified %d records which need their remote IDs updated."
           % len(remote_ids))
    _print("Processing %d batches of size %d" % (len(batches), batch_size))
    for i, batch in enumerate(batches, 1):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Batch %d of %d" % (i, len(batches)))
        _print("Batch %d of %d" % (i, len(batches)))
        try:
            appends, problems = process_record_batch(batch)
            count_appends += len(appends)
            count_problems += len(problems)
            write_to_file('missing_ids.txt', problems, append=True)
            _print("Submitting batch #%d to BibUpload for appending..." % i, 4)
            start_bibupload_job(appends)
        except StandardError, e:
            _print("Error occurred during match of batch %d: %s\n%s"
                   % (i, e, traceback.format_exc()), 2)
    return count_appends, count_problems
def bst_inspire_authority_ids_synchronizer(
        url=SYNC_URL_INSPIRE_RECORDS_SRC, tmp_dir=SYNC_LOCAL_TMP_DIR):
    """Synchronize INSPIRE authority ids.

    :param string url: valid URL to the gzip (.gz) file
    :param string tmp_dir: existing directory path for temporary files
    """
    xml_content = get_inspire_dump(
        url, os.path.join(tmp_dir, SYNC_LOCAL_INSPIRE_RECORDS_FILE_NAME))

    task_sleep_now_if_required()

    authority_ids = parse_inspire_xml(xml_content)

    task_sleep_now_if_required()

    if authority_ids:
        record_ids = get_record_ids()
        write_message(
            "Info: {0} record ids have been requested".format(len(record_ids)))
        if record_ids:
            synchronize(
                record_ids,
                authority_ids,
                os.path.join(tmp_dir, SYNC_LOCAL_CDS_RECORDS_UPDATES_FILE_NAME))
def compute_cache(pids):
    bibtask.write_message("WebAuthorProfile: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    for i, p in enumerate(pids):
        bibtask.write_message("WebAuthorProfile: doing %s out of %s"
                              % (i + 1, len(pids)))
        bibtask.task_update_progress("WebAuthorProfile: doing %s out of %s"
                                     % (i + 1, len(pids)))
        _compute_cache_for_person(p)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def bst_autoclaim():
    orcid_personid_map = get_orcid_personid_map()
    papers = get_papers_with_orcid()
    for i, recid in enumerate(papers):
        autoclaim_paper(recid, orcid_personid_map)
        if i % 10 == 0:
            task_update_progress("Done %s out of %s records (%s%%)"
                                 % (i, len(papers), 100 * i / len(papers)))
            task_sleep_now_if_required(can_stop_too=True)
def task_run_core():
    """
    Walks through all directories where metadata files are located
    and uploads them. Files are then moved to the corresponding
    DONE folders.
    """
    daemon_dir = CFG_BATCHUPLOADER_DAEMON_DIR[0] == '/' and CFG_BATCHUPLOADER_DAEMON_DIR \
                 or CFG_PREFIX + '/' + CFG_BATCHUPLOADER_DAEMON_DIR
    # Check if directory /batchupload exists
    if not task_get_option('documents'):
        # Metadata upload
        parent_dir = daemon_dir + "/metadata/"
        progress = 0
        try:
            os.makedirs(parent_dir)
        except OSError:
            pass
        list_of_folders = ["insert", "append", "correct", "replace", "holdingpen"]
        for folder in list_of_folders:
            files_dir = os.path.join(parent_dir, folder)
            files_done_dir = os.path.join(files_dir, "DONE")
            try:
                files = os.listdir(files_dir)
            except OSError, e:
                os.mkdir(files_dir)
                files = []
                write_message(e, sys.stderr)
                write_message("Created new folder %s" % (files_dir,))
            # Create directory DONE/ if it doesn't exist
            try:
                os.mkdir(files_done_dir)
            except OSError:
                # Directory exists
                pass
            for metafile in files:
                if os.path.isfile(os.path.join(files_dir, metafile)):
                    # Create temporary file to be uploaded
                    (fd, filename) = tempfile.mkstemp(
                        prefix=metafile + "_"
                        + time.strftime("%Y%m%d%H%M%S", time.localtime()) + "_",
                        dir=CFG_TMPSHAREDDIR)
                    shutil.copy(os.path.join(files_dir, metafile), filename)
                    # Send bibsched task
                    mode = "--" + folder
                    jobid = str(task_low_level_submission('bibupload',
                                                          'batchupload',
                                                          mode, filename))
                    # Move file to DONE folder
                    filename = metafile + "_" \
                        + time.strftime("%Y%m%d%H%M%S", time.localtime()) \
                        + "_" + jobid
                    os.rename(os.path.join(files_dir, metafile),
                              os.path.join(files_done_dir, filename))
                    task_sleep_now_if_required(can_stop_too=True)
            progress += 1
            task_update_progress("Done %d out of %d."
                                 % (progress, len(list_of_folders)))
def bst_synchronize_recids(search_terms=SEARCH_TERMS, log_dir=None,
                           collection=COLLECTION, batch_size=BATCH_SIZE,
                           debug=False, remote_ids=None):
    """Synchronize record IDs between the CERN Document Server (CDS)
    and Inspire

    This BibTasklet is intended to be a general purpose replacement for
    'bst_inspire_cds_synchro' and 'bst_update_cds_inspire_id', it should
    be executable on both CDS and Inspire.

    Generally there should be no need to modify these parameters, the
    script uses CFG_INSPIRE_SITE and CFG_CERN_SITE from invenio.conf to
    determine what type of Invenio instance we're running on. These
    parameters will be set by default to the correct values to
    synchronise all IDs, though you may want to limit records manually.

    Parameters:
     search_terms - The term to use to get record IDs
                    (Default "035:<LOCAL>")
     log_dir - The directory to store the log file in
               (Defaults to CFG_TMPSHAREDDIR)
     collection - What collection to take from (Default is no collection)
     batch_size - How many records to try and amend at once (Default 200)
     debug - If True, this script will run against the TEST instances
             (Default False)
     remote_ids - Comma separated values of remote IDs; if this is
                  specified, remote IDs will not be searched for.
    """
    configure_globals(search_terms, log_dir, debug)
    _print("All messages will be logged to %s/%s" % (LOG_DIR, LOG_FILE))

    if not remote_ids:
        task_update_progress("Finding remote records on %s with %s IDs"
                             % (REMOTE_INSTANCE, LOCAL_INSTANCE))
        remote_ids = get_remote_ids(search_terms, collection)
    else:
        remote_ids = [int(rid) for rid in remote_ids.split(',')]

    task_sleep_now_if_required(can_stop_too=True)

    task_update_progress("Matching remote IDs to local records")
    missing_ids = match_remote_ids(remote_ids)
    count_appends, count_problems = match_missing_ids(missing_ids, batch_size)

    _print("======================== FINAL SCORE ========================", 1)
    _print(" Records matched: %d" % (len(remote_ids) - len(missing_ids)), 1)
    _print(" Records appended: %d" % count_appends, 1)
    _print(" IDs not matched (broken link!): %d" % count_problems, 1)
    _print("=============================================================", 1)
    _print("Finishing, messages logged to: %s/%s" % (LOG_DIR, LOG_FILE))

    return True
def task_run_core():
    """
    Main daemon task.

    Returns True when run successfully. False otherwise.
    """
    # Dictionary of "plugin_name" -> func
    tickets_to_apply = task_get_option('tickets')
    write_message("Ticket plugins found: %s" % (str(tickets_to_apply),),
                  verbose=9)

    task_update_progress("Loading records")
    records_concerned = get_recids_to_load()
    write_message("%i record(s) found" % (len(records_concerned),))

    records_processed = 0
    for record, last_date in load_records_from_id(records_concerned):
        records_processed += 1
        recid = record_id_from_record(record)
        task_update_progress("Processing records %s/%s (%i%%)"
                             % (records_processed, len(records_concerned),
                                int(float(records_processed) /
                                    len(records_concerned) * 100)))
        task_sleep_now_if_required(can_stop_too=True)
        for ticket_name, plugin in tickets_to_apply.items():
            if plugin:
                write_message("Running template %s for %s"
                              % (ticket_name, recid), verbose=5)
                try:
                    ticket = BibCatalogTicket(recid=int(recid))
                    if plugin['check_record'](ticket, record):
                        ticket = plugin['generate_ticket'](ticket, record)
                        write_message("Ticket to be generated: %s"
                                      % (ticket,), verbose=5)
                        res = ticket.submit()
                        if res:
                            write_message("Ticket #%s created for %s"
                                          % (ticket.ticketid, recid))
                        else:
                            write_message("Ticket already exists for %s"
                                          % (recid,))
                    else:
                        write_message("Skipping record %s" % (recid,))
                except Exception, e:
                    write_message("Error submitting ticket for record %s:"
                                  % (recid,))
                    write_message(traceback.format_exc())
                    raise e
            else:
                raise BibCatalogPluginException("Plugin not valid in %s"
                                                % (ticket_name,))

        if last_date:
            store_last_updated(recid, last_date, name="bibcatalog")
def step(msg_prefix, recid, done, total):
    if done % 30 == 0:
        task_sleep_now_if_required()

    if done % 1000 == 0:
        mesg = "%s done %s of %s" % (msg_prefix, done, total)
        write_message(mesg)
        task_update_progress(mesg)

    write_message("Processing: %s" % recid, verbose=9)
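# A minimal sketch of driving the step() helper from a processing loop; the
# record provider and process_one() below are assumptions, not from the
# original:
records = get_recids_to_process()        # hypothetical provider of record IDs
total = len(records)
for done, recid in enumerate(records, 1):
    step("Citation update", recid, done, total)
    process_one(recid)                   # hypothetical per-record work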
def task_run_core():
    """ run daemon """
    if task_get_option("update-borrowers"):
        list_of_borrowers = db.get_all_borrowers()
        total_borrowers = len(list_of_borrowers)

        done = 0
        for borrower in list_of_borrowers:
            user_id = borrower[0]
            update_user_info_from_ldap(user_id)
            done += 1
            task_update_progress("Done %d out of %d." % (done, total_borrowers))
            task_sleep_now_if_required(can_stop_too=True)

    if task_get_option("overdue-letters"):
        expired_loans = db.get_all_expired_loans()
        total_expired_loans = len(expired_loans)

        done = 0
        for (borrower_id, _bor_name, recid, _barcode, _loaned_on,
             _due_date, _number_of_renewals, number_of_letters,
             date_letters, _notes, loan_id) in expired_loans:

            number_of_letters = int(number_of_letters)

            content = ''
            if number_of_letters == 0:
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL1'], loan_id)
            elif number_of_letters == 1 and must_send_second_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL2'], loan_id)
            elif number_of_letters == 2 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)
            elif number_of_letters >= 3 and must_send_third_recall(date_letters):
                content = generate_email_body(CFG_BIBCIRCULATION_TEMPLATES['RECALL3'], loan_id)

            if content != '':
                title = book_title_from_MARC(recid)
                subject = "LOAN RECALL: " + title
                update_expired_loan(loan_id)
                send_overdue_letter(borrower_id, subject, content)

            done += 1
            task_update_progress("Done %d out of %d." % (done, total_expired_loans))
            task_sleep_now_if_required(can_stop_too=True)
            time.sleep(1)

    return 1
def download_feed(feed_url, batch_size, delete_zip, new_sources,
                  directory, feed_location):
    """ Get list of entries from XML document """
    try:
        task_update_progress("Downloading and extracting files 1/2...")
        result_path = download_url(url=feed_url,
                                   content_type="xml",
                                   download_to_file=feed_location,
                                   retry_count=5,
                                   timeout=60.0)
    except InvenioFileDownloadError as err:
        _errors_detected.append(err)
        write_message("URL could not be opened: %s" % (feed_url,))
        write_message(str(err))
        write_message(traceback.format_exc()[:-1])
        task_update_status("CERROR")
        return
    xml_files = []
    entries = parse_feed(result_path)
    for fileUrl, fileName in entries:
        task_sleep_now_if_required()
        # Output location is directory + filename
        outFilename = join(directory, fileName)
        outFilename = outFilename.lstrip()

        # Check if file has already been fetched
        existing_files = list(locate(fileName, root=CFG_CONSYN_OUT_DIRECTORY))

        if len(existing_files) == 1:
            write_message("Not downloading %s, already found %s in %s\n"
                          % (fileUrl, existing_files[0], outFilename))
        else:
            fileUrl = fileUrl.replace(' ', '%20')
            try:
                write_message("Downloading %s to %s\n" % (fileUrl, outFilename))
                download_url(fileUrl, "zip", outFilename, 5, 60.0)
                new_sources.append(outFilename)
            except InvenioFileDownloadError as err:
                _errors_detected.append(err)
                write_message("URL could not be opened: %s" % fileUrl)
                write_message(str(err))
                write_message(traceback.format_exc()[:-1])
                task_update_status("CERROR")
                continue
            try:
                xml_files.extend(extractAll(outFilename, delete_zip, directory))
            except BadZipfile as err:
                _errors_detected.append(err)
                write_message("Error BadZipfile %s" % (outFilename,))
                task_update_status("CERROR")
                remove(outFilename)
    return xml_files
def bst_dump_records():
    try:
        os.makedirs(os.path.join(CFG_WEBDIR, 'dumps'))
    except OSError:
        pass
    html_index = open(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
                      "w")
    print >> html_index, "<html><head><title>INSPIRE Dump</title></head><body><ul>"
    for collection in CFG_EXPORTED_COLLECTIONS:
        task_update_progress(collection)
        print >> html_index, """
<li><a href="%(prefix)s/dumps/%(collection)s-records.xml.gz">%(collection)s</a>
(<a href="%(prefix)s/dumps/%(collection)s-records.xml.gz.md5">MD5</a>): %(date)s</li>""" % {
            'prefix': CFG_SITE_URL,
            'collection': collection,
            'date': time.ctime()}
        write_message("Preparing %s-records.xml.gz" % collection)
        output_path = os.path.join(CFG_WEBDIR, 'dumps',
                                   '.%s-records.xml.gz' % collection)
        output = gzip.open(output_path, "w")
        print >> output, "<collection>"
        reclist = get_collection_reclist(collection)
        tot = len(reclist)
        time_estimator = get_time_estimator(tot)
        for i, recid in enumerate(reclist):
            with run_ro_on_slave_db():
                print >> output, format_record(recid, 'xme', user_info={})[0]
            time_estimation = time_estimator()[1]
            if (i + 1) % 100 == 0:
                task_update_progress(
                    "%s %s (%s%%) -> %s" % (collection, recid,
                                            (i + 1) * 100 / tot,
                                            time.strftime("%Y-%m-%d %H:%M:%S",
                                                          time.localtime(time_estimation))))
                task_sleep_now_if_required()
        print >> output, "</collection>"
        output.close()
        write_message("Computing checksum")
        print >> open(output_path + '.md5', "w"), calculate_md5(output_path)
        os.rename(output_path,
                  os.path.join(CFG_WEBDIR, 'dumps',
                               '%s-records.xml.gz' % collection))
        os.rename(output_path + '.md5',
                  os.path.join(CFG_WEBDIR, 'dumps',
                               '%s-records.xml.gz.md5' % collection))
        write_message("DONE")
    print >> html_index, "</ul></body></html>"
    html_index.close()
    os.rename(os.path.join(CFG_WEBDIR, 'dumps', '.inspire-dump.html'),
              os.path.join(CFG_WEBDIR, 'dumps', 'inspire-dump.html'))
def process_records(name, records, func, extra_vars):
    count = 1
    total = len(records)
    for recid, date in records:
        task_sleep_now_if_required(can_stop_too=True)
        msg = "Extracting for %s (%d/%d)" % (recid, count, total)
        task_update_progress(msg)
        write_message(msg)
        func(recid, **extra_vars)
        if date:
            store_last_updated(recid, date, name)
        count += 1
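# A hedged usage sketch for process_records(); the callback and the
# (recid, date) pairs below are assumptions, not from the original:
def extract_one(recid, flavor=None):
    # hypothetical per-record extraction callback
    pass

records = [(1, '2013-01-01'), (2, None)]  # hypothetical (recid, date) pairs
process_records('refextract', records, extract_one, {'flavor': 'default'})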
def compute_cache_mp(pids):
    from multiprocessing import Pool
    p = Pool()
    bibtask.write_message("WebAuthorProfileMP: %s persons to go" % len(pids),
                          stream=sys.stdout, verbose=0)
    sl = 100
    ss = [pids[i: i + sl] for i in range(0, len(pids), sl)]
    for i, bunch in enumerate(ss):
        bibtask.write_message("WebAuthorProfileMP: doing bunch %s out of %s"
                              % (str(i + 1), len(ss)))
        bibtask.task_update_progress("WebAuthorProfileMP: doing bunch %s out of %s"
                                     % (str(i + 1), len(ss)))
        p.map(_compute_cache_for_person, bunch)
        bibtask.task_sleep_now_if_required(can_stop_too=True)
def bst_inspire_cds_synchro():
    task_update_progress("Phase 1: extracting IDs for %s" % CFG_OTHER_SITE)
    export_file = open(CFG_EXPORT_FILE + '.part', "w")
    for i, row in enumerate(iter_export_rows()):
        print >> export_file, row
        if i % 100 == 0:
            task_sleep_now_if_required(can_stop_too=True)
    export_file.close()
    shutil.move(CFG_EXPORT_FILE + '.part', CFG_EXPORT_FILE)
    task_sleep_now_if_required(can_stop_too=True)
    if os.path.exists(CFG_IMPORT_FILE):
        task_update_progress("Phase 2: importing IDs from %s" % CFG_OTHER_SITE)
        import_recid_list(open(CFG_IMPORT_FILE))
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis."""
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for i, recid in enumerate(modified_records):
        record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            r.rpush('legacy_records', zlib.compress(record))
        if shall_sleep(recid, i, tot, time_estimator):
            task_sleep_now_if_required()
def bst_fix_ffts(debug=0):
    debug = bool(int(debug))
    ffts = {}
    for recid in get_broken_recids():
        task_sleep_now_if_required(can_stop_too=True)
        write_message("Fixing %s" % recid)
        try:
            ffts[recid] = build_fft(get_last_pdf_for_record(BibRecDocs(recid)))
        except:
            register_exception(alert_admin=True)
    write_message("Uploading corrections")
    bibupload_ffts(ffts, append=True, do_debug=debug, interactive=False)
    return True
def redis_sync(modified_records, time_estimator, tot):
    """Sync to redis."""
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    for i, recid in enumerate(modified_records):
        with run_ro_on_slave_db():
            record = format_record(recid, 'xme', user_info=ADMIN_USER_INFO)[0]
        if not record:
            write_message("Error formatting record {0} as 'xme': {1}".format(
                recid, record))
        else:
            r.rpush('legacy_records', zlib.compress(record))
        if shall_sleep(recid, i, tot, time_estimator):
            task_sleep_now_if_required()
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" %
                  config.get(config.get("rank_method", "function"), "kb_src"))
    input = open(config.get(config.get("rank_method", "function"), "kb_src"), 'r')
    data = input.readlines()
    for line in data:
        if not line[0:1] == "#":
            kb_data[string.strip((string.split(string.strip(line), "---"))[0])] = \
                (string.split(string.strip(line), "---"))[1]
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"),
                      "check_mandatory_tags").split(", ")
    if tags == ['']:
        tags = ""

    records = []
    for (recids, recide) in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                       (tag, recids, recide))
        valid = HitSet(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = HitSet()
            newset += [recid[0] for recid in (run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                                                      (key, recids, recide)))]
            valid.intersection_update(newset)
        if tags:
            recs = filter(lambda x: x[0] in valid, recs)
        records = records + list(recs)
    write_message("Number of records found with the necessary tags: %s" % len(records))

    records = filter(lambda x: x[0] in options["validset"], records)
    rnkset = {}
    for key, value in records:
        if kb_data.has_key(value):
            if not rnkset.has_key(key):
                rnkset[key] = float(kb_data[value])
            else:
                if kb_data.has_key(rnkset[key]) and \
                        float(kb_data[value]) > float((rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0
    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset
def single_tag_rank(config):
    """Connect the given tag with the data from the kb file given"""
    write_message("Loading knowledgebase file", verbose=9)
    kb_data = {}
    records = []

    write_message("Reading knowledgebase file: %s" %
                  config.get(config.get("rank_method", "function"), "kb_src"))
    with open(config.get(config.get("rank_method", "function"), "kb_src"), 'r') as f:
        for line in f:
            if not line[0:1] == "#":
                key, value = line.strip().split("---")
                kb_data[key.strip()] = value.strip()
    write_message("Number of lines read from knowledgebase file: %s" % len(kb_data))

    tag = config.get(config.get("rank_method", "function"), "tag")
    tags = config.get(config.get("rank_method", "function"),
                      "check_mandatory_tags").split(",")
    if tags == ['']:
        tags = ""

    records = []
    for recids, recide in options["recid_range"]:
        task_sleep_now_if_required(can_stop_too=True)
        write_message("......Processing records #%s-%s" % (recids, recide))
        recs = run_sql("SELECT id_bibrec, value FROM bib%sx, bibrec_bib%sx WHERE tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                       (tag, recids, recide))
        valid = intbitset(trailing_bits=1)
        valid.discard(0)
        for key in tags:
            newset = intbitset(run_sql("SELECT id_bibrec FROM bib%sx, bibrec_bib%sx WHERE id_bibxxx=id AND tag=%%s AND id_bibxxx=id and id_bibrec >=%%s and id_bibrec<=%%s" % (tag[0:2], tag[0:2]),
                                       (key, recids, recide)))
            valid &= newset
        if tags:
            recs = [(recid, value) for recid, value in recs if recid in valid]
        records += list(recs)
    write_message("Number of records found with the necessary tags: %s" % len(records))

    records = [(recid, value) for recid, value in records
               if recid in options["validset"]]
    rnkset = {}
    for key, value in records:
        if value in kb_data:
            if key not in rnkset:
                rnkset[key] = float(kb_data[value])
            else:
                if kb_data.has_key(rnkset[key]) and \
                        float(kb_data[value]) > float((rnkset[key])[1]):
                    rnkset[key] = float(kb_data[value])
        else:
            rnkset[key] = 0
    write_message("Number of records available in rank method: %s" % len(rnkset))
    return rnkset