def task_submit_check_options():
    """Validate the 'collection' option before the task is submitted.

    When the task has a 'collection' option, the collection is looked up
    with get_collection(); if the returned object has no id, an error line
    is printed and the submission is rejected.

    @return: False when the requested collection has no id, True otherwise
        (including when no 'collection' option was given)
    """
    if not task_has_option('collection'):
        return True
    requested = get_collection(task_get_option("collection"))
    if requested.id is not None:
        return True
    print('ERROR: Collection "%s" does not exist' % requested.name)
    return False
def task_run_core():
    """Run the reformatting task with options read from the BibSched queue.

    This is what BibSched will be invoking via daemon call. For every
    comma-separated output format in the 'format' option ('HB' when the
    option is empty), the record selection (SQL queries, search query and
    explicit record IDs) is assembled and passed to bibreformat_task().

    @return: True
    """
    ## initialize parameters
    fmts = 'HB'  # default value if no format option given
    if task_get_option('format'):
        fmts = task_get_option('format')

    for fmt in fmts.split(','):
        # Pre-built SQL statements; "all" and "q2" are the same statement.
        with_cache = ("select br.id from bibrec as br, bibfmt as bf "
                      "where bf.id_bibrec=br.id and bf.format ='%s'" % fmt)
        stale_cache = ("select br.id from bibrec as br, bibfmt as bf "
                       "where bf.id_bibrec=br.id and bf.format='%s' "
                       "and bf.last_updated < br.modification_date" % fmt)
        sql = {
            "all": with_cache,
            "last": stale_cache,
            "q1": "select br.id from bibrec as br",
            "q2": with_cache,
        }

        # Only the statements matching the requested modes are executed.
        sql_queries = []
        for mode in ("all", "last"):
            if task_has_option(mode):
                sql_queries.append(sql[mode])

        # Search-query parameters; a missing option becomes an empty string.
        cds_query = {}
        for name in ('collection', 'field', 'pattern', 'matching'):
            if task_has_option(name):
                cds_query[name] = task_get_option(name)
            else:
                cds_query[name] = ""

        # Explicit record IDs: "N" or "START:END" items separated by commas.
        recids = intbitset()
        if task_has_option("recids"):
            for item in task_get_option('recids').split(','):
                if ":" in item:
                    bounds = item.split(':')
                    # range() leaves the END value itself out
                    recids += range(int(bounds[0]), int(bounds[1]))
                else:
                    recids.add(int(item))

        ### sql commands to be executed during the script run
        ###
        bibreformat_task(fmt, sql, sql_queries, cds_query,
                         task_has_option('without'),
                         not task_has_option('noprocess'), recids)

    return True
def task_submit_check_options():
    """Last checks and updating on the options...

    When none of the record-selection options ('all', 'collection',
    'field', 'pattern', 'matching', 'recids') is present, the 'last'
    option is set to 1.

    @return: True
    """
    selection_options = ('all', 'collection', 'field',
                         'pattern', 'matching', 'recids')
    if not any(task_has_option(name) for name in selection_options):
        task_set_option('last', 1)
    return True
def task_run_core():
    """Run the task by fetching arguments from the BibSched task queue.

    This is what BibSched will be invoking via daemon call. For each
    comma-separated format in the 'format' option (default 'HB,RECJSON')
    the set of record IDs to reformat is collected from the enabled
    sources and handed to bibreformat_task().

    @return: True
    """
    for fmt in task_get_option('format', 'HB,RECJSON').split(','):
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        selected = intbitset()

        if task_has_option("all"):
            selected += all_records()

        if task_has_option("last"):
            selected += outdated_caches(fmt, last_updated)

        # Records lacking a cache are included unless 'ignore_without' is set.
        if not task_has_option('ignore_without'):
            without_fmt = missing_caches(fmt)
            selected += without_fmt
        else:
            without_fmt = intbitset()

        # Record IDs given explicitly through the 'recids' option.
        selected += split_cli_ids_arg(task_get_option('recids', ''))

        # Records matched by the search options; unset options default to ''.
        query_params = dict(
            (name, task_get_option(name, ''))
            for name in ('collection', 'field', 'pattern', 'matching'))
        selected += query_records(query_params)

        bibreformat_task(fmt, selected, without_fmt,
                         not task_has_option('noprocess'))

    return True
def task_run_core():
    """Run the task by fetching arguments from the BibSched task queue.

    This is what BibSched will be invoking via daemon call. The 'format'
    option (default 'HB,RECJSON') is split on commas and each format is
    processed in turn by bibreformat_task().

    @return: True
    """
    formats = task_get_option('format', 'HB,RECJSON').split(',')
    search_options = ('collection', 'field', 'pattern', 'matching')

    for fmt in formats:
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        to_reformat = intbitset()

        # option 'all': every record
        if task_has_option("all"):
            to_reformat += all_records()

        # option 'last': records reported by outdated_caches()
        if task_has_option("last"):
            to_reformat += outdated_caches(fmt, last_updated)

        # records without a cache, unless option 'ignore_without' is present
        if task_has_option('ignore_without'):
            without_fmt = intbitset()
        else:
            without_fmt = missing_caches(fmt)
            to_reformat += without_fmt

        # option 'recids': explicitly listed record IDs
        cli_recids = split_cli_ids_arg(task_get_option('recids', ''))
        to_reformat += cli_recids

        # search options, each defaulting to the empty string
        query_params = {}
        for option_name in search_options:
            query_params[option_name] = task_get_option(option_name, '')
        to_reformat += query_records(query_params)

        process = not task_has_option('noprocess')
        bibreformat_task(fmt, to_reformat, without_fmt, process)

    return True
def update_rule_last_run(rule_name, next_starting_date=None):
    """
    Store the time a rule was last run in the bibcheck_rules table.

    This function should be called after a rule has been run. Nothing is
    written when the task has the 'record_ids' option, or when the
    'no_upload' or 'no_tickets' option is true.

    @param rule_name: name of the rule (matched against bibcheck_rules.name)
    @param next_starting_date: value to store as last_run. Optional, so that
        both the one-argument and the two-argument call styles work; when
        omitted (None) the task parameter 'task_starting_time' is stored.
        The value is passed to run_sql() unchanged.
    @return: None
    """
    if task_has_option('record_ids') or task_get_option('no_upload', False) \
            or task_get_option('no_tickets', False):
        return  # We don't want to update the database in this case

    if next_starting_date is None:
        next_starting_date = task_get_task_param('task_starting_time')

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;",
                      (next_starting_date, rule_name,))
    if not updated:  # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, next_starting_date))
def task_submit_check_options():
    """Check that options are valid.

    Every comma-separated job name in the 'wjob' option must have at least
    one row in the expJOB table.

    @return: False (after logging a message) at the first unknown job name;
        True otherwise, including when 'wjob' is absent or empty
    """
    if not task_has_option('wjob'):
        return True

    jobnames = task_get_option('wjob')
    if not jobnames:
        return True

    for jobname in jobnames.split(','):
        res = run_sql("SELECT COUNT(*) FROM expJOB WHERE jobname=%s",
                      (jobname,))
        jobname_exists = res and res[0][0]
        if not jobname_exists:
            write_message("Sorry, job name %s is not known. Exiting." % jobname)
            return False

    return True
def update_rule_last_run(rule_name, next_starting_date=None):
    """
    Store the time a rule was last run in the bibcheck_rules table.

    This function should be called after a rule has been run. The database
    is left untouched when the task has the 'record_ids' option, or when
    the 'no_upload' or 'no_tickets' option is true.

    @param rule_name: name of the rule (matched against bibcheck_rules.name)
    @param next_starting_date: value to store as last_run. Optional, so that
        callers may pass either one or two arguments; when omitted (None)
        the task parameter 'task_starting_time' is stored. The value is
        passed to run_sql() unchanged.
    @return: None
    """
    skip_update = (task_has_option('record_ids')
                   or task_get_option('no_upload', False)
                   or task_get_option('no_tickets', False))
    if skip_update:
        return  # We don't want to update the database in this case

    if next_starting_date is None:
        next_starting_date = task_get_task_param('task_starting_time')

    updated = run_sql("UPDATE bibcheck_rules SET last_run=%s WHERE name=%s;", (
        next_starting_date,
        rule_name,
    ))
    if not updated:  # rule not in the database, insert it
        run_sql("INSERT INTO bibcheck_rules(name, last_run) VALUES (%s, %s)",
                (rule_name, next_starting_date))
def task_run_core():
    """Run the reformatting task with options read from the BibSched queue.

    This is what BibSched will be invoking via daemon call. The record
    selection (SQL queries, search query and explicit record IDs) for the
    single format in the 'format' option is assembled and passed to
    bibreformat_task().

    @return: True
    """
    ## initialize parameters
    fmt = task_get_option('format')

    # Pre-built SQL statements; "all" and "q2" are the same statement.
    with_cache = ("select br.id from bibrec as br, bibfmt as bf "
                  "where bf.id_bibrec=br.id and bf.format ='%s'" % fmt)
    stale_cache = ("select br.id from bibrec as br, bibfmt as bf "
                   "where bf.id_bibrec=br.id and bf.format='%s' "
                   "and bf.last_updated < br.modification_date" % fmt)
    sql = {
        "all": with_cache,
        "last": stale_cache,
        "q1": "select br.id from bibrec as br",
        "q2": with_cache,
    }

    # Only the statements matching the requested modes are executed.
    sql_queries = []
    for mode in ("all", "last"):
        if task_has_option(mode):
            sql_queries.append(sql[mode])

    # Search-query parameters; a missing option becomes an empty string.
    cds_query = {}
    for name in ('collection', 'field', 'pattern', 'matching'):
        if task_has_option(name):
            cds_query[name] = task_get_option(name)
        else:
            cds_query[name] = ""

    # Explicit record IDs: "N" or "START:END" items separated by commas.
    recids = intbitset()
    if task_has_option("recids"):
        for item in task_get_option('recids').split(','):
            if ":" in item:
                bounds = item.split(':')
                # range() leaves the END value itself out
                recids += range(int(bounds[0]), int(bounds[1]))
            else:
                recids.add(int(item))

    ### sql commands to be executed during the script run
    ###
    bibreformat_task(fmt, sql, sql_queries, cds_query,
                     task_has_option('without'),
                     not task_has_option('noprocess'), recids)

    return True
def task_run_core():
    """ Reimplement to add the body of the task.

    Updates the collection caches. The update runs when
    check_nbrecs_for_all_external_collections() is true, when the 'force'
    option is set, or when compare_timestamps_with_tolerance() of the
    database and cache timestamps returns 0 or more; otherwise only a
    message is written.

    Options read: 'part' (1 = reclist cache only, 2 = webpage cache only,
    absent = both), 'collection', 'recursive', 'force'.

    @return: True
    """
    ##
    ## ------->--->time--->------>
    ##  (-1)  |   ( 0)    |  ( 1)
    ##        |     |     |
    ## [T.db] |  [T.fc]   | [T.db]
    ##        |     |     |
    ##        |<-tol|tol->|
    ##
    ## the above is the compare_timestamps_with_tolerance result "diagram"
    ## [T.db] stands for the database timestamp and [T.fc] for the file cache timestamp
    ## ( -1, 0, 1) stand for the returned value
    ## tol stands for the tolerance in seconds
    ##
    ## When a record has been added or deleted from one of the collections the T.db becomes greater than the T.fc
    ## and when webcoll runs it is fully run. It recalculates the reclists and nbrecs, and since it updates the
    ## collections db table it also updates the T.db. The T.fc is set as the moment the task started running thus
    ## slightly before the T.db (practically the time distance between the start of the task and the last call of
    ## update_reclist). Therefore when webcoll runs again, and even if no database changes have taken place in the
    ## meanwhile, it fully runs (because compare_timestamps_with_tolerance returns 0). This time though, and if
    ## no database changes have taken place, the T.db remains the same while T.fc is updated and as a result if
    ## webcoll runs again it will not be fully run
    ##
    # Taken before any work is done; stored as the cache timestamp at the end.
    task_run_start_timestamp = get_current_time_timestamp()
    colls = []
    # decide whether we need to run or not, by comparing last updated timestamps:
    write_message("Database timestamp is %s." % get_database_last_updated_timestamp(), verbose=3)
    write_message("Collection cache timestamp is %s." % get_cache_last_updated_timestamp(), verbose=3)
    if task_has_option("part"):
        write_message("Running cache update part %s only." % task_get_option("part"), verbose=3)
    if check_nbrecs_for_all_external_collections() or task_has_option("force") or \
       compare_timestamps_with_tolerance(get_database_last_updated_timestamp(),
                                         get_cache_last_updated_timestamp(),
                                         CFG_CACHE_LAST_UPDATED_TIMESTAMP_TOLERANCE) >= 0:
        ## either forced update was requested or cache is not up to date, so recreate it:
        # firstly, decide which collections to do:
        if task_has_option("collection"):
            # a single collection, optionally followed by its 'r' and 'v'
            # type descendants (in that order)
            coll = get_collection(task_get_option("collection"))
            colls.append(coll)
            if task_has_option("recursive"):
                r_type_descendants = coll.get_descendants(type='r')
                colls += r_type_descendants
                v_type_descendants = coll.get_descendants(type='v')
                colls += v_type_descendants
        else:
            # every collection, ordered by id
            res = run_sql("SELECT name FROM collection ORDER BY id")
            for row in res:
                colls.append(get_collection(row[0]))
        # secondly, update collection reclist cache:
        # (the default of 1 makes this part run when 'part' is not set)
        if task_get_option('part', 1) == 1:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / reclist cache update" % coll.name)
                if str(coll.dbquery).startswith("hostedcollection:"):
                    coll.set_nbrecs_for_external_collection()
                else:
                    coll.calculate_reclist()
                task_sleep_now_if_required()
                coll.update_reclist()
                task_update_progress("Part 1/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)
        # thirdly, update collection webpage cache:
        # (the default of 2 makes this part run when 'part' is not set)
        if task_get_option("part", 2) == 2:
            i = 0
            for coll in colls:
                i += 1
                write_message("%s / webpage cache update" % coll.name)
                coll.update_webpage_cache()
                task_update_progress("Part 2/2: done %d/%d" % (i, len(colls)))
                task_sleep_now_if_required(can_stop_too=True)

        # finally update the cache last updated timestamp:
        # (but only when all collections were updated, not when only
        # some of them were forced-updated as per admin's demand)
        if not task_has_option("collection"):
            set_cache_last_updated_timestamp(task_run_start_timestamp)
            write_message("Collection cache timestamp is set to %s." % get_cache_last_updated_timestamp(), verbose=3)
    else:
        ## cache up to date, we don't have to run
        write_message("Collection cache is up to date, no need to run.")
    ## we are done:
    return True
def task_run_core():
    """Run the reformatting task with options read from the BibSched queue.

    This is what BibSched will be invoking via daemon call. For every
    comma-separated output format in the 'format' option ('HB' when the
    option is empty), the stored last-run date is fetched, the record
    selection is assembled and bibreformat_task() is called.

    @return: True
    """
    ## initialize parameters
    fmts = 'HB'  # default value if no format option given
    if task_get_option('format'):
        fmts = task_get_option('format')

    for fmt in fmts.split(','):
        last_updated = fetch_last_updated(fmt)
        write_message("last stored run date is %s" % last_updated)

        # Records that already have a cache in this format.
        all_sql = ("SELECT br.id FROM bibrec AS br, bibfmt AS bf "
                   "WHERE bf.id_bibrec = br.id AND bf.format = '%s'" % fmt)
        # Records modified since the stored run date whose cache is older
        # than the record itself.
        last_sql = (
            "SELECT br.id FROM bibrec AS br "
            "INNER JOIN bibfmt AS bf ON bf.id_bibrec = br.id "
            "WHERE br.modification_date >= '%(last_updated)s' "
            "AND bf.format='%(format)s' "
            "AND bf.last_updated < br.modification_date"
            % {'format': fmt,
               'last_updated': last_updated.strftime('%Y-%m-%d %H:%M:%S')})
        # Records without a cache in this format; the two remaining %s
        # placeholders (the id range) are left for the consumer to fill.
        missing_sql = (
            "SELECT br.id FROM bibrec as br "
            "LEFT JOIN bibfmt as bf ON bf.id_bibrec = br.id "
            "AND bf.format ='%s' "
            "WHERE bf.id_bibrec IS NULL AND br.id BETWEEN %%s AND %%s " % fmt)
        sql = {
            "all": all_sql,
            "last": last_sql,
            "missing": missing_sql,
        }

        # Only the statements matching the requested modes are executed.
        sql_queries = []
        for mode in ("all", "last"):
            if task_has_option(mode):
                sql_queries.append(sql[mode])

        # Search-query parameters; a missing option becomes an empty string.
        cds_query = {}
        for name in ('collection', 'field', 'pattern', 'matching'):
            if task_has_option(name):
                cds_query[name] = task_get_option(name)
            else:
                cds_query[name] = ""

        # Record IDs given explicitly through the 'recids' option.
        recids = []
        if task_has_option("recids"):
            recids = list(split_cli_ids_arg(task_get_option('recids')))

        ### sql commands to be executed during the script run
        ###
        bibreformat_task(fmt, sql, sql_queries, cds_query,
                         task_has_option('without'),
                         not task_has_option('noprocess'), recids)

    return True
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """ Must be defined for bibtask to create a task

    Handles one command-line option of the task. Two mutually exclusive
    styles are supported:

    - '-e' / '--extraction-job': the job file <CFG_ETCDIR>/bibedit/<value>.cfg
      is read line by line ("name = value" or a standalone flag), checked
      against CFG_REFEXTRACT_JOB_FILE_PARAMS, and turned into task options.
      The collected information is also stored as the 'extraction-job'
      option.
    - any other recognised flag ('-i', '-c', '-z', '-r', '-x', '-d', '-p',
      '-j', '-n' and their long forms) sets or extends the matching task
      option directly.

    @param key: the option flag being processed (e.g. '-i' or '--recid')
    @param value: the value given for that flag
    @param opts: not referenced in this function
    @param args: standalone command-line arguments; must be empty
    @return: True when the option was accepted (or the key is not one of the
        flags handled here); a false value (False or 0) after an error
        message has been written
    @raise StandardError: when any standalone argument is present
    """
    if args and len(args) > 0:
        ## There should be no standalone arguments for any refextract job
        ## This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'.\n" % args[0])

    ## Task name specified
    if key in ('-e', '--extraction-job'):

        ## Make sure that the user is not mixing job name with other defined
        ## Refextract flags on the command line
        if filter(lambda p: task_get_option(p), possible_task_option_keys):
            write_message("Error: cli and extraction-job extraction parameters specified together.")
            write_message("The extraction-job flag cannot be mixed with other cli flags.")
            return False

        ## ---- Get the task file with this name
        task_file_dir = os.path.join(CFG_ETCDIR, 'bibedit')
        ## The job file name
        task_file = value + '.cfg'
        abs_path = os.path.join(task_file_dir, task_file)
        try:
            ## Open and readlines from file
            file_hdl = open(abs_path, 'r')
            file_params = file_hdl.readlines()
            file_hdl.close()
        except IOError:
            write_message("Error: Unable to read job file '%s'" % \
                            abs_path, stream=sys.stdout, verbose=0)
            return False

        ## ---- Get the database 'last_updated' value for this name
        xtrJOB_row = _task_name_exists(value)

        ## Build the information for this extraction job
        ## These dictionaries will be extended with extra file parameters
        if xtrJOB_row:
            # Columns 0, 1 and 2 of the first returned row are used as
            # id, name and last_updated respectively.
            task_info = {'id' : xtrJOB_row[0][0],
                         'name' : xtrJOB_row[0][1],
                         'last_updated' : xtrJOB_row[0][2],
                         'collections' : [],
                         'recids' : [],}
        else:
            ## Save the name as the input argument for this job
            task_info = {'name' : value,
                         'last_updated' : None,
                         'collections' : [],
                         'recids' : [],}

        ## ---- Save job parameters
        for p in file_params:
            p = p.strip()
            ## Ignore comments and titles, and skip blank lines
            if (not p) or p.startswith('#') or p.startswith("["):
                continue
            ## Split arguments just once
            p_args = map(lambda x: x.strip(), p.split("=", 1))
            ## Check cfg file param against list of valid params
            if not (p_args[0] in CFG_REFEXTRACT_JOB_FILE_PARAMS):
                write_message("Error: Unknown task param '%s' inside '%s'." \
                              % (p_args[0], task_file),
                              stream=sys.stdout, verbose=0)
                return False

            # NOTE(review): a 'collection' or 'recid' line without '=' has
            # no p_args[1]; the two branches below index it unconditionally.
            if p_args[0] == 'collection':
                ## Separate and strip collections
                collections = map(lambda c: c.strip(), p_args[1].split(','))
                task_info['collections'].extend([c for c in collections if c.strip()])
            #FIXME add author extraction functionality
            # elif p_args[0] == 'extraction-mode':
            #     if p_args[0] == 'authors':
            #         task_set_option('authors', p_args[1])
            elif p_args[0] == 'recid':
                recids = p_args[1].split(",")
                task_info['recids'].extend([r for r in recids if r.strip()])
            elif len(p_args) == 2:
                ## All other flags
                task_info[p_args[0]] = p_args[1]
            else:
                ## Standalone flag
                task_info[p_args[0]] = 1

        if not ('xmlfile' in task_info):
            task_info['xmlfile'] = _generate_default_xml_out()

        ## Used to flag the creation of a bibupload task
        task_set_option('extraction-job', task_info)

        ## using the extraction-job options...
        ## set the task options
        for option, value in task_info.items():
            if option == 'collections':
                for collection in value:
                    collection_row = _collection_exists(collection)
                    if not collection_row:
                        write_message("Error: '%s' is not a valid collection." % collection,
                                      stream=sys.stdout, verbose=0)
                        # 0 rather than False here; both are false values
                        return 0
                    ## Use the collection name matched from the database
                    task_get_option(option).append(collection_row[0][0])
            elif option == 'recids':
                for recid in value:
                    if not _recid_exists(recid):
                        write_message("Error: '%s' is not a valid record id." % recid,
                                      stream=sys.stdout, verbose=0)
                        # 0 rather than False here; both are false values
                        return 0
                    ## Add this valid record id to the list of record ids
                    task_get_option(option).append(recid)
            elif option not in ('id', 'name', 'last_updated'):
                ## Usual way of setting options, but this time from the extraction-job file
                task_set_option(option, value)

    else:
        ## Quick check to see if an extraction job has also been specified
        if task_has_option('extraction-job'):
            write_message("Error: cli and extraction-job extraction parameters specified together.")
            write_message("The extraction-job flag cannot be mixed with other cli flags.")
            return False

        # Recid option
        elif key in ("-i", "--recid"):
            split_recids = value.split(":")
            if len(split_recids) == 2:
                # "FIRST:LAST" range
                first = last = valid_range = None
                try:
                    first = int(split_recids[0])
                    last = int(split_recids[1])
                    valid_range = first < last
                except ValueError:
                    write_message("Error: Range values for --recid must be integers, "
                        "not '%s'." % value, stream=sys.stdout, verbose=0)
                if first is None or last is None:
                    return False
                if not _recid_exists(first) or not _recid_exists(last) or not valid_range:
                    write_message("Error: '%s' is not a valid range of record ID's." % value,
                        stream=sys.stdout, verbose=0)
                    return False
                # range() leaves the LAST value itself out
                task_get_option('recids').extend(range(first, last))
            else:
                # single record id
                int_val = None
                try:
                    int_val = int(value)
                except ValueError:
                    write_message("Error: The value specified for --recid must be a "
                        "valid integer, not '%s'." % value, stream=sys.stdout,
                        verbose=0)
                if not _recid_exists(value) or int_val is None:
                    write_message("Error: '%s' is not a valid record ID." % value,
                        stream=sys.stdout, verbose=0)
                    return False
                # the original string value is stored, not int_val
                task_get_option('recids').append(value)
        # Collection option
        elif key in ("-c", "--collection"):
            collection_row = _collection_exists(value)
            if not collection_row:
                write_message("Error: '%s' is not a valid collection." % value,
                    stream=sys.stdout, verbose=0)
                return False
            task_get_option('collections').append(collection_row[0][0])
        elif key in ('-z', '--raw-references'):
            task_set_option('raw-references', True)
        elif key in ('-r', '--output-raw-refs'):
            task_set_option('output-raw-refs', True)
        elif key in ('-x', '--xmlfile'):
            task_set_option('xmlfile', value)
        elif key in ('-d', '--dictfile'):
            task_set_option('dictfile', value)
        elif key in ('-p', '--inspire'):
            task_set_option('inspire', True)
        elif key in ('-j', '--kb-journal'):
            task_set_option('kb-journal', value)
        elif key in ('-n', '--kb-report-number'):
            task_set_option('kb-report-number', value)
    return True
def _task_run_core():
    """Translate the task options into refextract command-line options.

    Builds the daemon_cli_opts dictionary from the task options and, when
    the 'recids' option is non-empty, extends its 'fulltext' entry with the
    arguments returned by _get_fulltext_args_from_recids().

    NOTE(review): the visible body neither passes daemon_cli_opts on to
    the extraction code nor returns a value - confirm against the rest of
    the module.

    @return: None
    @raise StandardError: when the fulltext arguments cannot be obtained
        for the requested record ids; the message is the same text that is
        written to the log
    """
    def _append_recid_collection_list(collection, current_recids):
        """Updated list of recids with new recids from collection
        @param collection: (string) collection name to use to obtain record ids
        @param current_recids: (list) list of current record ids which have
        already been obtained from previous collection or recid flags
        @return: (list) current record ids with newly appended recids from
        input collection
        """
        # NOTE(review): this helper is not referenced in the visible body.
        records = get_collection_reclist(collection)
        for r in records:
            if r not in current_recids:
                current_recids.append(r)
        return current_recids

    daemon_cli_opts = {
        'treat_as_reference_section': 0,
        'fulltext': [],
        'output_raw': 0,
        'verbosity': 0,
        'xmlfile': 0,
        'dictfile': 0,
        'inspire': 0,
        'kb-journal': 0,
        'kb-report-number': 0,
        'extraction-mode': 'ref',
    }

    ## holds the name of the extraction job, and if it's already in the db
    task_info = task_get_option('extraction-job')

    ## Now set the cli options, from the set task options list
    if task_has_option('verbose'):
        # Normalise to text first: calling .isdigit() directly on a
        # non-string value (e.g. an int) would raise AttributeError.
        v = str(task_get_option('verbose'))
        # Anything that is not a whole number from 0 to 9 falls back to 0.
        if v.isdigit() and 0 <= int(v) <= 9:
            daemon_cli_opts['verbosity'] = int(v)
        else:
            daemon_cli_opts['verbosity'] = 0
    if task_has_option('raw-references'):
        daemon_cli_opts['treat_as_reference_section'] = 1
    if task_has_option('output-raw-refs'):
        daemon_cli_opts['output_raw'] = 1
    if task_has_option('xmlfile'):
        daemon_cli_opts['xmlfile'] = task_get_option('xmlfile')
    if task_has_option('dictfile'):
        daemon_cli_opts['dictfile'] = task_get_option('dictfile')
    if task_has_option('inspire'):
        daemon_cli_opts['inspire'] = 1
    if task_has_option('kb-journal'):
        daemon_cli_opts['kb-journal'] = task_get_option('kb-journal')
    if task_has_option('kb-report-number'):
        daemon_cli_opts['kb-report-number'] = task_get_option(
            'kb-report-number')

    recids = task_get_option('recids')
    if recids:
        ## Construct the fulltext argument equivalent from record id's
        ## (records, and arguments, which have valid files)
        try:
            fulltexts_for_collection = \
                _get_fulltext_args_from_recids(recids, task_info)
            daemon_cli_opts['fulltext'].extend(fulltexts_for_collection)
        except Exception as err:
            message = 'Error: Unable to obtain fulltexts for recid %s. %s' \
                      % (str(recids), err)
            write_message(message, stream=sys.stdout, verbose=0)
            # Same exception type as before, now carrying the error text.
            raise StandardError(message)
def bibreformat_task(fmt, recids, without_fmt, process):
    """BibReformat main task.

    @param fmt: output format to use
    @param recids: record IDs to reformat; they are consumed in chunks of
        5000 and, for format "HDREF", extended with citing records
    @param without_fmt: record IDs without an existing cache; only its size
        is used here, for the log message
    @param process: when false the records are selected and counted but not
        formatted
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        """Extend a chunk with related records and drop already seen ones.

        Both arguments are modified in place; the (modified) chunk is
        returned.
        """
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime(
                    "%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            rel_recids = intbitset([
                recid for recid, mod_date in run_sql(sql)
                if check_date(mod_date)
            ])
            # Iterates rel_recids (a separate set), so growing recids here
            # is safe.
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        """Yield each record ID to process once, working in chunks of 5000."""
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        # last, possibly incomplete, chunk
        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" % len(without_fmt))

    ### Initialize main loop
    total_rec = 0      # Total number of records
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    t2 = os.times()[4]

    elapsed = t2 - t1
    write_message("total records processed: %d" % total_rec)
    # '%.2f' (two decimals) replaces '%2f', which only set a minimum width
    # of 2 and printed six decimals.
    write_message("total processing time: %.2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %.2f sec" % tbibformat)
    write_message(" bibupload: %.2f sec" % tbibupload)
def bibreformat_task(fmt, recids, without_fmt, process):
    """
    BibReformat main task

    @param fmt: output format to use
    @param recids: record IDs to reformat; they are consumed in chunks of
        5000 and, for format "HDREF", extended with citing records
    @param without_fmt: record IDs without an existing cache; only its size
        is used here, for the log message
    @param process: when false the records are selected and counted but not
        formatted
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    latest_bibrank_run = get_bibrankmethod_lastupdate('citation')

    def related_records(recids, recids_processed):
        """Extend a chunk with related records and drop already seen ones.

        Both arguments are modified in place; the (modified) chunk is
        returned.
        """
        if fmt == "HDREF" and recids:
            # HDREF represents the references tab
            # the tab needs to be recomputed not only when the record changes
            # but also when one of the citations changes
            sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recids)

            def check_date(mod_date):
                return mod_date.strftime("%Y-%m-%d %H:%M:%S") < latest_bibrank_run
            rel_recids = intbitset([recid for recid, mod_date in run_sql(sql)
                                    if check_date(mod_date)])
            # Iterates rel_recids (a separate set), so growing recids here
            # is safe.
            for r in rel_recids:
                recids |= intbitset(get_cited_by(r))

        # To not process recids twice
        recids -= recids_processed
        # Adds to the set of processed recids
        recids_processed += recids

        return recids

    def recid_chunker(recids):
        """Yield each record ID to process once, working in chunks of 5000."""
        recids_processed = intbitset()
        chunk = intbitset()

        for recid in recids:
            if len(chunk) == 5000:
                for r in related_records(chunk, recids_processed):
                    yield r
                recids_processed += chunk
                chunk = intbitset()

            if recid not in recids_processed:
                chunk.add(recid)

        # last, possibly incomplete, chunk
        if chunk:
            for r in related_records(chunk, recids_processed):
                yield r

    recIDs = list(recid_chunker(recids))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records
    write_message("Records to be processed: %d" % len(recIDs))
    if without_fmt:
        write_message("Out of it records without existing cache: %d" % len(without_fmt))

    ### Initialize main loop
    total_rec = 0      # Total number of records
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    if process:
        total_rec_1, tbibformat_1, tbibupload_1 = iterate_over_new(recIDs, fmt)
        total_rec += total_rec_1
        tbibformat += tbibformat_1
        tbibupload += tbibupload_1

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    t2 = os.times()[4]

    elapsed = t2 - t1
    write_message("total records processed: %d" % total_rec)
    # '%.2f' (two decimals) replaces '%2f', which only set a minimum width
    # of 2 and printed six decimals.
    write_message("total processing time: %.2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %.2f sec" % tbibformat)
    write_message(" bibupload: %.2f sec" % tbibupload)
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
        (for selecting records). It is passed to without_fmt() when
        process_format is true
    @param sql_queries: a list of sql queries to be executed to select
        records to reformat
    @param cds_query: a search query to be executed to select records to
        reformat; a dictionary with the keys 'collection', 'field',
        'pattern' and 'matching' (empty string meaning "not set")
    @param process_format: when true, records without an existing cache are
        also looked up and processed ('-without' parameter)
    @param process: when false the records are selected and counted but not
        formatted
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(
                perform_request_search(req=None, of='id',
                                       c=cds_query['collection'],
                                       p=cds_query['pattern'],
                                       f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        sql = """SELECT id, modification_date FROM bibrec WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        def check_date(mod_date):
            return mod_date < latest_bibrank_run
        selected = intbitset([recid for recid, mod_date in run_sql(sql)
                              if check_date(mod_date)])
        # Add the citing records of every selected record. The loop runs
        # over 'selected' while a separate copy is extended, so the set
        # being iterated is never modified during iteration.
        recIDs = intbitset(selected)
        for r in selected:
            recIDs |= intbitset(get_cited_by(r))

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d" % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop
    total_rec = 0      # Total number of records
    tbibformat = 0     # time taken up by external call
    tbibupload = 0     # time taken up by external call

    if process:
        # FIXME: remove the iterate_over_old choice when migration from php
        # to python bibformat is done
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new

        ### Iterate over all records prepared in lists I (option)
        ### and, with '-without', in list II (no_format)
        record_lists = [recIDs]
        if process_format:
            record_lists.append(without_format)

        for record_list in record_lists:
            (total_rec_n, tbibformat_n, tbibupload_n) = iterate(record_list, fmt)
            total_rec += total_rec_n
            tbibformat += tbibformat_n
            tbibupload += tbibupload_n

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics
    t2 = os.times()[4]

    elapsed = t2 - t1
    write_message("total records processed: %d" % total_rec)
    # '%.2f' (two decimals) replaces '%2f', which only set a minimum width
    # of 2 and printed six decimals.
    write_message("total processing time: %.2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %.2f sec" % tbibformat)
    write_message(" bibupload: %.2f sec" % tbibupload)
def task_run_core():
    """
    Main daemon task.

    Optionally resets the requested rules, loads plugins and rules, runs
    every rule on the records selected for it (batch rules once per batch,
    single-record rules once per record) and then uploads amended records
    and submits tickets for invalid ones in chunks of CFG_BATCH_SIZE.

    Returns True once the run has completed.
    """
    reset_names = task_get_option("reset_rules")
    if reset_names:
        write_message("Resetting the following rules: %s" % reset_names)
        for reset_name in reset_names:
            reset_rule_last_run(reset_name)

    plugins = load_plugins()
    rules = load_rules(plugins)
    write_message("Loaded rules: %s" % rules, verbose=9)
    task_set_option('plugins', plugins)
    recids_for_rules = get_recids_for_rules(rules)
    write_message("recids for rules: %s" % recids_for_rules, verbose=9)

    # The rules' last-run dates are only recorded when no explicit record
    # ids were requested and neither uploads nor tickets are disabled.
    update_database = (not task_has_option('record_ids')
                       and not task_get_option('no_upload', False)
                       and not task_get_option('no_tickets', False))

    if update_database:
        next_starting_dates = {}
        for name, rule in rules.iteritems():
            next_starting_dates[name] = get_next_starting_date(rule)

    # Collect every record id to visit and split the rules by plugin kind.
    all_recids = intbitset([])
    single_rules = set()
    batch_rules = set()
    for name, selected in recids_for_rules.iteritems():
        all_recids.union_update(selected)
        is_batch_plugin = plugins[rules[name]["check"]]["batch"]
        if is_batch_plugin:
            batch_rules.add(name)
        else:
            single_rules.add(name)

    pending_holdingpen = []
    pending_replace = []
    pending_tickets = []

    for batch in iter_batches(all_recids, CFG_BATCH_SIZE):

        # First hand each batch rule all of its records from this batch
        for name in batch_rules:
            selected = recids_for_rules[name]
            task_sleep_now_if_required(can_stop_too=True)
            batch_records = [rec for _, rec_id, rec in batch
                             if rec_id in selected]
            if batch_records:
                check_records(rules[name], batch_records)

        # Then run them through normal rules
        for i, record_id, record in batch:
            total = len(all_recids)
            progress_percent = int(float(i) / total * 100)
            task_update_progress("Processing record %s/%s (%i%%)."
                                 % (i, total, progress_percent))
            write_message("Processing record %s" % record_id)

            for name in single_rules:
                task_sleep_now_if_required(can_stop_too=True)
                if record_id in recids_for_rules[name]:
                    check_record(rules[name], record)

            if record.amended:
                if record.holdingpen:
                    pending_holdingpen.append(record)
                else:
                    pending_replace.append(record)

            if not record.valid:
                pending_tickets.append(record)

        # Flush the queues that have grown to a full batch
        if len(pending_tickets) >= CFG_BATCH_SIZE:
            Tickets(pending_tickets).submit()
            pending_tickets = []
        if len(pending_holdingpen) >= CFG_BATCH_SIZE:
            upload_amendments(pending_holdingpen, True)
            pending_holdingpen = []
        if len(pending_replace) >= CFG_BATCH_SIZE:
            upload_amendments(pending_replace, False)
            pending_replace = []

    ## In case there are still some remaining amended records
    if pending_tickets:
        Tickets(pending_tickets).submit()
    if pending_holdingpen:
        upload_amendments(pending_holdingpen, True)
    if pending_replace:
        upload_amendments(pending_replace, False)

    # Update the database with the last time each rule was ran
    if update_database:
        for name, rule in rules.iteritems():
            update_rule_last_run(name, next_starting_dates[name])

    return True
## with a timestamp perm_file_fd, perm_file_name = \ mkstemp(suffix='.xml', prefix="refextract_%s_" % \ time.strftime("%Y-%m-%d_%H:%M:%S"), \ dir=os.path.join(CFG_TMPDIR, "refextract")) copyfile(daemon_cli_opts['xmlfile'], perm_file_name) os.close(perm_file_fd) except IOError, err: write_message("Error: Unable to copy content to timestamped XML file, %s" \ % err) return 0 ## Now, given the references have been output to option 'xmlfile' ## enrich the meta-data of the affected records, via bibupload ## Only if a named file was given as input if task_has_option('extraction-job'): cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, perm_file_name) errcode = 0 try: errcode = os.system(cmd) except OSError, exc: write_message('Error: Command %s failed [%s].' % (cmd, exc), stream=sys.stdout, verbose=0) if errcode != 0: write_message("Error: %s failed, error code is %d." % (cmd, errcode), stream=sys.stdout, verbose=0) return 0 ## Update the extraction_date for each record id,
def bibreformat_task(fmt, sql, sql_queries, cds_query, process_format, process, recids):
    """
    BibReformat main task

    Selects the records to reformat (explicit recids, CDS search query and
    raw SQL queries), formats them and logs timing statistics.

    @param fmt: output format to use
    @param sql: dictionary with pre-created sql queries for various cases
                (for selecting records). It is handed to without_fmt() when
                process_format is set.
    @param sql_queries: a list of sql queries to be executed to select
                        records to reformat.
    @param cds_query: a search query to be executed to select records
                      to reformat (dictionary with the keys 'field',
                      'collection', 'pattern' and 'matching')
    @param process_format: when true ('-without' parameter), the records
                           returned by without_fmt() are reformatted as well
    @param process: when false, records are only selected and counted,
                    nothing is formatted
    @param recids: a list of record IDs to reformat
    @return: None
    """
    write_message("Processing format %s" % fmt)

    t1 = os.times()[4]

    start_date = datetime.now()

    ### Query the database
    ###
    task_update_progress('Fetching records to process')
    if process_format:  # '-without' parameter
        write_message("Querying database for records without cache...")
        without_format = without_fmt(sql)

    recIDs = intbitset(recids)

    if cds_query['field'] != "" or \
       cds_query['collection'] != "" or \
       cds_query['pattern'] != "":

        write_message("Querying database (CDS query)...")

        if cds_query['collection'] == "":
            # use search_pattern() whenever possible, as it can search
            # even in private collections
            res = search_pattern(p=cds_query['pattern'],
                                 f=cds_query['field'],
                                 m=cds_query['matching'])
        else:
            # use perform_request_search when '-c' argument has been
            # defined, as it is not supported by search_pattern()
            res = intbitset(perform_request_search(req=None, of='id',
                                                   c=cds_query['collection'],
                                                   p=cds_query['pattern'],
                                                   f=cds_query['field']))

        recIDs |= res

    for sql_query in sql_queries:
        write_message("Querying database (%s) ..." % sql_query, verbose=2)
        recIDs |= intbitset(run_sql(sql_query))

    if fmt == "HDREF" and recIDs:
        # HDREF represents the references tab
        # the tab needs to be recomputed not only when the record changes
        # but also when one of the citations changes
        latest_bibrank_run = get_bibrankmethod_lastupdate('citation')
        start_date = latest_bibrank_run
        # Use a dedicated name so the 'sql' parameter is not shadowed.
        date_query = """SELECT id, modification_date FROM bibrec
                        WHERE id in (%s)""" % ','.join(str(r) for r in recIDs)

        recIDs = intbitset([recid for recid, mod_date in run_sql(date_query)
                            if mod_date < latest_bibrank_run])
        # Gather the related records in a separate set: growing recIDs
        # while iterating over it made the result depend on iteration order.
        cited_by = intbitset()
        for recid in recIDs:
            cited_by |= intbitset(get_cited_by(recid))
        recIDs |= cited_by

    ### list of corresponding record IDs was retrieved
    ### now format the selected records

    if process_format:
        write_message("Records to be processed: %d" % (len(recIDs)
                                                       + len(without_format)))
        write_message("Out of it records without existing cache: %d"
                      % len(without_format))
    else:
        write_message("Records to be processed: %d" % (len(recIDs)))

    ### Initialize main loop

    total_rec = 0     # Total number of records
    tbibformat = 0    # time taken up by external call
    tbibupload = 0    # time taken up by external call

    ### Iterate over all records prepared in lists I (option)
    ### and, with '-without', in list II (no_format)
    record_lists = []
    if process:
        record_lists.append(recIDs)
        if process_format:
            record_lists.append(without_format)

    for record_list in record_lists:
        if CFG_BIBFORMAT_USE_OLD_BIBFORMAT:
            # FIXME: remove this when migration from php to
            # python bibformat is done
            iterate = iterate_over_old
        else:
            iterate = iterate_over_new
        (done, spent_format, spent_upload) = iterate(record_list, fmt)
        total_rec += done
        tbibformat += spent_format
        tbibupload += spent_upload

    ### Store last run time
    if task_has_option("last"):
        write_message("storing run date to %s" % start_date)
        store_last_updated(fmt, start_date)

    ### Final statistics

    t2 = os.times()[4]

    elapsed = t2 - t1
    write_message("total records processed: %d" % total_rec)
    write_message("total processing time: %2f sec" % elapsed)
    write_message("Time spent on external call (os.system):")
    write_message(" bibformat: %2f sec" % tbibformat)
    write_message(" bibupload: %2f sec" % tbibupload)
def task_submit_check_options():
    """
    Check the submitted options and run the immediate-mode ones on the spot.

    NOTE: Depending on the parameters, either "BibSched mode" or plain
    straightforward execution mode is entered.

    The options create_event_with_id, destroy_event_with_id, list_events,
    dump_config and load_config are executed immediately and end the process
    with sys.exit(0). The cache_events option stores the selected event names
    in the 'keyevents' and 'customevents' task options.

    @return: True when cache_events selected at least one known event;
             False when no event matched or no recognised option was given
             (False means that the --help should be displayed)
    """
    if task_has_option("create_event_with_id"):
        print(webstat.create_customevent(
            task_get_option("create_event_with_id"),
            task_get_option("event_name", None),
            task_get_option("column_headers", [])))
        sys.exit(0)

    elif task_has_option("destroy_event_with_id"):
        print(webstat.destroy_customevent(
            task_get_option("destroy_event_with_id")))
        sys.exit(0)

    elif task_has_option("list_events"):
        events = webstat._get_customevents()
        if not events:
            print("There are no custom events available.")
        else:
            print("Available custom events are:\n")
            print('\n'.join([
                x[0] + ": " +
                ("No descriptive name" if x[1] is None else str(x[1]))
                for x in events]))
        sys.exit(0)

    elif task_has_option("cache_events"):
        events = task_get_option("cache_events")

        write_message(str(events), verbose=9)

        # Start from empty selections so that an empty first event name
        # falls through to "no events" instead of raising a NameError.
        keyevents_to_cache = []
        customevents_to_cache = []

        if events[0] == 'ALL':
            keyevents_to_cache = list(webstat.KEYEVENT_REPOSITORY.keys())
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] == 'KEYEVENTS':
            keyevents_to_cache = list(webstat.KEYEVENT_REPOSITORY.keys())

        elif events[0] == 'CUSTOMEVENTS':
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] != '':
            keyevents_to_cache = [x for x in webstat.KEYEVENT_REPOSITORY.keys()
                                  if x in events]
            # Compare the event id (first column), not the whole row.
            customevents_to_cache = [x[0] for x in webstat._get_customevents()
                                     if x[0] in events]

        # Control so that we have valid event names
        if not keyevents_to_cache and not customevents_to_cache:
            # Oops, no events. Abort and display help.
            return False

        task_set_option("keyevents", keyevents_to_cache)
        task_set_option("customevents", customevents_to_cache)
        return True

    elif task_has_option("dump_config"):
        print("""\
[general]
visitors_box = True
search_box = True
record_box = True
bibsched_box = True
basket_box = True
apache_box = True
uptime_box = True

[webstat_custom_event_1]
name = baskets
param1 = action
param2 = basket
param3 = user

[apache_log_analyzer]
profile = nil
nb-histogram-items-to-print = 20
exclude-ip-list = ("137.138.249.162")
home-collection = "Atlantis Institute of Fictive Science"
search-interface-url = "/?"
detailed-record-url = "/%s/"
search-engine-url = "/search?"
search-engine-url-old-style = "/search.py?"
basket-url = "/yourbaskets/"
add-to-basket-url = "/yourbaskets/add"
display-basket-url = "/yourbaskets/display"
display-public-basket-url = "/yourbaskets/display_public"
alert-url = "/youralerts/"
display-your-alerts-url = "/youralerts/list"
display-your-searches-url = "/youralerts/display"
""" % CFG_SITE_RECORD)
        sys.exit(0)

    elif task_has_option("load_config"):
        from ConfigParser import ConfigParser
        conf = ConfigParser()
        conf.read(CFG_WEBSTAT_CONFIG_PATH)
        for section in conf.sections():
            if not section.startswith("webstat_custom_event_"):
                continue
            cols = []
            name = ""
            for option, value in conf.items(section):
                if option == "name":
                    name = value
                if option.startswith("param"):
                    # add the column name in its position; read the whole
                    # numeric suffix so that param10 and above work too
                    index = int(option[5:]) - 1
                    while len(cols) <= index:
                        cols.append("")
                    cols[index] = value
            if name:
                res = run_sql("SELECT COUNT(id) FROM staEVENT WHERE id = %s",
                              (name, ))
                if res[0][0] == 0:
                    # name does not exist, create customevent
                    webstat.create_customevent(name, name, cols)
                else:
                    # name already exists, update customevent
                    webstat.modify_customevent(name, cols=cols)

        sys.exit(0)

    else:
        # False means that the --help should be displayed
        return False
def task_submit_check_options():
    """
    Check the submitted options and run the immediate-mode ones on the spot.

    NOTE: Depending on the parameters, either "BibSched mode" or plain
    straightforward execution mode is entered.

    The options create_event_with_id, destroy_event_with_id, list_events,
    dump_config and load_config are executed immediately and end the process
    with sys.exit(0). The cache_events option stores the selected event names
    in the "keyevents" and "customevents" task options.

    @return: True when cache_events selected at least one known event;
             False when no event matched or no recognised option was given
             (False means that the --help should be displayed)
    """
    if task_has_option("create_event_with_id"):
        print(
            webstat.create_customevent(
                task_get_option("create_event_with_id"),
                task_get_option("event_name", None),
                task_get_option("column_headers", []),
            )
        )
        sys.exit(0)

    elif task_has_option("destroy_event_with_id"):
        print(webstat.destroy_customevent(task_get_option("destroy_event_with_id")))
        sys.exit(0)

    elif task_has_option("list_events"):
        events = webstat._get_customevents()
        if not events:
            print("There are no custom events available.")
        else:
            print("Available custom events are:\n")
            print(
                "\n".join(
                    [
                        x[0] + ": " + ("No descriptive name" if x[1] is None else str(x[1]))
                        for x in events
                    ]
                )
            )
        sys.exit(0)

    elif task_has_option("cache_events"):
        events = task_get_option("cache_events")

        write_message(str(events), verbose=9)

        # Start from empty selections so that an empty first event name
        # falls through to "no events" instead of raising a NameError.
        keyevents_to_cache = []
        customevents_to_cache = []

        if events[0] == "ALL":
            keyevents_to_cache = list(webstat.KEYEVENT_REPOSITORY.keys())
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] == "KEYEVENTS":
            keyevents_to_cache = list(webstat.KEYEVENT_REPOSITORY.keys())

        elif events[0] == "CUSTOMEVENTS":
            customevents_to_cache = [x[0] for x in webstat._get_customevents()]

        elif events[0] != "":
            keyevents_to_cache = [x for x in webstat.KEYEVENT_REPOSITORY.keys() if x in events]
            # Compare the event id (first column), not the whole row.
            customevents_to_cache = [x[0] for x in webstat._get_customevents() if x[0] in events]

        # Control so that we have valid event names
        if not keyevents_to_cache and not customevents_to_cache:
            # Oops, no events. Abort and display help.
            return False

        task_set_option("keyevents", keyevents_to_cache)
        task_set_option("customevents", customevents_to_cache)
        return True

    elif task_has_option("dump_config"):
        print(
            """\
[general]
visitors_box = True
search_box = True
record_box = True
bibsched_box = True
basket_box = True
apache_box = True
uptime_box = True

[webstat_custom_event_1]
name = baskets
param1 = action
param2 = basket
param3 = user

[apache_log_analyzer]
profile = nil
nb-histogram-items-to-print = 20
exclude-ip-list = ("137.138.249.162")
home-collection = "Atlantis Institute of Fictive Science"
search-interface-url = "/?"
detailed-record-url = "/%s/"
search-engine-url = "/search?"
search-engine-url-old-style = "/search.py?"
basket-url = "/yourbaskets/"
add-to-basket-url = "/yourbaskets/add"
display-basket-url = "/yourbaskets/display"
display-public-basket-url = "/yourbaskets/display_public"
alert-url = "/youralerts/"
display-your-alerts-url = "/youralerts/list"
display-your-searches-url = "/youralerts/display"
"""
            % CFG_SITE_RECORD
        )
        sys.exit(0)

    elif task_has_option("load_config"):
        from ConfigParser import ConfigParser

        conf = ConfigParser()
        conf.read(CFG_WEBSTAT_CONFIG_PATH)
        for section in conf.sections():
            if not section.startswith("webstat_custom_event_"):
                continue
            cols = []
            name = ""
            for option, value in conf.items(section):
                if option == "name":
                    name = value
                if option.startswith("param"):
                    # add the column name in its position; read the whole
                    # numeric suffix so that param10 and above work too
                    index = int(option[5:]) - 1
                    while len(cols) <= index:
                        cols.append("")
                    cols[index] = value
            if name:
                res = run_sql("SELECT COUNT(id) FROM staEVENT WHERE id = %s", (name,))
                if res[0][0] == 0:
                    # name does not exist, create customevent
                    webstat.create_customevent(name, name, cols)
                else:
                    # name already exists, update customevent
                    webstat.modify_customevent(name, cols=cols)

        sys.exit(0)

    else:
        # False means that the --help should be displayed
        return False
## with a timestamp perm_file_fd, perm_file_name = \ mkstemp(suffix='.xml', prefix="refextract_%s_" % \ time.strftime("%Y-%m-%d_%H:%M:%S"), \ dir=os.path.join(CFG_TMPDIR, "refextract")) copyfile(daemon_cli_opts['xmlfile'], perm_file_name) os.close(perm_file_fd) except IOError, err: write_message("Error: Unable to copy content to timestamped XML file, %s" \ % err) return 0 ## Now, given the references have been output to option 'xmlfile' ## enrich the meta-data of the affected records, via bibupload ## Only if a named file was given as input if task_has_option('extraction-job'): cmd = "%s/bibupload -n -c '%s' " % (CFG_BINDIR, perm_file_name) errcode = 0 try: errcode = os.system(cmd) except OSError, exc: write_message('Error: Command %s failed [%s].' % (cmd, exc), stream=sys.stdout, verbose=0) if errcode != 0: write_message("Error: %s failed, error code is %d." % (cmd, errcode), stream=sys.stdout, verbose=0) return 0 ## Update the extraction_date for each record id, ## (only those which have been given to Refextract) if task_info['last_updated']: ## If the last updated time exists in the db.. update it
def _task_run_core():
    """calls extract_references in refextract

    The part of the task visible here translates the bibtask options into
    the refextract command-line option dictionary and gathers the fulltext
    arguments for the requested record ids.
    """

    def _append_recid_collection_list(collection, current_recids):
        """Updated list of recids with new recids from collection
        @param collection: (string) collection name to use to obtain record ids
        @param current_recids: (list) list of current record ids which have
        already been obtained from previous collection or recid flags
        @return: (list) current record ids with newly appended recids from
        input collection
        """
        for candidate in get_collection_reclist(collection):
            if candidate in current_recids:
                continue
            current_recids.append(candidate)
        return current_recids

    ## Defaults for every cli option refextract understands
    daemon_cli_opts = {
        'treat_as_reference_section': 0,
        'fulltext': [],
        'output_raw': 0,
        'verbosity': 0,
        'xmlfile': 0,
        'dictfile': 0,
        'inspire': 0,
        'kb-journal': 0,
        'kb-report-number': 0,
        'extraction-mode': 'ref',
        'authors': 0,
        'affiliations': 0,
        'treat_as_raw_section': 0,
    }

    ## holds the name of the extraction job, and if it's already in the db
    task_info = task_get_option('extraction-job')

    ## Now set the cli options, from the set task options list
    if task_has_option('verbose'):
        level = task_get_option('verbose')
        ## Anything that is not a digit string within 0..9 falls back to 0
        verbosity = 0
        if level.isdigit() and int(level) in xrange(0, 10):
            verbosity = int(level)
        daemon_cli_opts['verbosity'] = verbosity

    ## Boolean flags: task option name -> cli option switched to 1
    switches = (
        ('raw-references', 'treat_as_reference_section'),
        ('output-raw-refs', 'output_raw'),
        ('inspire', 'inspire'),
    )
    for task_key, cli_key in switches:
        if task_has_option(task_key):
            daemon_cli_opts[cli_key] = 1

    ## Valued options are copied over under the same name
    for shared_key in ('xmlfile', 'dictfile', 'kb-journal', 'kb-report-number'):
        if task_has_option(shared_key):
            daemon_cli_opts[shared_key] = task_get_option(shared_key)

    if task_get_option('recids'):
        ## Construct the fulltext argument equivalent from record id's
        ## (records, and arguments, which have valid files)
        try:
            fulltexts_for_collection = _get_fulltext_args_from_recids(
                task_get_option('recids'), task_info)
            daemon_cli_opts['fulltext'].extend(fulltexts_for_collection)
        except Exception as err:
            message = 'Error: Unable to obtain fulltexts for recid %s. %s' \
                      % (str(task_get_option('recids')), err)
            write_message(message, stream=sys.stdout, verbose=0)
            raise StandardError
def _task_submit_elaborate_specific_parameter(key, value, opts, args):
    """
    Must be defined for bibtask to create a task.

    Handles a single command-line option of a refextract task. The
    extraction-job option (-e) loads '<value>.cfg' from the 'bibedit'
    directory under CFG_ETCDIR and turns its parameters into task options;
    every other recognised option is stored as a task option directly.
    The extraction-job option and the other options are mutually exclusive.

    @param key: the option flag being processed (e.g. '-i' or '--recid')
    @param value: the value given for that option
    @param opts: not used by this function
    @param args: standalone command-line arguments; none are accepted
    @return: True when the option was accepted, False when it is invalid
             (an error message has then been written)
    @raise StandardError: when a standalone argument is present
    """
    if args:
        ## There should be no standalone arguments for any refextract job
        ## This will catch args before the job is shipped to Bibsched
        raise StandardError("Error: Unrecognised argument '%s'.\n" % args[0])

    ## Task name specified
    if key in ('-e', '--extraction-job'):

        ## Make sure that the user is not mixing job name with other defined
        ## Refextract flags on the command line
        if any(task_get_option(p) for p in possible_task_option_keys):
            write_message(
                "Error: cli and extraction-job extraction parameters specified together."
            )
            write_message(
                "The extraction-job flag cannot be mixed with other cli flags."
            )
            return False

        ## ---- Get the task file with this name
        task_file_dir = os.path.join(CFG_ETCDIR, 'bibedit')
        ## The job file name
        task_file = value + '.cfg'
        abs_path = os.path.join(task_file_dir, task_file)
        try:
            ## Open and readlines from file; 'with' closes the handle even
            ## when reading fails
            with open(abs_path, 'r') as file_hdl:
                file_params = file_hdl.readlines()
        except IOError:
            write_message("Error: Unable to read job file '%s'" %
                          abs_path, stream=sys.stdout, verbose=0)
            return False

        ## ---- Get the database 'last_updated' value for this name
        xtrJOB_row = _task_name_exists(value)

        ## Build the information for this extraction job
        ## These dictionaries will be extended with extra file parameters
        if xtrJOB_row:
            task_info = {
                'id': xtrJOB_row[0][0],
                'name': xtrJOB_row[0][1],
                'last_updated': xtrJOB_row[0][2],
                'collections': [],
                'recids': [],
            }
        else:
            ## Save the name as the input argument for this job
            task_info = {
                'name': value,
                'last_updated': None,
                'collections': [],
                'recids': [],
            }

        ## ---- Save job parameters
        for p in file_params:
            p = p.strip()
            ## Ignore comments and titles, and skip blank lines
            if (not p) or p.startswith('#') or p.startswith("["):
                continue
            ## Split arguments just once
            p_args = [part.strip() for part in p.split("=", 1)]
            param_name = p_args[0]
            ## A list parameter written without '=' counts as an empty list
            param_value = ''
            if len(p_args) == 2:
                param_value = p_args[1]

            ## Check cfg file param against list of valid params
            if param_name not in CFG_REFEXTRACT_JOB_FILE_PARAMS:
                write_message("Error: Unknown task param '%s' inside '%s'."
                              % (param_name, task_file),
                              stream=sys.stdout, verbose=0)
                return False

            if param_name == 'collection':
                ## Separate and strip collections
                collections = [c.strip() for c in param_value.split(',')]
                task_info['collections'].extend([c for c in collections if c])
            #FIXME add author extraction functionality
            # elif param_name == 'extraction-mode':
            #     if param_name == 'authors':
            #         task_set_option('authors', param_value)
            elif param_name == 'recid':
                recids = param_value.split(",")
                task_info['recids'].extend([r for r in recids if r.strip()])
            elif len(p_args) == 2:
                ## All other flags
                task_info[param_name] = param_value
            else:
                ## Standalone flag
                task_info[param_name] = 1

        if 'xmlfile' not in task_info:
            task_info['xmlfile'] = _generate_default_xml_out()

        ## Used to flag the creation of a bibupload task
        task_set_option('extraction-job', task_info)

        ## using the extraction-job options...
        ## set the task options
        ## (the loop variable is deliberately not called 'value', so the
        ## function parameter is not overwritten)
        for option, option_value in task_info.items():
            if option == 'collections':
                for collection in option_value:
                    collection_row = _collection_exists(collection)
                    if not collection_row:
                        write_message(
                            "Error: '%s' is not a valid collection." %
                            collection, stream=sys.stdout, verbose=0)
                        return False
                    ## Use the collection name matched from the database
                    task_get_option(option).append(collection_row[0][0])
            elif option == 'recids':
                for recid in option_value:
                    if not _recid_exists(recid):
                        write_message("Error: '%s' is not a valid record id."
                                      % recid, stream=sys.stdout, verbose=0)
                        return False
                    ## Add this valid record id to the list of record ids
                    task_get_option(option).append(recid)
            elif option not in ('id', 'name', 'last_updated'):
                ## Usual way of setting options, but this time from the
                ## extraction-job file
                task_set_option(option, option_value)

    else:
        ## Quick check to see if an extraction job has also been specified
        if task_has_option('extraction-job'):
            write_message(
                "Error: cli and extraction-job extraction parameters specified together."
            )
            write_message(
                "The extraction-job flag cannot be mixed with other cli flags."
            )
            return False

        # Recid option
        elif key in ("-i", "--recid"):
            split_recids = value.split(":")
            if len(split_recids) == 2:
                first = last = valid_range = None
                try:
                    first = int(split_recids[0])
                    last = int(split_recids[1])
                    valid_range = first < last
                except ValueError:
                    write_message(
                        "Error: Range values for --recid must be integers, "
                        "not '%s'." % value, stream=sys.stdout, verbose=0)
                if first is None or last is None:
                    return False
                if not _recid_exists(first) or not _recid_exists(
                        last) or not valid_range:
                    write_message(
                        "Error: '%s' is not a valid range of record ID's." %
                        value, stream=sys.stdout, verbose=0)
                    return False
                # Both bounds were validated above, so the upper bound is
                # part of the range (range() itself excludes its stop).
                task_get_option('recids').extend(range(first, last + 1))
            else:
                int_val = None
                try:
                    int_val = int(value)
                except ValueError:
                    write_message(
                        "Error: The value specified for --recid must be a "
                        "valid integer, not '%s'." % value,
                        stream=sys.stdout, verbose=0)
                # Test the integer check first so a non-integer value is
                # never used to query for the record.
                if int_val is None or not _recid_exists(value):
                    write_message("Error: '%s' is not a valid record ID." %
                                  value, stream=sys.stdout, verbose=0)
                    return False
                task_get_option('recids').append(value)

        # Collection option
        elif key in ("-c", "--collection"):
            collection_row = _collection_exists(value)
            if not collection_row:
                write_message("Error: '%s' is not a valid collection."
                              % value, stream=sys.stdout, verbose=0)
                return False
            task_get_option('collections').append(collection_row[0][0])
        elif key in ('-z', '--raw-references'):
            task_set_option('raw-references', True)
        elif key in ('-r', '--output-raw-refs'):
            task_set_option('output-raw-refs', True)
        elif key in ('-x', '--xmlfile'):
            task_set_option('xmlfile', value)
        elif key in ('-d', '--dictfile'):
            task_set_option('dictfile', value)
        elif key in ('-p', '--inspire'):
            task_set_option('inspire', True)
        elif key in ('-j', '--kb-journal'):
            task_set_option('kb-journal', value)
        elif key in ('-n', '--kb-report-number'):
            task_set_option('kb-report-number', value)

    return True