def get_recids_for_rules(rules):
    """Generate the final set of record IDs to check for each rule.

    @param rules: dict of rules {rule_name: rule_dict}
    @type rules: dict
    @return: dict {rule_name: intbitset of record IDs}

    A rule may restrict the candidate records with a 'filter_pattern'
    query (optionally scoped by 'filter_collection'); otherwise every
    record is a candidate.  The candidates are then intersected either
    with the explicit --record_ids task option or with the records
    modified since the rule last ran.
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" in rule:
            query = rule["filter_pattern"]
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            if collections:
                result = perform_request_search(
                    p=query,
                    of='intbitset',
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                    c=collections)
            else:
                result = search_pattern(
                    p=query,
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                )
        else:
            # No filter: start from the universe of all record IDs.
            result = intbitset(trailing_bits=True)

        if override_record_ids is not None:
            # Explicit record list given on the command line wins.
            result.intersection_update(override_record_ids)
        else:
            # Only consider records modified since this rule last ran.
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if "consider_deleted_records" not in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         type='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             type='e')
            result.intersection_update(modified_recids)
        recids[rule_name] = result

    return recids
def get_recids_for_rules(rules):
    """Generate the final set of record IDs to check for each rule.

    @param rules: dict of rules {rule_name: rule_dict}
    @type rules: dict
    @return: dict {rule_name: intbitset of record IDs}

    A rule may restrict the candidate records with a 'filter_pattern'
    query (optionally scoped by 'filter_collection'); otherwise every
    record is a candidate.  The candidates are then intersected either
    with the explicit --record_ids task option or with the records
    modified since the rule last ran.
    """
    override_record_ids = task_get_option("record_ids")
    recids = {}
    for rule_name, rule in rules.iteritems():
        if "filter_pattern" in rule:
            query = rule["filter_pattern"]
            if "filter_collection" in rule:
                collections = rule["filter_collection"].split()
            else:
                collections = None
            write_message("Performing given search query: '%s'" % query)
            if collections:
                result = perform_request_search(
                    p=query,
                    of='intbitset',
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                    c=collections)
            else:
                result = search_pattern(
                    p=query,
                    wl=rule.get('filter_limit', 0),
                    f=rule.get('filter_field', None),
                )
        else:
            # No filter: start from the universe of all record IDs.
            result = intbitset(trailing_bits=True)

        if override_record_ids is not None:
            # Explicit record list given on the command line wins.
            result.intersection_update(override_record_ids)
        else:
            # Only consider records modified since this rule last ran.
            last_run = get_rule_lastrun(rule_name)
            modified_recids = get_modified_records_since(last_run)
            if "consider_deleted_records" not in rule:
                modified_recids -= search_unit_in_bibxxx(p='DELETED',
                                                         f='980__%',
                                                         m='e')
                if CFG_CERN_SITE:
                    modified_recids -= search_unit_in_bibxxx(p='DUMMY',
                                                             f='980__%',
                                                             m='e')
            result.intersection_update(modified_recids)
        recids[rule_name] = result

    return recids
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Pair each collection with its ontology and the records to analyze.

    @param recids: explicit record IDs chosen by the user (takes priority)
    @param collections: explicit collection names chosen by the user
    @param taxonomy: ontology to use for explicit recids/collections
    @return: list of dicts with keys 'ontology', 'collection', 'recIDs'

    When neither recids nor collections are given, the pairings come
    from the collection_clsMETHOD table, and only records modified
    since the ontology's last run are kept (unless --force is set).
    """
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    # Identifiers are double-quoted for case-sensitive SQL backends.
    # Fixed: the original literal read `clsMETHOD".id` with an unbalanced
    # quote, which is invalid SQL.
    result = run_sql(
        'SELECT "clsMETHOD".name, "clsMETHOD".last_updated, '
        'collection.name FROM "clsMETHOD" JOIN "collection_clsMETHOD" ON '
        '"clsMETHOD".id="id_clsMETHOD" JOIN collection ON '
        'id_collection=collection.id')

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                bibtask.write_message(
                    "INFO: Collection %s has not been previously "
                    "analyzed." % collection,
                    stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif bibtask.task_get_option('force'):
                bibtask.write_message(
                    "INFO: Analysis is forced for collection %s." %
                    collection,
                    stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = bibtask.get_modified_records_since(
                    date_last_run)

            # Keep only records changed since the last run for this ontology.
            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                bibtask.write_message(
                    "WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr, verbose=2)
        else:
            bibtask.write_message(
                "ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection,),
                stream=sys.stderr, verbose=0)

    return rec_onts
def _get_recids_foreach_ontology(recids=None, collections=None, taxonomy=None):
    """Pair each collection with its ontology and the records to analyze.

    @param recids: explicit record IDs chosen by the user (takes priority)
    @param collections: explicit collection names chosen by the user
    @param taxonomy: ontology to use for explicit recids/collections
    @return: list of dicts with keys 'ontology', 'collection', 'recIDs'

    When neither recids nor collections are given, the pairings come
    from the collection_clsMETHOD table, and only records modified
    since the ontology's last run are kept (unless --force is set).
    """
    rec_onts = []

    # User specified record IDs.
    if recids:
        rec_onts.append({
            'ontology': taxonomy,
            'collection': None,
            'recIDs': recids,
        })
        return rec_onts

    # User specified collections.
    if collections:
        for collection in collections:
            records = get_collection_reclist(collection)
            if records:
                rec_onts.append({
                    'ontology': taxonomy,
                    'collection': collection,
                    'recIDs': records
                })
        return rec_onts

    # Use rules found in collection_clsMETHOD.
    # Identifiers are double-quoted for case-sensitive SQL backends.
    # Fixed: the original literal read `clsMETHOD".id` with an unbalanced
    # quote, which is invalid SQL.
    result = run_sql(
        'SELECT "clsMETHOD".name, "clsMETHOD".last_updated, '
        'collection.name FROM "clsMETHOD" JOIN "collection_clsMETHOD" ON '
        '"clsMETHOD".id="id_clsMETHOD" JOIN collection ON '
        'id_collection=collection.id')

    for ontology, date_last_run, collection in result:
        records = get_collection_reclist(collection)
        if records:
            if not date_last_run:
                bibtask.write_message(
                    "INFO: Collection %s has not been previously "
                    "analyzed." % collection,
                    stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            elif bibtask.task_get_option('force'):
                bibtask.write_message(
                    "INFO: Analysis is forced for collection %s." %
                    collection,
                    stream=sys.stderr, verbose=3)
                modified_records = intbitset(run_sql("SELECT id FROM bibrec"))
            else:
                modified_records = bibtask.get_modified_records_since(
                    date_last_run)

            # Keep only records changed since the last run for this ontology.
            records &= modified_records
            if records:
                rec_onts.append({
                    'ontology': ontology,
                    'collection': collection,
                    'recIDs': records
                })
            else:
                bibtask.write_message(
                    "WARNING: All records from collection '%s' have "
                    "already been analyzed for keywords with ontology '%s' "
                    "on %s." % (collection, ontology, date_last_run),
                    stream=sys.stderr, verbose=2)
        else:
            bibtask.write_message(
                "ERROR: Collection '%s' doesn't contain any record. "
                "Cannot analyse keywords." % (collection, ),
                stream=sys.stderr, verbose=0)

    return rec_onts