def get_recids_for_set_spec(set_spec):
    """
    Returns the recids (as HitSet) belonging to the given OAI set.

    Every definition returned by get_set_definitions(set_spec) is run
    as a search, and the results of all the searches are merged.

    Parameters:

      set_spec - *str* the set_spec for which we would like to get the
                 recids
    """
    # Search arguments that are copied verbatim from a set definition
    passthrough_keys = ('p1', 'f1', 'm1', 'op1',
                        'p2', 'f2', 'm2', 'op2',
                        'p3', 'f3', 'm3')

    matched = HitSet()
    for definition in get_set_definitions(set_spec):
        # 'c' holds a comma-separated list of collection names
        collections = [name.strip() for name in definition['c'].split(',')]
        search_args = dict([(key, definition[key])
                            for key in passthrough_keys])
        search_args['c'] = collections
        search_args['ap'] = 0
        found = perform_request_search(**search_args)
        matched = matched.union(HitSet(found))

    return matched
def oairepositoryupdater_task():
    """Main business logic code of oai_archive

    Collects the records matching the OAI set definitions together
    with the records found by searching CFG_OAI_ID_FIELD for
    'oai:<CFG_OAI_ID_PREFIX>:*', then writes to a temporary file in
    CFG_TMPDIR one MARCXML <record> for each record whose OAI
    identifier is missing or whose OAI sets differ from the set
    definitions.  Unless the "no_upload" task option is set, the file
    is then passed to bibupload (or removed when no record had to be
    written).

    When the "report" task option is greater than 1 the repository
    status is printed instead and nothing else is done.

    Returns True.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    task_update_progress("Fetching records to process")

    # Build the list of records to be processed, that is, search for
    # the records that match one of the search queries defined in OAI
    # Repository admin interface.
    recids_for_set = {} # Remember exactly which record belongs to which set
    recids = HitSet() # "Flat" set of the recids_for_set values
    for set_spec in all_set_specs():
        task_sleep_now_if_required(can_stop_too=True)
        _recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = _recids
        recids = recids.union(_recids)

    # Also get the list of records that are currently exported through
    # OAI and that might need to be refreshed
    oai_recids = perform_request_search(c=CFG_SITE_NAME,
                                        p1='oai:%s:*' % CFG_OAI_ID_PREFIX,
                                        f1=CFG_OAI_ID_FIELD,
                                        m1="e",
                                        ap=0)
    recids = recids.union(HitSet(oai_recids))

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    has_updated_records = False
    try:
        # All the records go under a single root element, so that the
        # file stays well-formed XML whatever the number of records.
        oai_out.write('<collection>')

        # Iterate over the recids
        nb_recids = len(recids) # recids is not modified inside the loop
        i = 0
        for recid in recids:
            i += 1
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Done %s out of %s records." % \
                                 (i, nb_recids))

            # Always prepare the OAI identifier subfield: the datafield
            # holding it is written out again below, so omitting the
            # subfield for records that already have an identifier
            # would leave it out of the generated file.
            oai_id_entry = "<subfield code=\"%s\">oai:%s:%s</subfield>\n" % \
                           (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid)

            # Check if an OAI identifier is already in the record or
            # not.
            oai_ids = [_oai_id for _oai_id in \
                       get_fieldvalues(recid, CFG_OAI_ID_FIELD) \
                       if _oai_id.strip() != '']
            already_has_oai_id = len(oai_ids) > 0

            # Get the sets to which this record already belongs according
            # to the metadata
            current_oai_sets = set(\
                [_oai_set for _oai_set in \
                 get_fieldvalues(recid, CFG_OAI_SET_FIELD) \
                 if _oai_set.strip() != ''])

            # Get the sets that should be in this record according to
            # settings (empty set specs are ignored)
            updated_oai_sets = set(\
                [_set for _set, _recids in recids_for_set.iteritems()
                 if _set and recid in _recids])

            # Ok, we have the old sets and the new sets. If they are equal
            # and oai ID does not need to be added, then great, nothing to
            # change . Otherwise apply the new sets.
            if current_oai_sets == updated_oai_sets and already_has_oai_id:
                continue # Jump to next recid

            has_updated_records = True

            # Generate the xml sets entry
            oai_set_entry = '\n'.join(["<subfield code=\"%s\">%s</subfield>" % \
                                       (CFG_OAI_SET_FIELD[5:6], _oai_set) \
                                       for _oai_set in updated_oai_sets]) + \
                                       "\n"

            # Also get all the datafields with tag and indicator matching
            # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with
            # subcode != CFG_OAI_SET_FIELD[5:6] and subcode !=
            # CFG_OAI_ID_FIELD[5:6], so that we can preserve these values
            other_data = marcxml_filter_out_tags(recid, [CFG_OAI_SET_FIELD,
                                                         CFG_OAI_ID_FIELD])

            oai_out.write("<record>\n")
            oai_out.write("<controlfield tag=\"001\">%s"
                          "</controlfield>\n" % recid)
            oai_out.write(DATAFIELD_ID_HEAD)
            oai_out.write("\n")
            oai_out.write(oai_id_entry)
            if CFG_OAI_ID_FIELD[0:5] != CFG_OAI_SET_FIELD[0:5]:
                # Set and OAI ID live in different datafields: close
                # the ID datafield and open the one for the sets.
                oai_out.write("</datafield>\n")
                oai_out.write(DATAFIELD_SET_HEAD)
                oai_out.write("\n")
            # Otherwise set and OAI ID share the same datafield
            oai_out.write(oai_set_entry)
            oai_out.write("</datafield>\n")
            oai_out.write(other_data)
            oai_out.write("</record>\n")

        oai_out.write('</collection>')
    finally:
        # Do not leak the file descriptor if anything above fails
        oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        # The file always contains the <collection> element, so its
        # size cannot tell whether there is something to upload.
        if has_updated_records:
            command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR, filename)
            os.system(command)
        else:
            os.remove(filename)

    return True
def print_repository_status(write_message=write_message,
                            verbose=0):
    """
    Prints the repository status through the given write_message
    function.

    Parameters:

      write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers can be
                            wrong if the repository is in some inconsistent
                            state, i.e. a record is in an OAI setSpec but
                            has not OAI ID)
                       - 2: print detailed status of repository, with
                            number of records that needs to be
                            synchronized according to the sets
                            definitions. Precise, but ~slow...
    """
    repository_size_s = "%d" % repository_size()
    # Union of the records that should be exported, over all the sets
    # (only filled in when verbose > 1)
    repository_recids_after_update = HitSet()

    write_message(CFG_SITE_NAME)
    write_message(" OAI Repository Status")

    set_spec_max_length = 19 # How many max char do we display for
    set_name_max_length = 20 # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        write_message(" Total(**)" + " " * 29 +
                      " " * (9 - len(repository_size_s)) + repository_size_s)
        return
    elif verbose == 1:
        # We display few information: show longer set name and spec
        set_spec_max_length = 30
        set_name_max_length = 30

    write_message("=" * 80)
    header = " setSpec" + " " * (set_spec_max_length - 7) + \
             " setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    write_message(header)

    if verbose > 1:
        write_message(" " * 57 + "Additions Deletions")

    write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0)
            nb_current_recids = len(current_recids)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0, op1="a",
                                                    p2="oai:*",
                                                    f2=CFG_OAI_ID_FIELD,
                                                    m2="e")
            nb_current_recids = len(current_recids)
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update = repository_recids_after_update.union(should_recids)

            nb_add_recids = len(HitSet(should_recids).difference(HitSet(current_recids)))
            nb_remove_recids = len(HitSet(current_recids).difference(HitSet(should_recids)))
            nb_should_recids = len(should_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length :
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length :
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        row = " " + set_spec_str + \
              " " * ((set_spec_max_length + 2) - len(set_spec_str)) + set_name_str + \
              " " * ((set_name_max_length + 2) - len(set_name_str)) + \
              " " * (7 - len(str(nb_current_recids))) + str(nb_current_recids)
        if verbose > 1:
            row += \
                " " * max(9 - len(str(nb_add_recids)), 0) + '+' + str(nb_add_recids) + \
                " " * max(7 - len(str(nb_remove_recids)), 0) + '-' + str(nb_remove_recids) + " = " +\
                " " * max(7 - len(str(nb_should_recids)), 0) + str(nb_should_recids)
        write_message(row)

    write_message("=" * 80)
    footer = " Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             " " * (9 - len(repository_size_s)) + repository_size_s
    if verbose > 1:
        # Computed here rather than inside the loop, so that the value
        # is also defined when there is no set at all.
        nb_recids_after_update = len(repository_recids_after_update)
        footer += ' ' * (28 - len(str(nb_recids_after_update))) + str(nb_recids_after_update)
    write_message(footer)

    if verbose > 1:
        write_message(' *The "after update" columns show the repository after you run this tool.')
    else:
        write_message(' *"Volume" is indicative if repository is out of sync. Use --detailed-report.')

    write_message('**The "total" is not the sum of the above numbers, but the union of the records.')
def oairepositoryupdater_task():
    """Main business logic code of oai_archive

    Gathers the records matching the OAI set definitions plus the
    records found by searching CFG_OAI_ID_FIELD for
    'oai:<CFG_OAI_ID_PREFIX>:*'.  For each of them whose OAI
    identifier is missing, or whose OAI sets differ from the set
    definitions, a MARCXML <record> is appended to a <collection>
    written in a temporary file under CFG_TMPDIR.  Unless the
    "no_upload" task option is set, the file is then passed to
    bibupload, or removed when no record had to be written.

    When the "report" task option is greater than 1 the repository
    status is printed instead and nothing else is done.

    Returns True.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    task_update_progress("Fetching records to process")

    # Build the list of records to be processed, that is, search for
    # the records that match one of the search queries defined in OAI
    # Repository admin interface.
    recids_for_set = {} # Remember exactly which record belongs to which set
    recids = HitSet() # "Flat" set of the recids_for_set values
    for set_spec in all_set_specs():
        task_sleep_now_if_required(can_stop_too=True)
        _recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = _recids
        recids = recids.union(_recids)

    # Also get the list of records that are currently exported through
    # OAI and that might need to be refreshed
    oai_recids = perform_request_search(c=CFG_SITE_NAME,
                                        p1='oai:%s:*' % CFG_OAI_ID_PREFIX,
                                        f1=CFG_OAI_ID_FIELD,
                                        m1="e",
                                        ap=0)
    recids = recids.union(HitSet(oai_recids))

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    has_updated_records = False
    # Whether set and OAI ID are stored in the same datafield (same
    # tag and indicators); this does not depend on the record.
    same_datafield = CFG_OAI_ID_FIELD[0:5] == CFG_OAI_SET_FIELD[0:5]
    try:
        oai_out.write('<collection>')

        # Iterate over the recids
        nb_recids = len(recids) # recids is not modified inside the loop
        i = 0
        for recid in recids:
            i += 1
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Done %s out of %s records." % \
                                 (i, nb_recids))

            # The OAI identifier subfield is written for every updated
            # record, whether or not the record already has one.
            oai_id_entry = "<subfield code=\"%s\">oai:%s:%s</subfield>\n" % \
                           (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid)

            # Check if an OAI identifier is already in the record or
            # not.
            oai_ids = [_oai_id for _oai_id in \
                       get_fieldvalues(recid, CFG_OAI_ID_FIELD) \
                       if _oai_id.strip() != '']
            already_has_oai_id = len(oai_ids) > 0

            # Get the sets to which this record already belongs according
            # to the metadata
            current_oai_sets = set(\
                [_oai_set for _oai_set in \
                 get_fieldvalues(recid, CFG_OAI_SET_FIELD) \
                 if _oai_set.strip() != ''])

            # Get the sets that should be in this record according to
            # settings (empty set specs are ignored)
            updated_oai_sets = set(\
                [_set for _set, _recids in recids_for_set.iteritems()
                 if recid in _recids and _set])

            # Ok, we have the old sets and the new sets. If they are equal
            # and oai ID does not need to be added, then great, nothing to
            # change . Otherwise apply the new sets.
            if current_oai_sets == updated_oai_sets and already_has_oai_id:
                continue # Jump to next recid

            has_updated_records = True

            # Generate the xml sets entry (updated_oai_sets holds no
            # empty set spec, so no further filtering is needed)
            oai_set_entry = '\n'.join(["<subfield code=\"%s\">%s</subfield>" % \
                                       (CFG_OAI_SET_FIELD[5:6], _oai_set) \
                                       for _oai_set in updated_oai_sets]) + \
                                       "\n"

            # Also get all the datafields with tag and indicator matching
            # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with
            # subcode != CFG_OAI_SET_FIELD[5:6] and subcode !=
            # CFG_OAI_ID_FIELD[5:6], so that we can preserve these values
            other_data = marcxml_filter_out_tags(
                recid, [CFG_OAI_SET_FIELD, CFG_OAI_ID_FIELD])

            pieces = ["<record>\n",
                      "<controlfield tag=\"001\">%s"
                      "</controlfield>\n" % recid,
                      DATAFIELD_ID_HEAD,
                      "\n",
                      oai_id_entry]
            if not same_datafield:
                # Close the OAI ID datafield and open a separate one
                # for the sets
                pieces.extend(["</datafield>\n",
                               DATAFIELD_SET_HEAD,
                               "\n"])
            pieces.extend([oai_set_entry,
                           "</datafield>\n",
                           other_data,
                           "</record>\n"])
            oai_out.writelines(pieces)

        oai_out.write('</collection>')
    finally:
        # Do not leak the file descriptor if anything above fails
        oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if has_updated_records:
            command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR, filename)
            os.system(command)
        else:
            # Nothing but the empty <collection> element was written
            os.remove(filename)

    return True
def print_repository_status(write_message=write_message,
                            verbose=0):
    """
    Prints the repository status through the given write_message
    function.

    Parameters:

      write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers can be
                            wrong if the repository is in some inconsistent
                            state, i.e. a record is in an OAI setSpec but
                            has not OAI ID)
                       - 2: print detailed status of repository, with
                            number of records that needs to be
                            synchronized according to the sets
                            definitions. Precise, but ~slow...
    """
    repository_size_s = "%d" % repository_size()
    # Union of the records that should be exported, over all the sets
    # (only filled in when verbose > 1)
    repository_recids_after_update = HitSet()

    write_message(CFG_SITE_NAME)
    write_message(" OAI Repository Status")

    set_spec_max_length = 19 # How many max char do we display for
    set_name_max_length = 20 # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        write_message(" Total(**)" + " " * 29 + repository_size_s.rjust(9))
        return
    elif verbose == 1:
        # We display few information: show longer set name and spec
        set_spec_max_length = 30
        set_name_max_length = 30

    detailed = verbose > 1

    write_message("=" * 80)
    header = " setSpec" + " " * (set_spec_max_length - 7) + \
             " setName" + " " * (set_name_max_length - 5) + " Volume"
    if detailed:
        header += " " * 5 + "After update(*):"
    write_message(header)

    if detailed:
        write_message(" " * 57 + "Additions Deletions")

    write_message("-" * 80)

    for set_spec in all_set_specs():

        if not detailed:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0)
            nb_current_recids = len(current_recids)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0, op1="a",
                                                    p2="oai:*",
                                                    f2=CFG_OAI_ID_FIELD,
                                                    m2="e")
            nb_current_recids = len(current_recids)
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update = \
                repository_recids_after_update.union(should_recids)

            nb_add_recids = len(
                HitSet(should_recids).difference(HitSet(current_recids)))
            nb_remove_recids = len(
                HitSet(current_recids).difference(HitSet(should_recids)))
            nb_should_recids = len(should_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length:
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length:
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        # ljust/rjust pad with spaces and never truncate, which is
        # what the columns need when a value is wider than expected
        row = " " + set_spec_str.ljust(set_spec_max_length + 2) + \
              set_name_str.ljust(set_name_max_length + 2) + \
              str(nb_current_recids).rjust(7)
        if detailed:
            # The sign comes after the padding, hence the extra column
            row += ('+' + str(nb_add_recids)).rjust(10) + \
                   ('-' + str(nb_remove_recids)).rjust(8) + " = " + \
                   str(nb_should_recids).rjust(7)
        write_message(row)

    write_message("=" * 80)
    footer = " Total(**)" + \
             " " * (set_spec_max_length + set_name_max_length - 7) + \
             repository_size_s.rjust(9)
    if detailed:
        # Computed here rather than inside the loop, so that the value
        # is also defined when there is no set at all.
        nb_recids_after_update = len(repository_recids_after_update)
        footer += str(nb_recids_after_update).rjust(28)
    write_message(footer)

    if detailed:
        write_message(
            ' *The "after update" columns show the repository after you run this tool.'
        )
    else:
        write_message(
            ' *"Volume" is indicative if repository is out of sync. Use --detailed-report.'
        )

    write_message(
        '**The "total" is not the sum of the above numbers, but the union of the records.'
    )