Пример #1
0
def get_recids_for_set_spec(set_spec):
    """
    Collect the recids of the records that belong to an OAI set.

    Every definition returned by get_set_definitions() for 'set_spec'
    is run as a search query and the results of all the queries are
    merged together.

    Parameters:

      set_spec - *str* the set_spec for which the recids are wanted

    Returns a HitSet with the union of the recids matched by the
    definitions (an empty HitSet when there is no definition).
    """
    matched = HitSet()

    for definition in get_set_definitions(set_spec):
        # The collections are stored as one comma-separated string
        collections = [name.strip() for name in definition['c'].split(',')]

        hits = perform_request_search(
            c=collections,
            p1=definition['p1'],
            f1=definition['f1'],
            m1=definition['m1'],
            op1=definition['op1'],
            p2=definition['p2'],
            f2=definition['f2'],
            m2=definition['m2'],
            op2=definition['op2'],
            p3=definition['p3'],
            f3=definition['f3'],
            m3=definition['m3'],
            ap=0)

        matched = matched.union(HitSet(hits))

    return matched
Пример #2
0
def get_recids_for_set_spec(set_spec):
    """
    Collect the recids of the records that belong to an OAI set.

    Every definition returned by get_set_definitions() for 'set_spec'
    is run as a search query and the results of all the queries are
    merged together.

    Parameters:

      set_spec - *str* the set_spec for which the recids are wanted

    Returns a HitSet with the union of the recids matched by the
    definitions (an empty HitSet when there is no definition).
    """
    matched = HitSet()

    for definition in get_set_definitions(set_spec):
        # The collections are stored as one comma-separated string
        collections = [name.strip() for name in definition['c'].split(',')]

        hits = perform_request_search(
            c=collections,
            p1=definition['p1'],
            f1=definition['f1'],
            m1=definition['m1'],
            op1=definition['op1'],
            p2=definition['p2'],
            f2=definition['f2'],
            m2=definition['m2'],
            op2=definition['op2'],
            p3=definition['p3'],
            f3=definition['f3'],
            m3=definition['m3'],
            ap=0)

        matched = matched.union(HitSet(hits))

    return matched
Пример #3
0
def oairepositoryupdater_task():
    """
    Main business logic code of oai_archive.

    When the "report" task option is greater than 1 the repository
    status is printed and nothing else is done. Otherwise the task:

      1. searches the records matching the definitions of every OAI
         set, plus the records found by searching CFG_OAI_ID_FIELD for
         'oai:<CFG_OAI_ID_PREFIX>:*';
      2. writes to a temporary file in CFG_TMPDIR, for each record
         whose OAI ID is missing or whose OAI sets differ from the
         computed ones, a <record> with the new OAI fields;
      3. unless the "no_upload" task option is set, runs
         'bibupload -c' on that file, or removes the file when it is
         empty.

    Returns True in all cases.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    task_update_progress("Fetching records to process")

    # Build the list of records to be processed, that is, search for
    # the records that match one of the search queries defined in OAI
    # Repository admin interface.
    recids_for_set = {} # Remember exactly which record belongs to which set
    recids = HitSet() # "Flat" set of the recids_for_set values
    for set_spec in all_set_specs():
        task_sleep_now_if_required(can_stop_too=True)
        _recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = _recids
        recids = recids.union(_recids)

    # Also get the list of records that are currently exported through
    # OAI and that might need to be refreshed
    oai_recids = perform_request_search(c=CFG_SITE_NAME,
                                        p1='oai:%s:*' % CFG_OAI_ID_PREFIX,
                                        f1=CFG_OAI_ID_FIELD,
                                        m1="e", ap=0)
    recids = recids.union(HitSet(oai_recids))

    # Prepare to save results in a tmp file
    file_prefix = 'oairepository_' + \
                  time.strftime("%Y%m%d_%H%M%S_", time.localtime())
    (fd, filename) = mkstemp(dir=CFG_TMPDIR, prefix=file_prefix)
    oai_out = os.fdopen(fd, "w")

    # Loop invariants: the set of recids is not modified while iterating
    nb_recids = len(recids)
    same_datafield = CFG_OAI_ID_FIELD[0:5] == CFG_OAI_SET_FIELD[0:5]

    # The try/finally guarantees that the file is closed even when the
    # loop is interrupted by an exception.
    try:
        # Iterate over the recids
        for position, recid in enumerate(recids):
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Done %s out of %s records." % \
                                 (position + 1, nb_recids))

            # Check if an OAI identifier is already in the record or
            # not. The entry stays empty when the record has one.
            oai_id_entry = ""
            oai_ids = [_oai_id for _oai_id in \
                       get_fieldvalues(recid, CFG_OAI_ID_FIELD) \
                       if _oai_id.strip() != '']
            if not oai_ids:
                oai_id_entry = \
                    "<subfield code=\"%s\">oai:%s:%s</subfield>\n" % \
                    (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid)

            # Get the sets to which this record already belongs according
            # to the metadata
            current_oai_sets = set(\
                [_oai_set for _oai_set in \
                 get_fieldvalues(recid, CFG_OAI_SET_FIELD) \
                 if _oai_set.strip() != ''])

            # Get the sets that should be in this record according to
            # settings
            updated_oai_sets = set(\
                [_set for _set, _recids in recids_for_set.items()
                 if recid in _recids])

            # Ok, we have the old sets and the new sets. If they are equal
            # and oai ID does not need to be added, then great, nothing to
            # change . Otherwise apply the new sets.
            if current_oai_sets == updated_oai_sets and not oai_id_entry:
                continue # Jump to next recid

            # Generate the xml sets entry. It always ends with a
            # newline, so it is never an empty string.
            oai_set_entry = '\n'.join(
                ["<subfield code=\"%s\">%s</subfield>" % \
                 (CFG_OAI_SET_FIELD[5:6], _oai_set) \
                 for _oai_set in updated_oai_sets]) + "\n"

            # Also get all the datafields with tag and indicator matching
            # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with
            # subcode != CFG_OAI_SET_FIELD[5:6] and subcode !=
            # CFG_OAI_ID_FIELD[5:6], so that we can preserve these values
            other_data = marcxml_filter_out_tags(recid, [CFG_OAI_SET_FIELD,
                                                         CFG_OAI_ID_FIELD])

            # NOTE(review): when the record already has an OAI ID,
            # oai_id_entry is empty and no ID subfield is written below,
            # while marcxml_filter_out_tags() is asked to leave out
            # CFG_OAI_ID_FIELD -- confirm that the existing ID is not
            # lost when the file is uploaded.
            oai_out.write("<record>\n")
            oai_out.write("<controlfield tag=\"001\">%s"
                          "</controlfield>\n" % recid)
            if same_datafield:
                # Put set and OAI ID in the same datafield
                oai_out.write(DATAFIELD_ID_HEAD)
                oai_out.write("\n")
                oai_out.write(oai_id_entry)
                oai_out.write(oai_set_entry)
                oai_out.write("</datafield>\n")
            else:
                if oai_id_entry:
                    oai_out.write(DATAFIELD_ID_HEAD)
                    oai_out.write("\n")
                    oai_out.write(oai_id_entry)
                    oai_out.write("</datafield>\n")
                # The set datafield is always written (oai_set_entry is
                # never empty, see above), even when it has no subfield.
                oai_out.write(DATAFIELD_SET_HEAD)
                oai_out.write("\n")
                oai_out.write(oai_set_entry)
                oai_out.write("</datafield>\n")
            oai_out.write(other_data)
            oai_out.write("</record>\n")
    finally:
        oai_out.close()

    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        # Check if file is empty or not:
        if os.path.getsize(filename) > 0:
            command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR, filename)
            exit_status = os.system(command)
            if exit_status != 0:
                # Do not fail silently when the upload could not be run
                write_message("bibupload exited with status %s for file %s" % \
                              (exit_status, filename))
        else:
            os.remove(filename)

    return True
Пример #4
0
def print_repository_status(write_message=write_message,
                            verbose=0):
    """
    Prints the repository status to the standard output.

    Parameters:

      write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers
                         can be wrong if the repository is in some
                         inconsistent state, i.e. a record is in an
                         OAI setSpec but has not OAI ID)
                       - 2: print detailed status of repository, with
                         number of records that needs to be
                         synchronized according to the sets
                         definitions. Precise, but ~slow...

    Nothing is returned: every line is passed to 'write_message'.
    """
    repository_size_s = "%d" % repository_size()
    repository_recids_after_update = HitSet()

    write_message(CFG_SITE_NAME)
    write_message(" OAI Repository Status")

    set_spec_max_length = 19 # How many max char do we display for
    set_name_max_length = 20 # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        write_message("  Total(**)" + " " * 29 + repository_size_s.rjust(9))
        return
    elif verbose == 1:
        # We display few information: show longer set name and spec
        set_spec_max_length = 30
        set_name_max_length = 30

    write_message("=" * 80)
    header = "  setSpec" + " " * (set_spec_max_length - 7) + \
             "  setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    write_message(header)

    if verbose > 1:
        write_message(" " * 57 + "Additions  Deletions")

    write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e", ap=0, op1="a",
                                                    p2="oai:*",
                                                    f2=CFG_OAI_ID_FIELD,
                                                    m2="e")
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update = \
                repository_recids_after_update.union(should_recids)

            current_hitset = HitSet(current_recids)
            should_hitset = HitSet(should_recids)
            nb_add_recids = len(should_hitset.difference(current_hitset))
            nb_remove_recids = len(current_hitset.difference(should_hitset))
            nb_should_recids = len(should_recids)

        nb_current_recids = len(current_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length:
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length:
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        # The two extra columns leave room for the ".." of a truncated
        # value
        row = "  " + set_spec_str.ljust(set_spec_max_length + 2) + \
              set_name_str.ljust(set_name_max_length + 2) + \
              str(nb_current_recids).rjust(7)
        if verbose > 1:
            row += ('+' + str(nb_add_recids)).rjust(10) + \
                   ('-' + str(nb_remove_recids)).rjust(8) + " = " + \
                   str(nb_should_recids).rjust(7)
        write_message(row)

    write_message("=" * 80)
    footer = "  Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             repository_size_s.rjust(9)
    if verbose > 1:
        # Computed here rather than inside the loop, so that the value
        # is defined even when there is no set at all
        nb_recids_after_update = len(repository_recids_after_update)
        footer += str(nb_recids_after_update).rjust(28)
    write_message(footer)

    if verbose > 1:
        write_message('  *The "after update" columns show the repository after you run this tool.')
    else:
        write_message(' *"Volume" is indicative if repository is out of sync. Use --detailed-report.')
    write_message('**The "total" is not the sum of the above numbers, but the union of the records.')
Пример #5
0
def oairepositoryupdater_task():
    """
    Main business logic code of oai_archive.

    When the "report" task option is greater than 1 the repository
    status is printed and nothing else is done. Otherwise the task:

      1. searches the records matching the definitions of every OAI
         set, plus the records found by searching CFG_OAI_ID_FIELD for
         'oai:<CFG_OAI_ID_PREFIX>:*';
      2. writes to a temporary file in CFG_TMPDIR a <collection> that
         holds, for each record whose OAI ID is missing or whose OAI
         sets differ from the computed ones, a <record> with its OAI
         ID and set fields;
      3. unless the "no_upload" task option is set, runs
         'bibupload -c' on that file, or removes the file when no
         record had to be updated.

    Returns True in all cases.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    task_update_progress("Fetching records to process")

    # Build the list of records to be processed, that is, search for
    # the records that match one of the search queries defined in OAI
    # Repository admin interface.
    recids_for_set = {}  # Remember exactly which record belongs to which set
    recids = HitSet()  # "Flat" set of the recids_for_set values
    for set_spec in all_set_specs():
        task_sleep_now_if_required(can_stop_too=True)
        _recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = _recids
        recids = recids.union(_recids)

    # Also get the list of records that are currently exported through
    # OAI and that might need to be refreshed
    oai_recids = perform_request_search(c=CFG_SITE_NAME,
                                        p1='oai:%s:*' % CFG_OAI_ID_PREFIX,
                                        f1=CFG_OAI_ID_FIELD,
                                        m1="e",
                                        ap=0)
    recids = recids.union(HitSet(oai_recids))

    # Prepare to save results in a tmp file
    file_prefix = 'oairepository_' + \
                  time.strftime("%Y%m%d_%H%M%S_", time.localtime())
    (fd, filename) = mkstemp(dir=CFG_TMPDIR, prefix=file_prefix)
    oai_out = os.fdopen(fd, "w")
    has_updated_records = False

    # Loop invariants: the set of recids is not modified while iterating
    nb_recids = len(recids)
    same_datafield = CFG_OAI_ID_FIELD[0:5] == CFG_OAI_SET_FIELD[0:5]

    # The try/finally guarantees that the file is closed even when the
    # loop is interrupted by an exception.
    try:
        oai_out.write('<collection>')
        # Iterate over the recids
        for position, recid in enumerate(recids):
            task_sleep_now_if_required(can_stop_too=True)
            task_update_progress("Done %s out of %s records." % \
                                 (position + 1, nb_recids))

            # The OAI ID entry is always generated (and written for
            # every updated record); what matters for deciding whether
            # the record must be updated is whether it already has a
            # non-blank OAI identifier.
            oai_id_entry = "<subfield code=\"%s\">oai:%s:%s</subfield>\n" % \
                           (CFG_OAI_ID_FIELD[5:6], CFG_OAI_ID_PREFIX, recid)
            oai_ids = [_oai_id for _oai_id in \
                       get_fieldvalues(recid, CFG_OAI_ID_FIELD) \
                       if _oai_id.strip() != '']
            already_has_oai_id = bool(oai_ids)

            # Get the sets to which this record already belongs according
            # to the metadata
            current_oai_sets = set(\
                [_oai_set for _oai_set in \
                 get_fieldvalues(recid, CFG_OAI_SET_FIELD) \
                 if _oai_set.strip() != ''])

            # Get the sets that should be in this record according to
            # settings (empty set specs are left out)
            updated_oai_sets = set(\
                [_set for _set, _recids in recids_for_set.items()
                 if _set and recid in _recids])

            # Ok, we have the old sets and the new sets. If they are equal
            # and oai ID does not need to be added, then great, nothing to
            # change . Otherwise apply the new sets.
            if current_oai_sets == updated_oai_sets and already_has_oai_id:
                continue  # Jump to next recid

            has_updated_records = True

            # Generate the xml sets entry (updated_oai_sets holds no
            # empty value, so no further filtering is needed)
            oai_set_entry = '\n'.join(
                ["<subfield code=\"%s\">%s</subfield>" % \
                 (CFG_OAI_SET_FIELD[5:6], _oai_set) \
                 for _oai_set in updated_oai_sets]) + "\n"

            # Also get all the datafields with tag and indicator matching
            # CFG_OAI_SET_FIELD[:5] and CFG_OAI_ID_FIELD[:5] but with
            # subcode != CFG_OAI_SET_FIELD[5:6] and subcode !=
            # CFG_OAI_ID_FIELD[5:6], so that we can preserve these values
            other_data = marcxml_filter_out_tags(
                recid, [CFG_OAI_SET_FIELD, CFG_OAI_ID_FIELD])

            oai_out.write("<record>\n")
            oai_out.write("<controlfield tag=\"001\">%s"
                          "</controlfield>\n" % recid)
            oai_out.write(DATAFIELD_ID_HEAD)
            oai_out.write("\n")
            oai_out.write(oai_id_entry)
            if not same_datafield:
                # Set and OAI ID live in different datafields: close the
                # ID one and open the set one. Otherwise both are put
                # in the same datafield.
                oai_out.write("</datafield>\n")
                oai_out.write(DATAFIELD_SET_HEAD)
                oai_out.write("\n")
            oai_out.write(oai_set_entry)
            oai_out.write("</datafield>\n")
            oai_out.write(other_data)
            oai_out.write("</record>\n")

        oai_out.write('</collection>')
    finally:
        oai_out.close()

    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if has_updated_records:
            command = "%s/bibupload -c %s -u oairepository" % (CFG_BINDIR,
                                                               filename)
            exit_status = os.system(command)
            if exit_status != 0:
                # Do not fail silently when the upload could not be run
                write_message("bibupload exited with status %s for file %s" % \
                              (exit_status, filename))
        else:
            os.remove(filename)

    return True
Пример #6
0
def print_repository_status(write_message=write_message, verbose=0):
    """
    Prints the repository status to the standard output.

    Parameters:

      write_message - *function* the function used to write the output

            verbose - *int* the verbosity of the output
                       - 0: print repository size
                       - 1: print quick status of each set (numbers
                         can be wrong if the repository is in some
                         inconsistent state, i.e. a record is in an
                         OAI setSpec but has not OAI ID)
                       - 2: print detailed status of repository, with
                         number of records that needs to be
                         synchronized according to the sets
                         definitions. Precise, but ~slow...

    Nothing is returned: every line is passed to 'write_message'.
    """
    repository_size_s = "%d" % repository_size()
    repository_recids_after_update = HitSet()

    write_message(CFG_SITE_NAME)
    write_message(" OAI Repository Status")

    set_spec_max_length = 19  # How many max char do we display for
    set_name_max_length = 20  # setName and setSpec?

    if verbose == 0:
        # Just print repository size
        write_message("  Total(**)" + " " * 29 + repository_size_s.rjust(9))
        return
    elif verbose == 1:
        # We display few information: show longer set name and spec
        set_spec_max_length = 30
        set_name_max_length = 30

    write_message("=" * 80)
    header = "  setSpec" + " " * (set_spec_max_length - 7) + \
             "  setName" + " " * (set_name_max_length - 5) + " Volume"
    if verbose > 1:
        header += " " * 5 + "After update(*):"
    write_message(header)

    if verbose > 1:
        write_message(" " * 57 + "Additions  Deletions")

    write_message("-" * 80)

    for set_spec in all_set_specs():

        if verbose <= 1:
            # Get the records that are in this set. This is an
            # incomplete check, as it can happen that some records are
            # in this set (according to the metadata) but have no OAI
            # ID (so they are not exported). This can happen if the
            # repository has some records coming from external
            # sources, or if it has never been synchronized with this
            # tool.
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e",
                                                    ap=0)
        else:
            # Get the records that are *currently* exported for this
            # setSpec
            current_recids = perform_request_search(c=CFG_SITE_NAME,
                                                    p1=set_spec,
                                                    f1=CFG_OAI_SET_FIELD,
                                                    m1="e",
                                                    ap=0,
                                                    op1="a",
                                                    p2="oai:*",
                                                    f2=CFG_OAI_ID_FIELD,
                                                    m2="e")
            # Get the records that *should* be in this set according to
            # the admin defined settings, and compute how many should be
            # added or removed
            should_recids = get_recids_for_set_spec(set_spec)
            repository_recids_after_update = \
                repository_recids_after_update.union(should_recids)

            current_hitset = HitSet(current_recids)
            should_hitset = HitSet(should_recids)
            nb_add_recids = len(should_hitset.difference(current_hitset))
            nb_remove_recids = len(current_hitset.difference(should_hitset))
            nb_should_recids = len(should_recids)

        nb_current_recids = len(current_recids)

        # Adapt setName and setSpec strings lengths
        set_spec_str = set_spec
        if len(set_spec_str) > set_spec_max_length:
            set_spec_str = "%s.." % set_spec_str[:set_spec_max_length]
        set_name_str = get_set_name_for_set_spec(set_spec)
        if len(set_name_str) > set_name_max_length:
            set_name_str = "%s.." % set_name_str[:set_name_max_length]

        # The two extra columns leave room for the ".." of a truncated
        # value
        row = "  " + set_spec_str.ljust(set_spec_max_length + 2) + \
              set_name_str.ljust(set_name_max_length + 2) + \
              str(nb_current_recids).rjust(7)
        if verbose > 1:
            row += ('+' + str(nb_add_recids)).rjust(10) + \
                   ('-' + str(nb_remove_recids)).rjust(8) + " = " + \
                   str(nb_should_recids).rjust(7)
        write_message(row)

    write_message("=" * 80)
    footer = "  Total(**)" + " " * (set_spec_max_length + set_name_max_length - 7) + \
             repository_size_s.rjust(9)
    if verbose > 1:
        # Computed here rather than inside the loop, so that the value
        # is defined even when there is no set at all
        nb_recids_after_update = len(repository_recids_after_update)
        footer += str(nb_recids_after_update).rjust(28)
    write_message(footer)

    if verbose > 1:
        write_message(
            '  *The "after update" columns show the repository after you run this tool.'
        )
    else:
        write_message(
            ' *"Volume" is indicative if repository is out of sync. Use --detailed-report.'
        )
    write_message(
        '**The "total" is not the sum of the above numbers, but the union of the records.'
    )