Example #1
def _detect_collections_from_marcxml_file(recs):
    """
    Extract all possible recIDs from MARCXML file and guess collections
    for these recIDs.
    """
    from invenio.legacy.bibrecord import record_get_field_values
    from invenio.legacy.search_engine import guess_collection_of_a_record
    from invenio.legacy.bibupload.engine import find_record_from_sysno, \
                                  find_records_from_extoaiid, \
                                  find_record_from_oaiid

    dbcollids = {}
    sysno_tag = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    oai_tag = CFG_OAI_ID_FIELD
    for rec, dummy1, dummy2 in recs:
        if rec:
            for tag001 in record_get_field_values(rec, '001'):
                collection = guess_collection_of_a_record(int(tag001))
                dbcollids[collection] = 1
            for tag_sysno in record_get_field_values(rec,
                                                     tag=sysno_tag[:3],
                                                     ind1=sysno_tag[3],
                                                     ind2=sysno_tag[4],
                                                     code=sysno_tag[5]):
                record = find_record_from_sysno(tag_sysno)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            for tag_oaiid in record_get_field_values(rec,
                                                     tag=oaiid_tag[:3],
                                                     ind1=oaiid_tag[3],
                                                     ind2=oaiid_tag[4],
                                                     code=oaiid_tag[5]):
                try:
                    records = find_records_from_extoaiid(tag_oaiid)
                except Error:
                    records = []
                if records:
                    record = records.pop()
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            for tag_oai in record_get_field_values(rec,
                                                   tag=oai_tag[0:3],
                                                   ind1=oai_tag[3],
                                                   ind2=oai_tag[4],
                                                   code=oai_tag[5]):
                record = find_record_from_oaiid(tag_oai)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
    return dbcollids.keys()
Example #2
def _detect_collections_from_marcxml_file(recs):
    """
    Extract all possible recIDs from MARCXML file and guess collections
    for these recIDs.
    """
    from invenio.legacy.bibrecord import record_get_field_values
    from invenio.legacy.search_engine import guess_collection_of_a_record
    from invenio.legacy.bibupload.engine import find_record_from_sysno, \
                                  find_records_from_extoaiid, \
                                  find_record_from_oaiid

    dbcollids = {}
    sysno_tag = CFG_BIBUPLOAD_EXTERNAL_SYSNO_TAG
    oaiid_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    oai_tag = CFG_OAI_ID_FIELD
    for rec, dummy1, dummy2 in recs:
        if rec:
            for tag001 in record_get_field_values(rec, '001'):
                collection = guess_collection_of_a_record(int(tag001))
                dbcollids[collection] = 1
            for tag_sysno in record_get_field_values(rec, tag=sysno_tag[:3],
                                                     ind1=sysno_tag[3],
                                                     ind2=sysno_tag[4],
                                                     code=sysno_tag[5]):
                record = find_record_from_sysno(tag_sysno)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            for tag_oaiid in record_get_field_values(rec, tag=oaiid_tag[:3],
                                                     ind1=oaiid_tag[3],
                                                     ind2=oaiid_tag[4],
                                                     code=oaiid_tag[5]):
                try:
                    records = find_records_from_extoaiid(tag_oaiid)
                except Error:
                    records = []
                if records:
                    record = records.pop()
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
            for tag_oai in record_get_field_values(rec, tag=oai_tag[0:3],
                                                   ind1=oai_tag[3],
                                                   ind2=oai_tag[4],
                                                   code=oai_tag[5]):
                record = find_record_from_oaiid(tag_oai)
                if record:
                    collection = guess_collection_of_a_record(int(record))
                    dbcollids[collection] = 1
    return dbcollids.keys()
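
In both variants above, dbcollids is a dict whose values are never used; it only deduplicates collection names before the keys are returned. A minimal sketch of the same deduplication with a set (the helper name and the injected guess function are illustrative, not part of the Invenio API):

def _collect_unique_collections(recids, guess_collection_of_a_record):
    """Deduplicate guessed collection names with a set instead of a dummy-valued dict."""
    collections = set()
    for recid in recids:
        collections.add(guess_collection_of_a_record(int(recid)))
    return list(collections)

# Hypothetical usage, with the guess function taken from invenio.legacy.search_engine:
#   _collect_unique_collections(['1', '2', '3'], guess_collection_of_a_record)

The dict-based original relies on Python 2's dict.keys() returning a plain list; the set version only makes the deduplication intent explicit.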
Example #3
    def test_compare_field_values_with_bibrecord_values(self):
        """bibfield - same value as in bibrecord"""
        from invenio.legacy.bibrecord import record_get_field_values
        from invenio.legacy.search_engine import get_record as search_engine_get_record
        record = get_record(1)
        bibrecord_value = record_get_field_values(search_engine_get_record(1), '245', ' ', ' ', 'a')[0]
        self.assertEqual(bibrecord_value, record['title.title'])
Example #4
def _get_minimal_arxiv_id(record, tag_code):
    """
    Returns the OAI arXiv id in the given record, skipping the prefixes.
    E.g. oai:arXiv.org:1234.1234 becomes 1234.1234 and oai:arXiv.org:hep-ex/2134123
    becomes hep-ex/2134123. Used for searching.
    """
    values = record_get_field_values(record, **split_tag_code(tag_code))
    for value in values:
        if 'arXiv' in value:
            return value.split(':')[-1]
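
split_tag_code is used here but not shown in this section. A plausible sketch, assuming it simply maps a six-character MARC tag string (tag + ind1 + ind2 + subfield code) onto the keyword arguments of record_get_field_values, mirroring the tag[:3]/tag[3]/tag[4]/tag[5] slicing used by the other examples; the real Invenio helper may differ:

def split_tag_code(tag_code):
    """Sketch: split e.g. '035__a' into record_get_field_values keyword arguments."""
    return {
        'tag': tag_code[:3],
        'ind1': tag_code[3],
        'ind2': tag_code[4],
        'code': tag_code[5],
    }

# The prefix stripping itself is just a split on ':' keeping the last part:
#   'oai:arXiv.org:hep-ex/2134123'.split(':')[-1]  ->  'hep-ex/2134123'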
Example #5
def retrieve_field_values(curdir,
                          field_name,
                          separator=None,
                          system_number_file='SN',
                          tag=None):
    """
    This is a handy function to retrieve values either from the current
    submission directory, when a form has just been submitted, or from
    an existing record (e.g. during MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exist on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should
        contain values. If not specified, only values in curdir will
        be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.

    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        field_value = open(field_file).read()
        if separator is not None:
            return [
                value.strip() for value in field_value.split(separator)
                if value.strip()
            ]
        else:
            return [field_value.strip()]
    elif tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            recid = int(open(system_number_file).read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3], tag[4],
                                               tag[5])
            else:
                return [
                    record_get_field_value(record, tag[:3], tag[3], tag[4],
                                           tag[5])
                ]
    return []
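
A hedged usage sketch: the submission directory, field name and MARC tag below are made up for illustration, not taken from a real submission workflow.

# Hypothetical MBI-style call: if curdir contains a file named 'DEMOTITLE',
# its content wins; otherwise the value is read from the record whose id is
# stored in the 'SN' file, using the full six-character tag '245__a'.
titles = retrieve_field_values(curdir='/tmp/demo_submission',
                               field_name='DEMOTITLE',
                               system_number_file='SN',
                               tag='245__a')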
Example #6
def get_record_collections(recid=0, recstruct=None):
    """ Returns all collections of a record, field 980
    @param recid: record id to get collections from
    @type: string

    @return: list of collections
    @rtype: list
    """
    if not recstruct:
        recstruct = get_record(recid)
    return [collection for collection in record_get_field_values(recstruct, tag="980", ind1=" ", ind2=" ", code="a")]
Example #7
def _get_minimal_arxiv_id(record, tag_code):
    """
    Returns the OAI arXiv id in the given record, skipping the prefixes.
    E.g. oai:arXiv.org:1234.1234 becomes 1234.1234 and oai:arXiv.org:hep-ex/2134123
    becomes hep-ex/2134123. Used for searching.
    """
    values = record_get_field_values(record,
                                     **split_tag_code(tag_code))
    for value in values:
        if 'arXiv' in value:
            return value.split(':')[-1]
Example #8
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
       In the case of a pure text MARC record being created, the
       sysno will be retrieved from 001 (i.e. the 'recid' will be returned).
       In the case of an Aleph MARC record being created, the sysno
       will be retrieved from 970__a IF this field exists.  If not,
       None will be returned.
       @param record: the internal representation of the record
        (created by bibrecord) from which the sysno is to be retrieved.
       @param options: various options about the record to be created,
        as obtained from the command line.
       @return: a string containing a 9-digit SYSNO, -OR- a list of SYSNOs
        when multiple 970__a values are found (Aleph MARC), -OR- None in
        certain cases for an Aleph MARC record.
    """
    if options["text-marc"] != 0:
        vals001 = record_get_field_values(rec=record, tag="001")
        if len(vals001) > 1:
            ## multiple values for recid is illegal!
            sysno = None
        elif len(vals001) < 1:
            ## no value for recid is illegal!
            sysno = None
        else:
            ## get recid
            sysno = vals001[0]
            if len(sysno) < 9:
                sysno = "0"*(9-len(sysno)) + sysno
    else:
        vals970a = record_get_field_values(rec=record, tag="970", code="a")
        if len(vals970a) > 1:
            ## multiple SYS is illegal - return a list of them all,
            ## let other functions decide what to do
            return vals970a
        if len(vals970a) < 1:
            ## no SYS
            sysno = None
        else:
            ## get SYS
            sysno = vals970a[0][0:9]
    return sysno
Example #9
def get_sysno_from_record(record, options):
    """Function to get the system number for a record.
       In the case of a pure text MARC record being created, the
       sysno will be retrieved from 001 (i.e. the 'recid' will be returned).
       In the case of an Aleph MARC record being created, the sysno
       will be retrieved from 970__a IF this field exists.  If not,
       None will be returned.
       @param record: the internal representation of the record
        (created by bibrecord) from which the sysno is to be retrieved.
       @param options: various options about the record to be created,
        as obtained from the command line.
       @return: a string containing a 9-digit SYSNO, -OR- a list of SYSNOs
        when multiple 970__a values are found (Aleph MARC), -OR- None in
        certain cases for an Aleph MARC record.
    """
    if options["text-marc"] != 0:
        vals001 = record_get_field_values(rec=record, tag="001")
        if len(vals001) > 1:
            ## multiple values for recid is illegal!
            sysno = None
        elif len(vals001) < 1:
            ## no value for recid is illegal!
            sysno = None
        else:
            ## get recid
            sysno = vals001[0]
            if len(sysno) < 9:
                sysno = "0" * (9 - len(sysno)) + sysno
    else:
        vals970a = record_get_field_values(rec=record, tag="970", code="a")
        if len(vals970a) > 1:
            ## multiple SYS is illegal - return a list of them all,
            ## let other functions decide what to do
            return vals970a
        if len(vals970a) < 1:
            ## no SYS
            sysno = None
        else:
            ## get SYS
            sysno = vals970a[0][0:9]
    return sysno
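
The left-padding in the text-marc branch pads the recid to nine digits and leaves longer values untouched, which is exactly what str.zfill(9) does; a quick check:

sysno = "12345"
assert "0" * (9 - len(sysno)) + sysno == sysno.zfill(9) == "000012345"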
Example #10
def get_record_collections(recid=0, recstruct=None):
    """ Returns all collections of a record, field 980
    @param recid: record id to get collections from
    @type: string

    @return: list of collections
    @rtype: list
    """
    if not recstruct:
        recstruct = get_record(recid)
    return [
        collection for collection in record_get_field_values(
            recstruct, tag="980", ind1=" ", ind2=" ", code="a")
    ]
Example #11
def record_is_conference(record):
    """
    Determine if the record is a new conference based on the value present
    on field 980

    @param record: record to be checked
    @type record: bibrecord object

    @return: True if record is a conference, False otherwise
    @rtype: boolean
    """
    # Get collection field content (tag 980)
    tag_980_content = record_get_field_values(record, "980", " ", " ", "a")
    if "CONFERENCES" in tag_980_content:
        return True
    return False
Example #12
def record_is_conference(record):
    """
    Determine if the record is a new conference based on the value present
    on field 980

    @param record: record to be checked
    @type record: bibrecord object

    @return: True if record is a conference, False otherwise
    @rtype: boolean
    """
    # Get collection field content (tag 980)
    tag_980_content = record_get_field_values(record, "980", " ", " ", "a")
    if "CONFERENCES" in tag_980_content:
        return True
    return False
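
A small usage sketch, assuming create_record from invenio.legacy.bibrecord is available and returns a (record, status, errors) tuple, as the 3-tuples iterated over in the earlier examples suggest:

from invenio.legacy.bibrecord import create_record

marcxml = """<record>
  <datafield tag="980" ind1=" " ind2=" ">
    <subfield code="a">CONFERENCES</subfield>
  </datafield>
</record>"""
record, status, errors = create_record(marcxml)  # assumed 3-tuple return value
assert record_is_conference(record)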
Example #13
def user_can_edit_record_collection(req, recid):
    """ Check if user has authorization to modify a collection
    the recid belongs to
    """

    def remove_volatile(field_value):
        """ Remove volatile keyword from field value """
        if field_value.startswith(VOLATILE_PREFIX):
            field_value = field_value[len(VOLATILE_PREFIX) :]
        return field_value

    # Get the collections the record belongs to
    record_collections = get_all_collections_of_a_record(recid)

    user_info = collect_user_info(req)
    uid = user_info["uid"]
    # In case we are creating a new record
    if cache_exists(recid, uid):
        record = get_cache_contents(recid, uid)[2]
        values = record_get_field_values(record, "980", code="a")
        record_collections.extend([remove_volatile(v) for v in values])

    normalized_collections = []
    for collection in record_collections:
        # Get the normalized collection name present in the action table
        res = run_sql(
            """SELECT value FROM "accARGUMENT"
                         WHERE keyword='collection'
                         AND value=%s;""",
            (collection,),
        )
        if res:
            normalized_collections.append(res[0][0])
    if not normalized_collections:
        # Check if user has access to all collections
        auth_code, dummy_message = acc_authorize_action(req, "runbibedit", collection="")
        if auth_code == 0:
            return True
    else:
        for collection in normalized_collections:
            auth_code, dummy_message = acc_authorize_action(req, "runbibedit", collection=collection)
            if auth_code == 0:
                return True
    return False
Example #14
def user_can_edit_record_collection(req, recid):
    """ Check if user has authorization to modify a collection
    the recid belongs to
    """
    def remove_volatile(field_value):
        """ Remove volatile keyword from field value """
        if field_value.startswith(VOLATILE_PREFIX):
            field_value = field_value[len(VOLATILE_PREFIX):]
        return field_value

    # Get the collections the record belongs to
    record_collections = get_all_collections_of_a_record(recid)

    user_info = collect_user_info(req)
    uid = user_info["uid"]
    # In case we are creating a new record
    if cache_exists(recid, uid):
        record = get_cache_contents(recid, uid)[2]
        values = record_get_field_values(record, '980', code="a")
        record_collections.extend([remove_volatile(v) for v in values])

    normalized_collections = []
    for collection in record_collections:
        # Get the normalized collection name present in the action table
        res = run_sql(
            """SELECT value FROM "accARGUMENT"
                         WHERE keyword='collection'
                         AND value=%s;""", (collection, ))
        if res:
            normalized_collections.append(res[0][0])
    if not normalized_collections:
        # Check if user has access to all collections
        auth_code, dummy_message = acc_authorize_action(req,
                                                        'runbibedit',
                                                        collection='')
        if auth_code == 0:
            return True
    else:
        for collection in normalized_collections:
            auth_code, dummy_message = acc_authorize_action(
                req, 'runbibedit', collection=collection)
            if auth_code == 0:
                return True
    return False
Example #15
def retrieve_field_values(curdir, field_name, separator=None, system_number_file='SN', tag=None):
    """
    This is a handy function to retrieve values either from the current
    submission directory, when a form has just been submitted, or from
    an existing record (e.g. during MBI action).

    @param curdir: is the current submission directory.
    @type curdir: string
    @param field_name: is the form field name that might exist on disk.
    @type field_name: string
    @param separator: is an optional separator. If it exists, it will be used
        to retrieve multiple values contained in the field.
    @type separator: string
    @param system_number_file: is the name of the file on disk in curdir, that
        is supposed to contain the record id.
    @type system_number_file: string
    @param tag: is the full MARC tag (tag+ind1+ind2+code) that should
        contain values. If not specified, only values in curdir will
        be retrieved.
    @type tag: 6-chars
    @return: the field value(s).
    @rtype: list of strings.

    @note: if field_name exists in curdir it will take precedence over
        retrieving the values from the record.
    """
    field_file = os.path.join(curdir, field_name)
    if os.path.exists(field_file):
        field_value = open(field_file).read()
        if separator is not None:
            return [value.strip() for value in field_value.split(separator) if value.strip()]
        else:
            return [field_value.strip()]
    elif tag is not None:
        system_number_file = os.path.join(curdir, system_number_file)
        if os.path.exists(system_number_file):
            recid = int(open(system_number_file).read().strip())
            record = get_record(recid)
            if separator:
                return record_get_field_values(record, tag[:3], tag[3], tag[4], tag[5])
            else:
                return [record_get_field_value(record, tag[:3], tag[3], tag[4], tag[5])]
    return []
Example #16
def _detect_980_values_from_marcxml_file(recs):
    """
    Read MARCXML file and return list of 980 $a values found in that file.
    Useful for checking rights.
    """
    from invenio.legacy.bibrecord import record_get_field_values

    collection_tag = run_sql("SELECT value FROM tag, field_tag, field \
                              WHERE tag.id=field_tag.id_tag AND \
                              field_tag.id_field=field.id AND \
                              field.code='collection'")
    collection_tag = collection_tag[0][0]
    dbcollids = {}
    for rec, dummy1, dummy2 in recs:
        if rec:
            for tag980 in record_get_field_values(rec,
                                                  tag=collection_tag[:3],
                                                  ind1=collection_tag[3],
                                                  ind2=collection_tag[4],
                                                  code=collection_tag[5]):
                dbcollids[tag980] = 1
    return dbcollids.keys()
Example #17
def _detect_980_values_from_marcxml_file(recs):
    """
    Read MARCXML file and return list of 980 $a values found in that file.
    Useful for checking rights.
    """
    from invenio.legacy.bibrecord import record_get_field_values

    collection_tag = run_sql("SELECT value FROM tag, field_tag, field \
                              WHERE tag.id=field_tag.id_tag AND \
                              field_tag.id_field=field.id AND \
                              field.code='collection'")
    collection_tag = collection_tag[0][0]
    dbcollids = {}
    for rec, dummy1, dummy2 in recs:
        if rec:
            for tag980 in record_get_field_values(rec,
                                                  tag=collection_tag[:3],
                                                  ind1=collection_tag[3],
                                                  ind2=collection_tag[4],
                                                  code=collection_tag[5]):
                dbcollids[tag980] = 1
    return dbcollids.keys()
Example #18
    def fields(self, tag, escape=0, repeatable_subfields_p=False):
        """
        Returns the list of values corresponding to "tag".

        If tag has an undefined subcode (such as 999C5),
        the function returns a list of dictionaries, whose keys
        are the subcodes and the values are the values of tag.subcode.
        If the tag has a subcode, simply returns list of values
        corresponding to tag.
        Eg. for given MARC::
            999C5 $a value_1a $b value_1b
            999C5 $b value_2b
            999C5 $b value_3b $b value_3b_bis

            >>> bfo.fields('999C5b')
            >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
            >>> bfo.fields('999C5')
            >>> [{'a':'value_1a', 'b':'value_1b'},
                {'b':'value_2b'},
                {'b':'value_3b'}]

        By default the function returns only one value for each
        subfield (that is, it considers that repeatable subfields are
        not allowed). This is why in the above example 'value_3b_bis' is
        not shown for bfo.fields('999C5').  (Note that it is not
        defined which of value_3b or value_3b_bis is returned).  This
        is to simplify the use of the function, as most of the time
        subfields are not repeatable (in that way we get a string
        instead of a list).  You can allow repeatable subfields by
        setting 'repeatable_subfields_p' parameter to True. In
        this mode, the above example would return:
            >>> bfo.fields('999C5b', repeatable_subfields_p=True)
            >>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
            >>> bfo.fields('999C5', repeatable_subfields_p=True)
            >>> [{'a':['value_1a'], 'b':['value_1b']},
                {'b':['value_2b']},
                {'b':['value_3b', 'value_3b_bis']}]

        NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT.  Also note
        that whatever the value of 'repeatable_subfields_p' is,
        bfo.fields('999C5b') always shows all fields, even repeatable
        ones. This is because the parameter has no impact on the
        returned structure (it is always a list).

        The 'escape' parameter allows escaping special characters
        of the fields. The value of escape can be:
                      0. No escaping
                      1. Escape all HTML characters
                      2. Remove unsafe HTML tags (Eg. keep <br />)
                      3. Mix of mode 1 and 2. If value of field starts with
                      <!-- HTML -->, then use mode 2. Else use mode 1.
                      4. Remove all HTML tags
                      5. Same as 2, with more tags allowed (like <img>)
                      6. Same as 3, with more tags allowed (like <img>)
                      7. Mix of mode 0 and mode 1. If field_value
                      starts with <!--HTML-->, then use mode 0.
                      Else use mode 1.
                      8. Same as mode 1, but also escape double-quotes
                      9. Same as mode 4, but also escape double-quotes

        :param tag: the marc code of a field
        :param escape: escaping mode, as described above (0 means no escaping)
        :param repeatable_subfields_p: if True, returns the list of subfields in the dictionary
        :return: values of field tag in record
        """

        if self.get_record() is None:
            # Case where BibRecord could not parse object
            return []

        p_tag = parse_tag(tag)
        if p_tag[3] != "":
            # Subcode has been defined. Simply returns list of values
            values = record_get_field_values(self.get_record(),
                                             p_tag[0],
                                             p_tag[1],
                                             p_tag[2],
                                             p_tag[3])
            if escape == 0:
                return values
            else:
                return [escape_field(value, escape) for value in values]

        else:
            # Subcode is undefined. Returns list of dicts.
            # However it might be the case of a control field.

            instances = record_get_field_instances(self.get_record(),
                                                   p_tag[0],
                                                   p_tag[1],
                                                   p_tag[2])
            if repeatable_subfields_p:
                list_of_instances = []
                for instance in instances:
                    instance_dict = {}
                    for subfield in instance[0]:
                        if subfield[0] not in instance_dict:
                            instance_dict[subfield[0]] = []
                        if escape == 0:
                            instance_dict[subfield[0]].append(subfield[1])
                        else:
                            instance_dict[subfield[0]].append(escape_field(subfield[1], escape))
                    list_of_instances.append(instance_dict)
                return list_of_instances
            else:
                if escape == 0:
                    return [dict(instance[0]) for instance in instances]
                else:
                    return [dict([(subfield[0], escape_field(subfield[1], escape))
                                   for subfield in instance[0]])
                            for instance in instances]
Example #19
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket() containing
                   the subject, body and queue to create a ticket in.
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    title_code = load_tag_code_from_name("title")
    abstract_code = load_tag_code_from_name("abstract")

    try:
        date_code = load_tag_code_from_name("date")
    except BibCatalogTagNotFound:
        date_code = load_tag_code_from_name("year")

    category_code = load_tag_code_from_name("subject")

    try:
        notes_code = load_tag_code_from_name("note")
    except BibCatalogTagNotFound:
        notes_code = load_tag_code_from_name("comment")

    first_author_code = load_tag_code_from_name("first author name")
    additional_author_code = load_tag_code_from_name("additional author name")

    try:
        external_id_code = load_tag_code_from_name("ext system ID")
    except BibCatalogTagNotFound:
        external_id_code = load_tag_code_from_name("primary report number")

    # List of extra info to print in the ticket.
    extra_info = []
    recid = record_id_from_record(record)

    arxiv_id = _get_minimal_arxiv_id(record, external_id_code)
    if arxiv_id:
        # We have an arxiv id - we can add special info:
        extra_info.append("ABSTRACT: http://arxiv.org/abs/%s" % (arxiv_id, ))
        extra_info.append("PDF: http://arxiv.org/pdf/%s" % (arxiv_id, ))

        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="arXiv",
            **split_tag_code(category_code))
        comments = record_get_value_with_provenence(
            record=record,
            provenence_code="9",
            provenence_value="arXiv",
            **split_tag_code(notes_code))
        external_ids = arxiv_id
        subject = "ARXIV:" + arxiv_id
    else:
        # Not an arxiv record - Lets get generic info
        categories = record_get_value_with_provenence(
            record=record,
            provenence_code="2",
            provenence_value="SzGeCERN",
            **split_tag_code(category_code))
        comments = record_get_field_values(rec=record,
                                           **split_tag_code(notes_code))
        external_id_list = record_get_field_values(
            rec=record, **split_tag_code(external_id_code))
        external_ids = ", ".join(external_id_list)
        subject = "Record #%s %s" % (recid, external_ids)

    authors = record_get_field_values(record, **split_tag_code(first_author_code)) + \
              record_get_field_values(record, **split_tag_code(additional_author_code))

    text = """
%(submitdate)s

External IDs: %(external_ids)s

Title: %(title)s

Authors: %(authors)s

Categories: %(categories)s

Comments: %(comments)s

%(abstract)s

%(extra_info)s

Edit the record now: %(editurl)s

""" \
    % {
        'external_ids': external_ids,
        'submitdate': record_get_field_value(record, **split_tag_code(date_code)),
        'extra_info': "\n".join(extra_info),
        'title': record_get_field_value(record, **split_tag_code(title_code)),
        'comments': "; ".join(comments),
        'categories': " ".join(categories),
        'authors': " / ".join(authors[:10]),
        'abstract': record_get_field_value(record, **split_tag_code(abstract_code)),
        'editurl': "%s/record/edit/%s" % (CFG_SITE_URL, recid),
    }
    # To avoid errors with string formatting later, we are escaping %'s
    ticket.subject = subject
    ticket.body = text.replace('%', '%%')
    ticket.queue = "Test"
    return ticket
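
The closing replace('%', '%%') in generate_ticket guards the ticket body against a later round of %-formatting; a minimal illustration of why literal percent signs need doubling (the sample string below is made up):

body = "Progress: 50% done"
escaped = body.replace('%', '%%')
print(escaped % ())   # -> 'Progress: 50% done'
# Without the escaping, body % () would raise
# "ValueError: unsupported format character ' '" at the bare '%'.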
Example #20
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot), verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD, type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid), verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD, type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" % (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec, f=CFG_OAI_SET_FIELD, type='e')
        write_message("%s recids should be in %s. Currently %s are in %s" % (len(should_recids), set_spec, len(current_recids), set_spec), verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" % (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" % (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" % (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" % len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)), verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid, verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" % (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_SET_FIELD[:3], ind1=CFG_OAI_SET_FIELD[3], ind2=CFG_OAI_SET_FIELD[4], code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" % (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(record_get_field_values(record, tag=CFG_OAI_PREVIOUS_SET_FIELD[:3], ind1=CFG_OAI_PREVIOUS_SET_FIELD[3], ind2=CFG_OAI_PREVIOUS_SET_FIELD[4], code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message("Record %s currently doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set for _set, _recids in iteritems(recids_for_set)
             if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" % (recid, ", ".join(updated_oai_sets)), verbose=3)

        updated_previous_oai_sets = set(_set for _set in (current_previous_oai_sets - updated_oai_sets) |
             (current_oai_sets - updated_oai_sets))
        write_message("Record %s now doesn't belong anymore to these oai_sets: %s" % (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3], ind1=CFG_OAI_ID_FIELD[3], ind2=CFG_OAI_ID_FIELD[4], subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
                else:
                    task_low_level_submission('bibupload', 'oairepository', '-c', filename)
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c', filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c', filename)
    else:
        os.remove(filename)

    return True
Example #21
def create_marc_record(record, sysno, options):
    """Create a text-marc, or aleph-marc record from the contents
       of "record", and return it as a string.
       @param record: Internal representation of an XML MARC
        record, created by bibrecord.
       @param sysno: the system number to be used for the record
       @param options: the options about the MARC record to be created,
        as passed from command line
       @return: string (MARC record, either text-marc or ALEPH marc format,
        depending upon "options".
    """
    out = ""  ## String containing record to be printed
    display_001 = 0  ## Flag used in ALEPH MARC mode to determine whether
    ## or not to print the "001" field

    ## Get a dictionary containing the names of fields to change for
    ## the output record:
    if options["aleph-marc"] == 1:
        fieldname_changes = get_fieldname_changes()
    else:
        fieldname_changes = {}

    if options["aleph-marc"] == 1:
        ## Perform some ALEPH-MARC specific tasks:
        ## Assume that we will NOT display "001":
        display_001 = 0

        ## Add ALEPH record headers to the output record:
        if 1 not in (options["correct-mode"], options["append-mode"]):
            ## This is not an ALEPH "correct" or "append" record. The
            ## record must therefore have FMT and LDR fields. E.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            out += """%(sys)s%(fmt)s
%(sys)s%(ldr)s\n""" % {
                'sys': sysno,
                'fmt': get_aleph_FMT(),
                'ldr': get_aleph_LDR()
            }

        if options["delete-mode"] == 1:
            ## This is an ALEPH 'delete' record. Add the DEL field
            ## then return the 'completed' record (in delete mode,
            ## the record only needs the leaders, and a 'DEL' field, e.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            ## 123456789 DEL   L $$aY
            out += """%(sys)s%(del)s\n""" % {
                'sys': sysno,
                'del': get_aleph_DEL()
            }
            return out
        elif 1 in (options["insert-mode"], options["replace-mode"]):
            ## Either an ALEPH 'insert' or 'replace' record is being created.
            ## It needs to have 008 and OWN fields. E.g.:
            ## 123456789 008   L ^^^^^^s^^^^^^^^^^^^^^^^r^^^^^000^0^eng^d
            ## 123456789 OWN   L $$aPUBLIC
            out += """%(sys)s%(008)s\n""" % {
                'sys': sysno,
                '008': get_aleph_008()
            }
            ## The "OWN" field should only be printed at this level if the
            ## MARC XML did not have an OWN (963__a) field:
            if "PUBLIC" not in \
               record_get_field_values(record, "963", code="a"):
                ## Add OWN field:
                out += """%(sys)s%(own)s\n""" % {
                    'sys': sysno,
                    'own': get_aleph_OWN()
                }

            if options["replace-mode"] == 1:
                ## In 'replace' mode, the record should have a 001 field:
                display_001 = 1

        ## Remove fields unwanted in ALEPH MARC:
        for deltag in get_fields_dropped_in_aleph():
            try:
                del record[deltag]
            except KeyError:
                ## tag doesn't exist in record:
                pass

    ## now add 001, since it is a special field:
    if options["text-marc"] == 1:
        try:
            ## get the 001 line(s):
            lines_001 = create_field_lines(fieldname="001", \
                                           field=record["001"][0], \
                                           sysno=sysno, \
                                           alephmarc=options["aleph-marc"])
            ## print the 001 line(s):
            out += print_field(field_lines=lines_001, \
                               alephmarc=options["aleph-marc"])
        except KeyError:
            ## no 001 field
            pass
    elif options["aleph-marc"] == 1:
        ## If desirable, build the "001" line:
        if display_001 == 1:
            try:
                ## make the 001 line(s):
                line_leader = """%(sys)s """ % {'sys': sysno}
                line_leader += """%(fieldname)s   L """ % {'fieldname': "001"}
                lines_001 = [[["", line_leader], ["", sysno]]]
                ## print the 001 line(s):
                out += print_field(field_lines=lines_001, \
                                   alephmarc=options["aleph-marc"])
            except KeyError:
                ## no 001 field
                pass

        ## Now, if running in "insert" or "replace" mode, add "003":
        ## 003 is a mandatory field in an ALEPH record. It contains the
        ## identifier for the organization that has generated the SYS (001)
        ## for the record. As such, it is necessary to drop any existing 003
        ## from the record, then add our own 003.

        ## First, drop the "003" field from the record:
        try:
            del record["003"]
        except KeyError:
            ## There was no 003
            pass

        ## Now add a correct 003 (if desirable):
        if 1 in (options["insert-mode"], options["replace-mode"]):
            out += """%(sys)s%(own)s\n""" % {
                'sys': sysno,
                'own': get_aleph_003()
            }

    ## delete 001 from the list of fields to output (if it exists):
    try:
        del record["001"]
    except KeyError:
        ## There was no 001
        pass

    ## Get the fields of this record, and order them correctly (using the same
    ## order as that of the original MARC XML file):
    fields = []
    tags = record.keys()
    tags.sort()
    for tag in tags:
        for field in record[tag]:
            fields.append((tag, field))

    ## Finally, loop through all fields and display them in the record:
    for field in fields:
        ## Should the field-name be changed?
        try:
            fieldname = fieldname_changes[str(field[0])]
        except KeyError:
            ## Don't change this fieldname:
            fieldname = field[0]
        ## get the subfields, etc, for this field:
        fielddata = field[1]

        ## Create the MARC lines for this field:
        field_lines = create_field_lines(fieldname, \
                                         fielddata, \
                                         sysno, \
                                         options["aleph-marc"])

        ## Now create the formatted MARC lines:
        out += print_field(field_lines, options["aleph-marc"])
    ## Return the formatted MARC record:
    return out
Example #22
def oairepositoryupdater_task():
    """Main business logic code of oai_archive"""
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        print_repository_status(verbose=report)
        return True

    if run_sql(
            "SELECT id FROM schTASK WHERE proc='bibupload:oairepository' AND status='WAITING'"
    ):
        write_message(
            "Previous requests of oairepository still being elaborated. Let's skip this execution."
        )
        return True

    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    recids_with_oaiid = search_unit_in_bibxxx(p='*',
                                              f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    all_current_recids = search_unit_in_bibxxx(p='*',
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)),
                  verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD,
                                               type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec),
                      verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec),
                      verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec),
                      verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                  prefix='oairepository_' + \
                                  time.strftime("%Y%m%d_%H%M%S_",
                                                time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")

    tot = 0
    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid),
                          verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)),
                      verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)),
            verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in iteritems(recids_for_set)
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)),
                      verbose=3)

        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets)
            | (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)),
            verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!" %
                          recid,
                          verbose=3)
            continue  # Jump to next recid

        write_message("Something has changed for record %s, let's update it!" %
                      recid,
                      verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        record_add_field(new_record, tag="001", controlfield_value=str(recid))
        record_add_field(new_record,
                         tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                if task_get_option("notimechange"):
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename, '-n',
                                              '-Noairepository', '-P', '-1')
                else:
                    task_low_level_submission('bibupload', 'oairepository',
                                              '-c', filename,
                                              '-Noairepository', '-P', '-1')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPSHAREDDIR,
                                        prefix='oairepository_' + \
                                        time.strftime("%Y%m%d_%H%M%S_",
                                                        time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if tot > 0:
        if not no_upload:
            task_sleep_now_if_required(can_stop_too=True)
            if task_get_option("notimechange"):
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            else:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename)
    else:
        os.remove(filename)

    return True
Example #23
def create_marc_record(record, sysno, options):
    """Create a text-marc, or aleph-marc record from the contents
       of "record", and return it as a string.
       @param record: Internal representation of an XML MARC
        record, created by bibrecord.
       @param sysno: the system number to be used for the record
       @param options: the options about the MARC record to be created,
        as passed from command line
       @return: string (MARC record, either text-marc or ALEPH marc format,
        depending upon "options".
    """
    out = ""  ## String containing record to be printed
    display_001 = 0  ## Flag used in ALEPH MARC mode to determine whether
                     ## or not to print the "001" field

    ## Get a dictionary containing the names of fields to change for
    ## the output record:
    if options["aleph-marc"] == 1:
        fieldname_changes = get_fieldname_changes()
    else:
        fieldname_changes = {}

    if options["aleph-marc"] == 1:
        ## Perform some ALEPH-MARC specific tasks:
        ## Assume that we will NOT display "001":
        display_001 = 0

        ## Add ALEPH record headers to the output record:
        if 1 not in (options["correct-mode"], options["append-mode"]):
            ## This is not an ALEPH "correct" or "append" record. The
            ## record must therefore have FMT and LDR fields. E.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            out += """%(sys)s%(fmt)s
%(sys)s%(ldr)s\n""" % { 'sys' : sysno,
                       'fmt' : get_aleph_FMT(),
                       'ldr' : get_aleph_LDR()
                     }

        if options["delete-mode"] == 1:
            ## This is an ALEPH 'delete' record. Add the DEL field
            ## then return the 'completed' record (in delete mode,
            ## the record only needs the leaders, and a 'DEL' field, e.g.:
            ## 123456789 FMT   L BK
            ## 123456789 LDR   L ^^^^^nam^^22^^^^^^a^4500
            ## 123456789 DEL   L $$aY
            out += """%(sys)s%(del)s\n""" % { 'sys' : sysno,
                                              'del' : get_aleph_DEL()
                                            }
            return out
        elif 1 in (options["insert-mode"], options["replace-mode"]):
            ## Either an ALEPH 'insert' or 'replace' record is being created.
            ## It needs to have 008 and OWN fields. E.g.:
            ## 123456789 008   L ^^^^^^s^^^^^^^^^^^^^^^^r^^^^^000^0^eng^d
            ## 123456789 OWN   L $$aPUBLIC
            out += """%(sys)s%(008)s\n""" % { 'sys' : sysno,
                                              '008' : get_aleph_008()
                                            }
            ## The "OWN" field should only be printed at this level if the
            ## MARC XML did not have an OWN (963__a) field:
            if "PUBLIC" not in \
               record_get_field_values(record, "963", code="a"):
                ## Add OWN field:
                out += """%(sys)s%(own)s\n""" % { 'sys' : sysno,
                                                  'own' : get_aleph_OWN() }

            if options["replace-mode"] == 1:
                ## In 'replace' mode, the record should have a 001 field:
                display_001 = 1

        ## Remove fields unwanted in ALEPH MARC:
        for deltag in get_fields_dropped_in_aleph():
            try:
                del record[deltag]
            except KeyError:
                ## tag doesn't exist in record:
                pass

    ## now add 001, since it is a special field:
    if options["text-marc"] == 1:
        try:
            ## get the 001 line(s):
            lines_001 = create_field_lines(fieldname="001", \
                                           field=record["001"][0], \
                                           sysno=sysno, \
                                           alephmarc=options["aleph-marc"])
            ## print the 001 line(s):
            out += print_field(field_lines=lines_001, \
                               alephmarc=options["aleph-marc"])
        except KeyError:
            ## no 001 field
            pass
    elif options["aleph-marc"] == 1:
        ## If desirable, build the "001" line:
        if display_001 == 1:
            try:
                ## make the 001 line(s):
                line_leader = """%(sys)s """ % { 'sys' : sysno }
                line_leader += """%(fieldname)s   L """ % { 'fieldname' : "001" }
                lines_001 = [[["", line_leader], ["", sysno]]]
                ## print the 001 line(s):
                out += print_field(field_lines=lines_001, \
                                   alephmarc=options["aleph-marc"])
            except KeyError:
                ## no 001 field
                pass

        ## Now, if running in "insert" or "replace" mode, add "003":
        ## 003 is a mandatory field in an ALEPH record. It contains the
        ## identifier for the organization that has generated the SYS (001)
        ## for the record. As such, it is necessary to drop any existing 003
        ## from the record, then add our own 003.

        ## First, drop the "003" field from the record:
        try:
            del record["003"]
        except KeyError:
            ## There was no 003
            pass

        ## Now add a correct 003 (if desirable):
        if 1 in (options["insert-mode"], options["replace-mode"]):
            out += """%(sys)s%(own)s\n""" % { 'sys' : sysno,
                                              'own' : get_aleph_003() }

    ## delete 001 from the list of fields to output (if it exists):
    try:
        del record["001"]
    except KeyError:
        ## There was no 001
        pass

    ## Get the fields of this record, and order them correctly (using the same
    ## order as that of the original MARC XML file):
    fields = []
    tags = record.keys()
    tags.sort()
    for tag in tags:
        for field in record[tag]:
            fields.append((tag, field))

    ## Finally, loop through all fields and display them in the record:
    for field in fields:
        ## Should the field-name be changed?
        try:
            fieldname = fieldname_changes[str(field[0])]
        except KeyError:
            ## Don't change this fieldname:
            fieldname = field[0]
        ## get the subfields, etc, for this field:
        fielddata = field[1]

        ## Create the MARC lines for this field:
        field_lines = create_field_lines(fieldname, \
                                         fielddata, \
                                         sysno, \
                                         options["aleph-marc"])

        ## Now create the formatted MARC lines:
        out += print_field(field_lines, options["aleph-marc"])
    ## Return the formatted MARC record:
    return out
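# --- Hedged usage sketch (not part of the original example) ---
# Illustrates the "options" dictionary shape read by create_marc_record()
# above ("text-marc", "aleph-marc" and the various *-mode flags). The record
# stub and system number are hypothetical placeholders; a real record would
# come from bibrecord's create_record(), and the commented-out call relies on
# helpers (create_field_lines, print_field, ...) defined elsewhere in the
# original module.
example_options = {
    "text-marc":    1,   # produce text-MARC output
    "aleph-marc":   0,   # not ALEPH-MARC
    "insert-mode":  1,
    "replace-mode": 0,
    "correct-mode": 0,
    "append-mode":  0,
    "delete-mode":  0,
}
# Minimal bibrec-style stub carrying only a 001 controlfield:
example_record = {"001": [([], " ", " ", "123456", 1)]}
# marc_text = create_marc_record(example_record, "000123456", example_options)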
Example #24
def validate_matches(bibmatch_recid, record, server, result_recids, \
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    that satisfy the number of matching fields required for an exact match,
    and a similar list of fuzzy matches that have fewer matching fields than
    that threshold. Records that do not match at all are simply left out of
    the lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object pointing to the matched-record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to their ASCII representation
    @type ascii_mode: bool

    @return: tuple of two lists: exactly matching record IDs and fuzzily
        matching record IDs
    @rtype: tuple
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError("Bad configuration rule-set." \
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS" \
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])

    if collections:
        search_params = dict(p=query, of="xm", c=collections)
    else:
        search_params = dict(p=query, of="xm")
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)
    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting." \
                                          % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n" \
                              % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of matching record IDs
    current_index = 1
    for matched_record in found_record_list:
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" % \
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" % \
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset, \
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" % (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" % \
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))
        current_index += 1

    # Return list of matching record IDs
    return matches_found, fuzzy_matches_found
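# --- Hedged usage sketch (not part of the original example) ---
# How the (exact, fuzzy) pair returned above might be consumed. 'server'
# stands for an InvenioConnector instance, 'original_record' for a bibrec
# structure and 'candidate_recids' for recids from an earlier search step;
# all three are placeholders, not names defined in this module.
#
# exact_ids, fuzzy_ids = validate_matches(bibmatch_recid=1,
#                                         record=original_record,
#                                         server=server,
#                                         result_recids=candidate_recids,
#                                         verbose=9)
# if exact_ids:
#     pass   # confirmed duplicate(s): exact validation succeeded
# elif fuzzy_ids:
#     pass   # partial agreement only: queue for manual inspection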
Example #25
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record against the matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all their subfields or values match. ALL matching
    strategies must succeed for a match to be validated (except for 'joker' mode).

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the validation rule-set: an iterable of (field_tags,
        threshold, compare_mode, match_mode, result_mode) tuples
    @type ruleset: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to their ASCII representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done. At least two
        successful matches must be done unless a joker or final match is found
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison\
 mode '%s' as '%s' result with threshold %0.2f\n" \
                             % (field_tag_list, compare_mode, match_mode, \
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if (len(original_record_values) == 0 or len(matched_record_values) == 0):
            # At least one of the records has no values for this field, ignore.
            if verbose > 8:
                sys.stderr.write("\nAt least one record does not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        matches_needed = 0
        # How many field-value matches are needed for successful validation of this record
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        comparison_function = None
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values, \
                                                   matched_record_values, \
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons, \
                                                               threshold, \
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" % \
                                 (str(original_record_values), \
                                  str(matched_record_values), \
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 to indicate an exact match when final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
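# --- Hedged illustration (not part of the original example) ---
# The loop above unpacks every ruleset entry as a 5-tuple
# (field_tags, threshold, compare_mode, match_mode, result_mode), so a
# concrete ruleset could look like the assumed example below (the tags and
# thresholds are illustrative, not a shipped configuration):
example_ruleset = [
    # fuzzy title comparison; success or failure only feeds the final ratio
    ("245__a",        0.8, "normal", "title",      "normal"),
    # report numbers must agree; a mismatch ends validation immediately
    ("037__a,088__a", 1.0, "lazy",   "identifier", "final"),
    # a single matching DOI is enough to declare the records identical
    ("0247_a",        1.0, "lazy",   "identifier", "joker"),
]
# validate_match(original_bibrec, candidate_bibrec, example_ruleset) would
# then return a float in [0.0, 1.0]; 1.0 is treated as an exact match by
# validate_matches() above.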
Example #26
def generate_ticket(ticket, record):
    """
    Generates a ticket to be created, filling subject, body and queue values
    of the passed BibCatalogTicket object. The enriched object is returned.

    @param ticket: a ticket object as created by BibCatalogTicket(), whose
                   subject, body and queue will be filled in.
    @type ticket: BibCatalogTicket object.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: dict (bibrec structure).

    @return: the modified ticket object to create.
    @rtype: BibCatalogTicket
    """
    title_code = load_tag_code_from_name("title")
    abstract_code = load_tag_code_from_name("abstract")

    try:
        date_code = load_tag_code_from_name("date")
    except BibCatalogTagNotFound:
        date_code = load_tag_code_from_name("year")

    category_code = load_tag_code_from_name("subject")

    try:
        notes_code = load_tag_code_from_name("note")
    except BibCatalogTagNotFound:
        notes_code = load_tag_code_from_name("comment")

    first_author_code = load_tag_code_from_name("first author name")
    additional_author_code = load_tag_code_from_name("additional author name")

    try:
        external_id_code = load_tag_code_from_name("ext system ID")
    except BibCatalogTagNotFound:
        external_id_code = load_tag_code_from_name("primary report number")

    # List of extra info to print in the ticket.
    extra_info = []
    recid = record_id_from_record(record)

    arxiv_id = _get_minimal_arxiv_id(record, external_id_code)
    if arxiv_id:
        # We have an arxiv id - we can add special info:
        extra_info.append("ABSTRACT: http://arxiv.org/abs/%s" % (arxiv_id,))
        extra_info.append("PDF: http://arxiv.org/pdf/%s" % (arxiv_id,))

        categories = record_get_value_with_provenence(record=record,
                                                      provenence_code="2",
                                                      provenence_value="arXiv",
                                                      **split_tag_code(category_code))
        comments = record_get_value_with_provenence(record=record,
                                                    provenence_code="9",
                                                    provenence_value="arXiv",
                                                    **split_tag_code(notes_code))
        external_ids = arxiv_id
        subject = "ARXIV:" + arxiv_id
    else:
        # Not an arXiv record - let's get generic info
        categories = record_get_value_with_provenence(record=record,
                                                      provenence_code="2",
                                                      provenence_value="SzGeCERN",
                                                      **split_tag_code(category_code))
        comments = record_get_field_values(rec=record,
                                           **split_tag_code(notes_code))
        external_id_list = record_get_field_values(rec=record,
                                                   **split_tag_code(external_id_code))
        external_ids = ", ".join(external_id_list)
        subject = "Record #%s %s" % (recid, external_ids)

    authors = record_get_field_values(record, **split_tag_code(first_author_code)) + \
              record_get_field_values(record, **split_tag_code(additional_author_code))

    text = """
%(submitdate)s

External IDs: %(external_ids)s

Title: %(title)s

Authors: %(authors)s

Categories: %(categories)s

Comments: %(comments)s

%(abstract)s

%(extra_info)s

Edit the record now: %(editurl)s

""" \
    % {
        'external_ids': external_ids,
        'submitdate': record_get_field_value(record, **split_tag_code(date_code)),
        'extra_info': "\n".join(extra_info),
        'title': record_get_field_value(record, **split_tag_code(title_code)),
        'comments': "; ".join(comments),
        'categories': " ".join(categories),
        'authors': " / ".join(authors[:10]),
        'abstract': record_get_field_value(record, **split_tag_code(abstract_code)),
        'editurl': "%s/record/edit/%s" % (CFG_SITE_URL, recid),
    }
    ticket.subject = subject
    # To avoid errors with string formatting later, we are escaping %'s
    ticket.body = text.replace('%', '%%')
    ticket.queue = "Test"
    return ticket
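# --- Hedged usage sketch (not part of the original example) ---
# How the enriched ticket might be obtained; 'marcxml' is a placeholder for
# the MARCXML source, and create_record / BibCatalogTicket are assumed to be
# importable from the surrounding bibrecord / BibCatalog modules.
#
# record = create_record(marcxml)[0]
# ticket = generate_ticket(BibCatalogTicket(), record)
# ticket.subject   # e.g. "ARXIV:1234.5678" or "Record #<recid> <external ids>"
# ticket.queue     # "Test", as hard-coded above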