Example #1
0
    def _filter_records_fields(self, records_xml, output_fields):
        """Leaves in the records only fields that are necessary.
        All the other fields are removed from the records.

        @param records_xml: MARC XML containing all the information about the records
        @param output_fields: list of fields that should remain in the records
            (not modified; a private copy is used internally)

        @return: MARC XML with records containing only fields that are
        in output_fields list.
        """
        # Work on a copy so the caller's list is not mutated -- the original
        # appended to output_fields in place, so repeated calls kept growing
        # the caller's list.
        # Add 001/970 to the output fields. 970 is necessary for system number
        # extraction when exporting in aleph marc. When we add more formats,
        # we can add it optionally only when exporting aleph marc.
        output_fields = list(output_fields) + ["001", "970"]

        records = bibrecord.create_records(records_xml)
        output_records = []

        # create_records yields (record, status_code, errors) tuples; only
        # the record structure is needed here.
        for record, _status_code, _errors in records:
            record = self._filter_fields(record, output_fields)
            # do not return empty records
            if not self._is_record_empty(record):
                output_records.append(record)

        return bibrecord.print_recs(output_records)
Example #2
0
    def _filter_records_fields(self, records_xml, output_fields):
        """Keep only the requested fields in every record.

        Fields not listed in ``output_fields`` are stripped from each
        record before the records are serialised back to MARC XML.

        @param records_xml: MARC XML containing all the information about the records
        @param output_fields: list of fields that should remain in the records

        @return: MARC XML with records containing only fields that are
        in output_fields list.
        """
        # 001/970 must always survive the filtering: 970 is needed for
        # system number extraction when exporting aleph marc. When more
        # formats are added, this could become conditional on aleph export.
        for mandatory_tag in ("001", "970"):
            output_fields.append(mandatory_tag)

        filtered_records = []
        for parsed in bibrecord.create_records(records_xml):
            current_record = self._filter_fields(parsed[0], output_fields)
            # skip records that became empty after filtering
            if self._is_record_empty(current_record):
                continue
            filtered_records.append(current_record)

        return bibrecord.print_recs(filtered_records)
Example #3
0
File: engine.py  Project: mhellmic/b2share
def _create_marc(records_xml):
    """Creates MARC from MARCXML.

    @param records_xml: MARCXML containing information about the records

    @return: string containing information about the records
    in MARC format
    """
    aleph_marc_output = ""

    records = bibrecord.create_records(records_xml)
    for (record, status_code, list_of_errors) in records:
        # Extract the system number (recid) from the record instead of
        # passing an empty string: text-marc output uses it to label every
        # line of the produced record. This matches the method variant of
        # this converter elsewhere in the codebase.
        sysno_options = {"text-marc": 1}
        sysno = xmlmarc2textmarc.get_sysno_from_record(record, sysno_options)

        options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                   "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                   "text-marc": 1}

        aleph_record = xmlmarc2textmarc.create_marc_record(record,
                                                           sysno,
                                                           options)
        aleph_marc_output += aleph_record

    return aleph_marc_output
Example #4
0
    def _create_marc(self, records_xml):
        """Convert MARCXML into textual MARC.

        @param records_xml: MARCXML containing information about the records

        @return: string containing information about the records
        in MARC format
        """
        marc_chunks = []

        for parsed_record, _status, _errors in bibrecord.create_records(records_xml):
            # The system number (recid) labels each line of text-marc output.
            sysno = xmlmarc2textmarc.get_sysno_from_record(
                parsed_record, {"text-marc": 1})
            conversion_options = {
                "aleph-marc": 0,
                "correct-mode": 1,
                "append-mode": 0,
                "delete-mode": 0,
                "insert-mode": 0,
                "replace-mode": 0,
                "text-marc": 1
            }
            marc_chunks.append(
                xmlmarc2textmarc.create_marc_record(
                    parsed_record, sysno, conversion_options))

        return "".join(marc_chunks)
Example #5
0
def _check_client_can_submit_file(client_ip="",
                                  metafile="",
                                  req=None,
                                  webupload=0,
                                  ln=CFG_SITE_LANG):
    """
    Is this client able to upload such a FILENAME?
    check 980 $a values and collection tags in the file to see if they are among the
    permitted ones as specified by CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION.
    Useful to make sure that the client does not override other records by
    mistake.

    @param client_ip: IP of the client, used to look up its permitted collections
    @param metafile: MARCXML content to be checked
    @param req: request object, passed to the authorization check when webupload
    @param webupload: when true, return (code, message) tuples instead of bools
    @param ln: language for the localized error message

    @return: bool when webupload is false, (auth_code, message) tuple otherwise
    """
    _ = gettext_set_language(ln)
    recs = create_records(metafile, 0, 0)
    user_info = collect_user_info(req)

    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    # A wildcard entry authorizes this client for every collection.
    if '*' in permitted_dbcollids:
        if not webupload:
            return True
        return (0, " ")

    filename_tag980_values = _detect_980_values_from_marcxml_file(recs)
    for filename_tag980_value in filename_tag980_values:
        if not filename_tag980_value:
            if not webupload:
                return False
            return (1, "Invalid collection in tag 980")
        if not webupload:
            # PEP 8: "x not in y" instead of "not x in y".
            if filename_tag980_value not in permitted_dbcollids:
                return False
        else:
            # The message part of the authorization result is unused; only
            # the code decides the outcome.
            auth_code, dummy_auth_message = acc_authorize_action(
                req, 'runbatchuploader', collection=filename_tag980_value)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                            {'x_user': user_info['nickname'], 'x_coll': filename_tag980_value}
                return (auth_code, error_msg)

    filename_rec_id_collections = _detect_collections_from_marcxml_file(recs)

    for filename_rec_id_collection in filename_rec_id_collections:
        if not webupload:
            if filename_rec_id_collection not in permitted_dbcollids:
                return False
        else:
            auth_code, dummy_auth_message = acc_authorize_action(
                req, 'runbatchuploader', collection=filename_rec_id_collection)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                            {'x_user': user_info['nickname'], 'x_coll': filename_rec_id_collection}
                return (auth_code, error_msg)
    if not webupload:
        return True
    return (0, " ")
Example #6
0
File: engine.py  Project: ffelsner/invenio
def _check_client_can_submit_file(client_ip="", metafile="", req=None, webupload=0, ln=CFG_SITE_LANG):
    """
    Decide whether this client may upload the given metadata file.

    The 980 $a values and the collection tags found in the file are checked
    against CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION, so
    a client cannot accidentally override records of other collections.
    Returns a bool when webupload is false and a (code, message) tuple
    otherwise.
    """
    _ = gettext_set_language(ln)
    parsed_records = create_records(metafile, 0, 0)
    user_info = collect_user_info(req)

    allowed_collections = _get_client_authorized_collections(client_ip)
    if '*' in allowed_collections:
        # Wildcard: everything is permitted for this client.
        return True if not webupload else (0, " ")

    def _authorize(collection_name):
        # Run the access check; on failure build the localized error tuple.
        code, dummy_message = acc_authorize_action(req, 'runbatchuploader', collection=collection_name)
        if code != 0:
            message = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                      {'x_user': user_info['nickname'], 'x_coll': collection_name}
            return (code, message)
        return None

    for tag980_value in _detect_980_values_from_marcxml_file(parsed_records):
        if not tag980_value:
            return False if not webupload else (1, "Invalid collection in tag 980")
        if not webupload:
            if tag980_value not in allowed_collections:
                return False
        else:
            failure = _authorize(tag980_value)
            if failure is not None:
                return failure

    for recid_collection in _detect_collections_from_marcxml_file(parsed_records):
        if not webupload:
            if recid_collection not in allowed_collections:
                return False
        else:
            failure = _authorize(recid_collection)
            if failure is not None:
                return failure

    return True if not webupload else (0, " ")
Example #7
0
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs.

    One OaiHARVESTLOG row is added to the session per record found in
    the given MARCXML string.
    @param task_id bibupload task id
    """
    for record_tuple in create_records(xml_content):
        oai_identifier = record_extract_oai_id(record_tuple[0])
        log_entry = OaiHARVESTLOG()
        log_entry.id_oaiHARVEST = oai_src_id
        log_entry.oai_id = oai_identifier
        log_entry.date_harvested = datetime.now()
        log_entry.bibupload_task_id = task_id
        db.session.add(log_entry)
Example #8
0
File: dblayer.py  Project: k3njiy/invenio
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs
    @param task_id bibupload task id
    """
    parsed = create_records(xml_content)
    for rec in parsed:
        # rec is a (record, status, errors) tuple; the OAI id lives in
        # the record structure itself.
        harvested_id = record_extract_oai_id(rec[0])
        entry = OaiHARVESTLOG()
        entry.id_oaiHARVEST = oai_src_id
        entry.oai_id = harvested_id
        entry.date_harvested = datetime.now()
        entry.bibupload_task_id = task_id
        db.session.add(entry)
Example #9
0
File: engine.py  Project: ffelsner/invenio
def perform_basic_upload_checks(xml_record):
    """Run the sanity checks that would make the bibupload task exit with
    status 1, so batchupload can warn the user about the issue instead of
    crashing.
    """
    from invenio.legacy.bibupload.engine import writing_rights_p

    problems = []
    if not writing_rights_p():
        problems.append(
            "Error: BibUpload does not have rights to write fulltext files.")
    recs = create_records(xml_record, 1, 1)
    if recs == []:
        problems.append("Error: Cannot parse MARCXML file.")
    elif recs[0][0] is None:
        # Parsed, but the first record structure is missing -> malformed XML.
        problems.append("Error: MARCXML file has wrong format: %s" % recs)
    return problems
def get_records():
    """Fetch records either from file or from StdIn"""
    try:
        with codecs.open(sys.argv[1], encoding='utf-8', mode='r') as handle:
            input_xml = handle.read()
    except Exception:
        # No readable file argument -> fall back to standard input.
        input_xml = sys.stdin.read()

    records_out = []
    for record, code, errors in create_records(input_xml):
        # Any status other than 1 means the record failed to parse.
        if code != 1:
            raise ValueError("Record Error: %s%s" % (str(record)[:30], str(errors)))
        records_out.append(record)
    _print(TF.YELLOW + "Processing %d records" % len(records_out) + TF.END)
    return records_out, input_xml
Example #11
0
def perform_basic_upload_checks(xml_record):
    """Run the sanity checks that would make the bibupload task exit with
    status 1, so batchupload can warn the user about the issue instead of
    crashing.
    """
    from invenio.legacy.bibupload.engine import writing_rights_p

    found_errors = []
    if not writing_rights_p():
        found_errors.append(
            "Error: BibUpload does not have rights to write fulltext files.")
    recs = create_records(xml_record, 1, 1)
    if recs == []:
        found_errors.append("Error: Cannot parse MARCXML file.")
    elif recs[0][0] is None:
        # Parsing produced tuples but no record structure: malformed MARCXML.
        found_errors.append("Error: MARCXML file has wrong format: %s" % recs)
    return found_errors
Example #12
0
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    Looks for the record's identifiers (001 controlfield, OAI id tag and
    system-number tag) in each file, either via fast regular expressions or
    via a precise bibrecord parse, depending on
    CFG_BIBEDIT_QUEUE_CHECK_METHOD.

    @param recid: id of the record to look for
    @param filenames: iterable of XML file paths to scan
    @return: True as soon as one readable file contains the record, else False
    """
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            if CFG_BIBEDIT_QUEUE_CHECK_METHOD == 'regexp':
                # check via regexp: this is fast, but may not be precise.
                # Use a context manager so the handle is closed promptly
                # (the original left it to the garbage collector).
                with open(filename) as file_:
                    file_content = file_.read()
                re_match_001 = re.compile(
                    '<controlfield tag="001">%s</controlfield>' % (recid))
                if re_match_001.search(file_content):
                    return True
                for rec_oaiid in rec_oaiid_tag:
                    re_match_oaiid = re.compile(
                        r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                        % (OAIID_TAG[0:3], re.escape(rec_oaiid)))
                    if re_match_oaiid.search(file_content):
                        return True
                re_match_sysno = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (SYSNO_TAG[0:3], re.escape(str(rec_sysno))))
                if rec_sysno_tag:
                    if re_match_sysno.search(file_content):
                        return True
            else:
                # by default, check via bibrecord: this is accurate, but may
                # be slow. The original never closed the handle when a match
                # returned early; the with-statement fixes that leak.
                with open(filename) as file_:
                    records = create_records(file_.read(), 0, 0)
                for record_tuple in records:
                    record, all_good = record_tuple[:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid, rec_oaiid,
                                            rec_sysno):
                            return True
        except IOError:
            # Unreadable file: skip it and keep scanning the rest.
            continue
    return False
Example #13
0
File: utils.py  Project: jiangmin9/invenio
def _record_in_files_p(recid, filenames):
    """Search XML files for given record.

    Looks for the record's identifiers (001 controlfield, OAI id tag and
    system-number tag) in each file and returns True as soon as one file
    appears to contain the record.

    @param recid: id of the record to look for
    @param filenames: iterable of XML file paths to scan
    @return: True if a match is found in any readable file, else False
    """
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            if CFG_BIBEDIT_QUEUE_CHECK_METHOD == "regexp":
                # check via regexp: this is fast, but may not be precise
                # NOTE(review): the file handle below is never closed
                # explicitly; it is left to the garbage collector.
                file_content = open(filename).read()
                re_match_001 = re.compile('<controlfield tag="001">%s</controlfield>' % (recid))
                if re_match_001.search(file_content):
                    return True
                for rec_oaiid in rec_oaiid_tag:
                    re_match_oaiid = re.compile(
                        r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                        % (OAIID_TAG[0:3], re.escape(rec_oaiid))
                    )
                    if re_match_oaiid.search(file_content):
                        return True
                re_match_sysno = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (SYSNO_TAG[0:3], re.escape(str(rec_sysno)))
                )
                if rec_sysno_tag:
                    if re_match_sysno.search(file_content):
                        return True
            else:
                # by default, check via bibrecord: this is accurate, but may be slow
                # NOTE(review): if _record_has_id_p matches, the function
                # returns before file_.close() runs -- the handle leaks here.
                file_ = open(filename)
                records = create_records(file_.read(), 0, 0)
                for i in range(0, len(records)):
                    record, all_good = records[i][:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
                            return True
                file_.close()
        except IOError:
            # Unreadable file: skip it and keep scanning the rest.
            continue
    return False
Example #14
0
File: dblayer.py  Project: mhellmic/b2share
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs.

    One OaiHARVESTLOG row is added and committed per record found in the
    given MARCXML string; any failure is reported but not re-raised.
    @param task_id bibupload task id
    """
    try:
        records = create_records(xml_content)
        for record in records:
            oai_id = record_extract_oai_id(record[0])
            my_new_harvest_log = OaiHARVESTLOG()
            my_new_harvest_log.id_oaiHARVEST = oai_src_id
            my_new_harvest_log.oai_id = oai_id
            # BUGFIX: call now() -- the original assigned the bound method
            # itself (datetime.datetime.now), storing a function object
            # instead of the current timestamp.
            my_new_harvest_log.date_harvested = datetime.datetime.now()
            my_new_harvest_log.bibupload_task_id = task_id
            db.session.add(my_new_harvest_log)
            db.session.commit()
    except Exception as msg:
        print("Logging exception : %s   " % (str(msg),))
Example #15
0
def recxml2recmarc(xmltext, options, sysno_generator=get_sysno_generator()):
    """The function that processes creating the records from
       an XML string, and prints these records to the
       standard output stream.
       @param xmltext: An XML MARC record in string form.
       @param options: Various options about the record to be
        created, as passed from the command line.
       @param sysno_generator: A static parameter to act as an Aleph
        system number generator. Do not provide a value for this - it
        will be assigned upon first call to this function.
    """
    rec_count = 0  ## Counter used to record the number of the rec
                   ## that is being processed. Used in error messages
                   ## for the user, when a record cannot be processed

    ## create internal records structure from xmltext:
    records = create_records(xmltext, 1, 1)

    ## now loop through each record, get its sysno, and convert it:
    for rec_tuple in records:
        rec_count += 1
        ## Get the record-dictionary itself from the record-tuple:
        record = rec_tuple[0]

        if record is None:
            ## if the record is None, there was probably a problem
            ## with the MARC XML. Display a warning message on stderr and
            ## move past this record:
            sys.stderr.write("E: Unable to process record number %s; The XML " \
                             " may be broken for this record.\n" \
                             % str(rec_count))
            continue

        ## From the record, get the SYS if running in ALEPH-MARC mode, or
        ## the recid (001) if running in TEXT-MARC mode:
        sysno = get_sysno_from_record(record, options)

        if sysno is None:
            ## No 'sysno' was found in the record:
            if options["text-marc"] == 1:
                ## 'sysno' (001) (which is actually the recid) is mandatory
                ## for the creation of TEXT-MARC. Report the error and skip
                ## past the record:
                sys.stderr.write("E: Record number %s has no 'recid' (001). " \
                                 "This field is mandatory for the " \
                                 "creation of TEXT MARC. The record has been " \
                                 "skipped.\n" % str(rec_count))
                continue
            elif options["aleph-marc"] ==  1 and \
                     1 in (options["append-mode"], options["delete-mode"], \
                           options["correct-mode"], options["replace-mode"]):
                ## When creating ALEPH-MARC that will be used to manipulate
                ## a record in some way (i.e. correct, append, delete, replace),
                ## the ALEPH SYS (970__a in MARC XML) is mandatory. Report the
                ## error and skip past the record:
                sys.stderr.write("E: Record number %s has no ALEPH 'SYS' " \
                                 "(970__a). This field is mandatory for the " \
                                 "creation of ALEPH MARC that is used for the" \
                                 " manipulation of records (i.e. replace, " \
                                 "correct, append, delete). The record has " \
                                 "been skipped.\n" % str(rec_count))
                continue
        elif options["aleph-marc"] == 1 and type(sysno) in (list, tuple):
            ## multiple values for SYS (970__a) in ALEPH-MARC mode are not
            ## permitted. Report the error and skip past the record:
            sys.stderr.write("E: Multiple values have been found for the " \
                             "ALEPH SYS (970__a) in record number %s. This " \
                             "is not permitted when running in ALEPH-MARC " \
                             "mode. The record has been skipped." \
                             % str(rec_count))
            continue

        if options["aleph-marc"] == 1 and options["insert-mode"] == 1:
            ## Creating an ALEPH "insert" record. Since the resulting record
            ## should be treated as a new insert into ALEPH, any 'sysno' that
            ## may have been found in the MARC XML record cannot be used -
            ## that would be dangerous. Therefore, set 'sysno' to None and
            ## create a random sysno:
            sysno = None
            try:
                ## Use the next() builtin: it works on both Python 2.6+ and
                ## Python 3, whereas the generator.next() method used before
                ## was removed in Python 3.
                sysno = next(sysno_generator)
            except StopIteration:
                ## generator counter has overstepped the MAX ALEPH SYS!
                ## Without a SYS, we cannot create ALEPH MARC
                sys.stderr.write("""E: Maximum ALEPH SYS has been """ \
                                 """reached - unable to continue.\n""")
                sys.exit(1)

        ## No problems were encountered with SYS or recid. Display the
        ## translated record:
        rec_out = create_marc_record(record, sysno, options)
        sys.stdout.write(rec_out)
        sys.stdout.flush()
Example #16
0
def main():
    """Command-line entry point: check the validity of a MARCXML file."""
    cmdusage = """Usage: %s [options] <marcxmlfile>
    General options:
      -h, --help            Print this help.
      -V, --version         Print version information.
      -v, --verbose=LEVEL   Verbose level (from 0 to 9, default 0).
    Description: checks the validity of MARCXML file.
    """ % (sys.argv[0])

    verbose = 0
    badrecords = []
    listofrecs = []

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hVv:",
                                   ["help", "version", "verbose="])
    except getopt.GetoptError:
        print(cmdusage)
        sys.exit(2)

    for opt in opts:
        if opt[0] in ("-V", "--version"):
            print(__revision__)
            sys.exit(0)
        elif opt[0] in ("-h", "--help"):
            sys.stderr.write(cmdusage)
            sys.exit(0)
        elif opt[0] in ("-v", "--verbose"):
            try:
                # int() replaces string.atoi(), which was removed in Python 3.
                verbose = int(opt[1])
            except ValueError:
                print("[ERROR] verbose must be an integer.")
                sys.exit(2)

    try:
        xmlfile = args[0]
    except IndexError:
        sys.stderr.write(cmdusage)
        sys.exit(0)

    try:
        # Close the file deterministically instead of leaking the handle.
        with open(xmlfile, 'r') as xml_fd:
            xmltext = xml_fd.read()
    except IOError:
        print("[ERROR] File %s not found." % xmlfile)
        sys.exit(1)

    listofrecs = create_records(xmltext, 0, 1)
    # Materialise as a list: the original used filter()/map(), whose lazy
    # Python 3 iterators are always truthy, so "if badrecords:" below could
    # never be False even when no record was bad.
    badrecords = [rec[0] for rec in listofrecs if rec[1] == 0]

    s = ''
    errors = []

    if xmltext and not listofrecs:
        print("[ERROR] No valid record detected.")
        sys.exit(1)

    if verbose:
        if verbose <= 3:
            errors.extend(rec[2] for rec in listofrecs)
        else:
            s = print_recs(badrecords)
            errors.extend(rec[2] for rec in listofrecs)
    else:
        if badrecords:
            print("[ERROR] Bad records detected.  For more information, increase verbosity.")
            print("\n[INFO] You may also want to run `xmllint %s' to help " \
                  "localise errors in the input file." % xmlfile)
            sys.exit(1)

    errors = [error for error in errors if error]

    if s or errors:
        if s:
            print(s)
        for error in errors:
            print("[ERROR]", error)
        print("[INFO] You may also want to run `xmllint %s' to help " \
              "localise errors in the input file." % xmlfile)
        sys.exit(1)
Example #17
0
def validate_matches(bibmatch_recid, record, server, result_recids, \
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and a
    similar list for fuzzy matches that has less fields matching then the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: list of record IDs matched
    @rtype: list
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError("Bad configuration rule-set." \
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS" \
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec.
    # One query fetches every candidate at once by OR-ing their recids.
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])

    if collections:
        search_params = dict(p=query, of="xm", c=collections)
    else:
        search_params = dict(p=query, of="xm")
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)
    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting." \
                                          % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n" \
                              % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of matching record IDs
    current_index = 1
    for matched_record in found_record_list:
        # NOTE(review): assumes every fetched record carries a 001 field;
        # the [0] below raises IndexError otherwise -- confirm upstream
        # always returns records with a recid.
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" % \
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" % \
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset, \
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" % (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" % \
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))
        current_index += 1

    # Return list of matching record IDs
    return matches_found, fuzzy_matches_found
Example #18
0
    def get_number_of_records_found(self):
        """Return how many records the stored search result contains."""
        return len(bibrecord.create_records(self._result))
Example #19
0
    def get_number_of_records_found(self):
        """Returns the number of records in the result"""
        parsed_records = bibrecord.create_records(self._result)
        record_total = len(parsed_records)
        return record_total
Example #20
0
    def _author_list(obj, eng):
        """Extract an author list from the record's arXiv tarball.

        Downloads (or reuses) the arXiv tarball, untars it, scans the
        contained XML files for an authorlist and, when found, converts it
        to MARCXML and stores the authors on ``obj.data``.

        @param obj: workflow object carrying record data and extra_data
        @param eng: workflow engine (provides uuid and a logger)
        """
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            # No tarball cached from a previous task: fetch it from arXiv
            # into a per-workflow-run directory.
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball,
                                      cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            # Extraction timed out: log it, but still try whatever files
            # may already be present in sub_dir.
            eng.log.error('Timeout during tarball extraction on %s' % (
                obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            xml_file_fd = open(xml_file, "r")
            xml_content = xml_file_fd.read()
            xml_file_fd.close()

            # NOTE(review): REGEXP_AUTHLIST and stylesheet come from the
            # enclosing scope -- not visible here; confirm they are defined
            # where this task is built.
            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    # NOTE(review): when parsing failed ([0][0] is None) the
                    # error is logged but execution continues with a None
                    # record -- record_xml_output(None) below may fail;
                    # a continue/return here looks intended. Confirm.
                    if authorlist_record[0][0] is None:
                        eng.log.error("Error parsing authorlist record for id: %s" % (
                            identifiers,))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }]
                    )
                    obj.update_task_results(
                        "number_of_authors",
                        [{
                            "name": "number_of_authors",
                            "results": new_dict_representation["number_of_authors"]
                        }]
                    )
                    break
Example #21
0
def recxml2recmarc(xmltext, options, sysno_generator=get_sysno_generator()):
    """Create records from an XML MARC string and print them to stdout.

       Each record is converted to TEXT-MARC or ALEPH-MARC (depending on
       *options*) and written to the standard output stream; records that
       cannot be processed are reported on stderr and skipped.

       @param xmltext: An XML MARC record in string form.
       @param options: Various options about the record to be
        created, as passed from the command line.
       @param sysno_generator: A static parameter to act as an Aleph
        system number generator. Do not provide a value for this - it
        will be assigned upon first call to this function.
    """
    rec_count = 0  ## Counter used to record the number of the rec
    ## that is being processed. Used in error messages
    ## for the user, when a record cannot be processed

    ## create internal records structure from xmltext:
    records = create_records(xmltext, 1, 1)

    ## now loop through each record, get its sysno, and convert it:
    for rec_tuple in records:
        rec_count += 1
        ## Get the record-dictionary itself from the record-tuple:
        record = rec_tuple[0]

        if record is None:
            ## if the record is None, there was probably a problem
            ## with the MARC XML. Display a warning message on stderr and
            ## move past this record:
            sys.stderr.write("E: Unable to process record number %s; The XML " \
                             " may be broken for this record.\n" \
                             % str(rec_count))
            continue

        ## From the record, get the SYS if running in ALEPH-MARC mode, or
        ## the recid (001) if running in TEXT-MARC mode:
        sysno = get_sysno_from_record(record, options)

        if sysno is None:
            ## No 'sysno' was found in the record:
            if options["text-marc"] == 1:
                ## 'sysno' (001) (which is actually the recid) is mandatory
                ## for the creation of TEXT-MARC. Report the error and skip
                ## past the record:
                sys.stderr.write("E: Record number %s has no 'recid' (001). " \
                                 "This field is mandatory for the " \
                                 "creation of TEXT MARC. The record has been " \
                                 "skipped.\n" % str(rec_count))
                continue
            elif options["aleph-marc"] == 1 and \
                     1 in (options["append-mode"], options["delete-mode"], \
                           options["correct-mode"], options["replace-mode"]):
                ## When creating ALEPH-MARC that will be used to manipulate
                ## a record in some way (i.e. correct, append, delete, replace),
                ## the ALEPH SYS (970__a in MARC XML) is mandatory. Report the
                ## error and skip past the record:
                sys.stderr.write("E: Record number %s has no ALEPH 'SYS' " \
                                 "(970__a). This field is mandatory for the " \
                                 "creation of ALEPH MARC that is used for the" \
                                 " manipulation of records (i.e. replace, " \
                                 "correct, append, delete). The record has " \
                                 "been skipped.\n" % str(rec_count))
                continue
        elif options["aleph-marc"] == 1 and type(sysno) in (list, tuple):
            ## multiple values for SYS (970__a) in ALEPH-MARC mode are not
            ## permitted. Report the error and skip past the record:
            sys.stderr.write("E: Multiple values have been found for the " \
                             "ALEPH SYS (970__a) in record number %s. This " \
                             "is not permitted when running in ALEPH-MARC " \
                             "mode. The record has been skipped." \
                             % str(rec_count))
            continue

        if options["aleph-marc"] == 1 and options["insert-mode"] == 1:
            ## Creating an ALEPH "insert" record. Since the resulting record
            ## should be treated as a new insert into ALEPH, any 'sysno' that
            ## may have been found in the MARC XML record cannot be used -
            ## that would be dangerous. Therefore, set 'sysno' to None and
            ## create a random sysno:
            sysno = None
            try:
                ## Use the builtin next() rather than the Python-2-only
                ## generator method .next(), for Python 3 compatibility.
                sysno = next(sysno_generator)
            except StopIteration:
                ## generator counter has overstepped the MAX ALEPH SYS!
                ## Without a SYS, we cannot create ALEPH MARC
                sys.stderr.write("""E: Maximum ALEPH SYS has been """ \
                                 """reached - unable to continue.\n""")
                sys.exit(1)

        ## No problems were encountered with SYS or recid. Display the
        ## translated record:
        rec_out = create_marc_record(record, sysno, options)
        sys.stdout.write(rec_out)
        sys.stdout.flush()
예제 #22
0
    def _author_list(obj, eng):
        """Extract an author list from the record's arXiv tarball.

        Downloads (or reuses) the tarball for the harvested record, untars
        it, scans the contained XML files for an authorlist and, on a match,
        converts it to MARCXML and stores the authors in ``obj.data`` and
        the task results.

        :param obj: BibWorkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(
            cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
            # Cache the freshly-downloaded tarball path; without this the
            # Timeout handler below would KeyError and reruns would
            # re-download the tarball.
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' %
                          (obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            # Context manager guarantees the file handle is closed even if
            # reading raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                # NOTE(review): `stylesheet` comes from an enclosing scope
                # that is not visible here -- confirm it is defined where
                # this task is constructed.
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error(
                            "Error parsing authorlist record for id: %s" %
                            (identifiers, ))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }])
                    obj.update_task_results("number_of_authors", [{
                        "name": "number_of_authors",
                        "results": new_dict_representation["number_of_authors"]
                    }])
                    break
예제 #23
0
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    Harvests (or reuses) the record's tarball, untars it, scans the
    contained XML files for an authorlist and, on a match, converts it to
    MARCXML, LaTeX-normalizes the author names and stores the result in
    ``obj.data`` and the task results.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: if no tarball could be harvested for the record
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        # Check for a failed harvest BEFORE stringifying: str(None) is the
        # truthy string "None", so checking after the conversion (as the
        # original did) could never raise.
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)), eng.uuid, id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        # Context manager guarantees the file handle is closed.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")

            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
                # Convert any LaTeX symbols in authornames
            translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                          + record_xml_output(authorlist_record) + '</collection>'
            # updated_xml always contains at least the XML header, so this
            # replaces the always-true `not None == updated_xml` check with
            # an equivalent, idiomatic truthiness test.
            if updated_xml:
                # We store the path to the directory  the tarball contents live
                # Read and grab MARCXML from plotextractor run
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = new_dict_representation[
                    "number_of_authors"]
                obj.add_task_result("authors", new_dict_representation["authors"])
                obj.add_task_result("number_of_authors",
                                    new_dict_representation["number_of_authors"])
                break
예제 #24
0
def author_list(obj, eng):
    """
    Performs the special authorlist extraction step (Mostly INSPIRE/CERN related).

    Harvests (or reuses) the record's tarball, untars it, concatenates every
    authorlist match found in the contained XML files, converts the result to
    MARCXML, LaTeX-normalizes author names and stores the authors in
    ``obj.data`` and the task results.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: if no tarball could be harvested for the record
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults

    identifiers = obj.data["system_number_external"]["value"]
    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"])
        # Check for a failed harvest BEFORE stringifying: str(None) is the
        # truthy string "None", so checking after the conversion (as the
        # original did) could never raise.
        if tarball is None:
            raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" % (identifiers, extract_path)),
                                                eng.uuid,
                                                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"], cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    authors = ""

    for xml_file in xml_files_list:
        # Context manager guarantees the file handle is closed.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            authors += match[0]
            # Generate file to store conversion results
    # Equality (not the identity operator `is not`) is the correct way to
    # compare against a string literal; `x is not ''` warns on Python 3.8+.
    if authors != '':
        authors = convert(authors, "authorlist2marcxml.xsl")
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" % (identifiers,))
            authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in authornames
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' + record_xml_output(authorlist_record) \
                      + '</collection>'
        # updated_xml always contains at least the XML header, so this
        # replaces the always-true `not None == updated_xml` check with an
        # equivalent, idiomatic truthiness test.
        if updated_xml:
            # We store the path to the directory  the tarball contents live
            # Read and grab MARCXML from plotextractor run
            new_dict_representation = records_api.create_record(updated_xml, master_format="marc").dumps()
            obj.data['authors'] = new_dict_representation["authors"]
            obj.data['number_of_authors'] = new_dict_representation["number_of_authors"]
            obj.add_task_result("authors", new_dict_representation["authors"])
            obj.add_task_result("number_of_authors", new_dict_representation["number_of_authors"])
예제 #25
0
def main():
    """Execute script: validate a MARCXML file given on the command line.

    Parses ``-h/--help`` and ``-v/--verbose`` options, reads the MARCXML
    file, runs it through ``create_records`` and reports any malformed
    records on stdout/stderr, exiting non-zero on error.
    """
    import sys

    cmdusage = """Usage: %s [options] <marcxmlfile>
    General options:
      -h, --help            Print this help.
      -v, --verbose=LEVEL   Verbose level (from 0 to 9, default 0).
    Description: checks the validity of MARCXML file.
    """ % (sys.argv[0])

    verbose = 0
    badrecords = []
    listofrecs = []

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hv:", ["help", "verbose="])
    except getopt.GetoptError:
        print(cmdusage)
        sys.exit(2)

    for opt in opts:
        if opt[0] in ("-h", "--help"):
            sys.stderr.write(cmdusage)
            sys.exit(0)
        elif opt[0] in ("-v", "--verbose"):
            try:
                # int() replaces the Python-2-only string.atoi().
                verbose = int(opt[1])
            except ValueError:
                print("[ERROR] verbose must be an integer.")
                sys.exit(2)

    try:
        xmlfile = args[0]
    except IndexError:
        sys.stderr.write(cmdusage)
        sys.exit(0)

    try:
        # Context manager guarantees the file handle is closed.
        with open(xmlfile, 'r') as xml_fd:
            xmltext = xml_fd.read()
    except IOError:
        print("[ERROR] File %s not found." % xmlfile)
        sys.exit(1)

    listofrecs = create_records(xmltext, 0, 1)
    # Build real lists: on Python 3, filter()/map() return lazy iterator
    # objects that are ALWAYS truthy, which would make the
    # `if badrecords:` check below fire even with zero bad records.
    badrecords = [rec[0] for rec in listofrecs if rec[1] == 0]

    s = ''
    errors = []

    if xmltext and not listofrecs:
        print("[ERROR] No valid record detected.")
        sys.exit(1)

    if verbose:
        if verbose <= 3:
            errors.extend([rec[2] for rec in listofrecs])
        else:
            s = print_recs(badrecords)
            errors.extend([rec[2] for rec in listofrecs])
    else:
        if badrecords:
            print(
                "[ERROR] Bad records detected.  For more information, increase verbosity."
            )
            print("\n[INFO] You may also want to run `xmllint %s' to help "
                  "localise errors in the input file." % xmlfile)
            sys.exit(1)

    errors = [error for error in errors if error]

    if s or errors:
        if s:
            print(s)
        for error in errors:
            print("[ERROR]", error)
        print("[INFO] You may also want to run `xmllint %s' to help "
              "localise errors in the input file." % xmlfile)
        sys.exit(1)