def _filter_records_fields(self, records_xml, output_fields):
    """Leaves in the records only fields that are necessary.

    All the other fields are removed from the records.

    @param records_xml: MARC XML containing all the information about the records
    @param output_fields: list of fields that should remain in the records

    @return: MARC XML with records containing only fields that are
    in the output_fields list.
    """
    # Add 001/970 to the output fields. 970 is necessary for system number
    # extraction when exporting in Aleph MARC. When we add more formats,
    # we can add it optionally only when exporting Aleph MARC.
    output_fields.append("001")
    output_fields.append("970")

    records = bibrecord.create_records(records_xml)
    output_records = []

    for (record, status_code, list_of_errors) in records:
        record = self._filter_fields(record, output_fields)
        # do not return empty records
        if not self._is_record_empty(record):
            output_records.append(record)

    output_xml = bibrecord.print_recs(output_records)
    return output_xml
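# A minimal consumption sketch for the (record, status_code, errors) triples
# that bibrecord.create_records() returns, as looped over above. The import
# path, driver function and prints are illustrative assumptions, not part of
# the original module.
def _example_create_records(marcxml):
    from invenio.legacy import bibrecord  # assumed import path

    for record, status_code, errors in bibrecord.create_records(marcxml):
        if status_code == 0 or record is None:
            # parsing failed for this record; errors holds the reasons
            print("parse problem: %s" % (errors,))
        else:
            print("tags present: %s" % sorted(record.keys()))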
def _create_marc(records_xml):
    """Creates MARC from MARCXML.

    @param records_xml: MARCXML containing information about the records

    @return: string containing information about the records in MARC format
    """
    aleph_marc_output = ""

    records = bibrecord.create_records(records_xml)
    for (record, status_code, list_of_errors) in records:
        sysno = ""
        options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                   "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                   "text-marc": 1}
        aleph_record = xmlmarc2textmarc.create_marc_record(record, sysno, options)
        aleph_marc_output += aleph_record

    return aleph_marc_output
def _create_marc(self, records_xml):
    """Creates MARC from MARCXML.

    @param records_xml: MARCXML containing information about the records

    @return: string containing information about the records in MARC format
    """
    aleph_marc_output = ""

    records = bibrecord.create_records(records_xml)
    for (record, status_code, list_of_errors) in records:
        sysno_options = {"text-marc": 1}
        sysno = xmlmarc2textmarc.get_sysno_from_record(record, sysno_options)
        options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                   "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                   "text-marc": 1}
        aleph_record = xmlmarc2textmarc.create_marc_record(record, sysno, options)
        aleph_marc_output += aleph_record

    return aleph_marc_output
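# A hedged standalone sketch of the same MARCXML-to-text-MARC conversion as
# the method above, using only module-level functions. The import paths are
# assumptions; the sysno lookup and options dict mirror the method body.
def _example_marcxml_to_textmarc(marcxml):
    from invenio.legacy import bibrecord  # assumed import paths
    from invenio.legacy.bibrecord import xmlmarc2textmarc

    output = ""
    for record, status_code, errors in bibrecord.create_records(marcxml):
        if record is None:
            continue  # skip records that failed to parse
        sysno = xmlmarc2textmarc.get_sysno_from_record(record, {"text-marc": 1})
        options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                   "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                   "text-marc": 1}
        output += xmlmarc2textmarc.create_marc_record(record, sysno, options)
    return output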
def _check_client_can_submit_file(client_ip="", metafile="", req=None,
                                  webupload=0, ln=CFG_SITE_LANG):
    """
    Is this client able to upload such a FILENAME?

    Check 980 $a values and collection tags in the file to see if they are
    among the permitted ones as specified by
    CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS and ACC_AUTHORIZE_ACTION. Useful to
    make sure that the client does not overwrite other records by mistake.
    """
    _ = gettext_set_language(ln)
    recs = create_records(metafile, 0, 0)
    user_info = collect_user_info(req)

    permitted_dbcollids = _get_client_authorized_collections(client_ip)
    if '*' in permitted_dbcollids:
        if not webupload:
            return True
        else:
            return (0, " ")

    filename_tag980_values = _detect_980_values_from_marcxml_file(recs)
    for filename_tag980_value in filename_tag980_values:
        if not filename_tag980_value:
            if not webupload:
                return False
            else:
                return (1, "Invalid collection in tag 980")
        if not webupload:
            if filename_tag980_value not in permitted_dbcollids:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(
                req, 'runbatchuploader', collection=filename_tag980_value)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                    {'x_user': user_info['nickname'],
                     'x_coll': filename_tag980_value}
                return (auth_code, error_msg)

    filename_rec_id_collections = _detect_collections_from_marcxml_file(recs)
    for filename_rec_id_collection in filename_rec_id_collections:
        if not webupload:
            if filename_rec_id_collection not in permitted_dbcollids:
                return False
        else:
            auth_code, auth_message = acc_authorize_action(
                req, 'runbatchuploader', collection=filename_rec_id_collection)
            if auth_code != 0:
                error_msg = _("The user '%(x_user)s' is not authorized to modify collection '%(x_coll)s'") % \
                    {'x_user': user_info['nickname'],
                     'x_coll': filename_rec_id_collection}
                return (auth_code, error_msg)

    if not webupload:
        return True
    else:
        return (0, " ")
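# Hedged examples of the two calling conventions above: robot uploads get a
# boolean, the web interface gets a (code, message) tuple. The IP address,
# request object and marcxml variable are illustrative assumptions.
#
#   _check_client_can_submit_file(client_ip="127.0.0.1", metafile=marcxml)
#       # -> True or False
#   _check_client_can_submit_file(req=req, metafile=marcxml, webupload=1)
#       # -> (0, " ") on success, (auth_code, error_msg) otherwise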
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs

    @param task_id: bibupload task id
    """
    records = create_records(xml_content)
    for record in records:
        oai_id = record_extract_oai_id(record[0])
        my_new_harvest_log = OaiHARVESTLOG()
        my_new_harvest_log.id_oaiHARVEST = oai_src_id
        my_new_harvest_log.oai_id = oai_id
        my_new_harvest_log.date_harvested = datetime.now()
        my_new_harvest_log.bibupload_task_id = task_id
        db.session.add(my_new_harvest_log)
def perform_basic_upload_checks(xml_record):
    """
    Performs tests that would provoke the bibupload task to fail with
    an exit status 1, to prevent batchupload from crashing while
    alerting the user about the issue
    """
    from invenio.legacy.bibupload.engine import writing_rights_p

    errors = []
    if not writing_rights_p():
        errors.append("Error: BibUpload does not have rights to write fulltext files.")
    recs = create_records(xml_record, 1, 1)
    if recs == []:
        errors.append("Error: Cannot parse MARCXML file.")
    elif recs[0][0] is None:
        errors.append("Error: MARCXML file has wrong format: %s" % recs)
    return errors
def get_records():
    """Fetch records either from file or from StdIn"""
    try:
        with codecs.open(sys.argv[1], encoding='utf-8', mode='r') as handle:
            input_xml = handle.read()
    except Exception:
        input_xml = sys.stdin.read()

    records_out = []
    for record, code, errors in create_records(input_xml):
        if code != 1:
            msg = "Record Error: %s%s" % (str(record)[:30], str(errors))
            raise ValueError(msg)
        records_out.append(record)
    _print(TF.YELLOW + "Processing %d records" % len(records_out) + TF.END)
    return records_out, input_xml
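# Hedged invocation sketch for get_records() above, mirroring its
# file-argument/stdin fallback. The script name is an assumption.
#
#   python process_records.py records.xml
#   cat records.xml | python process_records.py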
def _record_in_files_p(recid, filenames):
    """Search XML files for given record."""
    # Get id tags of record in question
    rec_oaiid = rec_sysno = -1
    rec_oaiid_tag = get_fieldvalues(recid, OAIID_TAG)
    rec_sysno_tag = get_fieldvalues(recid, SYSNO_TAG)
    if rec_sysno_tag:
        rec_sysno = rec_sysno_tag[0]

    # For each record in each file, compare ids and abort if match is found
    for filename in filenames:
        try:
            if CFG_BIBEDIT_QUEUE_CHECK_METHOD == 'regexp':
                # check via regexp: this is fast, but may not be precise
                file_content = open(filename).read()
                re_match_001 = re.compile(
                    '<controlfield tag="001">%s</controlfield>' % (recid))
                if re_match_001.search(file_content):
                    return True
                for rec_oaiid in rec_oaiid_tag:
                    re_match_oaiid = re.compile(
                        r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                        % (OAIID_TAG[0:3], re.escape(rec_oaiid)))
                    if re_match_oaiid.search(file_content):
                        return True
                re_match_sysno = re.compile(
                    r'<datafield tag="%s" ind1=" " ind2=" ">(\s*<subfield code="a">\s*|\s*<subfield code="9">\s*.*\s*</subfield>\s*<subfield code="a">\s*)%s'
                    % (SYSNO_TAG[0:3], re.escape(str(rec_sysno))))
                if rec_sysno_tag:
                    if re_match_sysno.search(file_content):
                        return True
            else:
                # by default, check via bibrecord: this is accurate, but may be slow
                file_ = open(filename)
                records = create_records(file_.read(), 0, 0)
                for i in range(0, len(records)):
                    record, all_good = records[i][:2]
                    if record and all_good:
                        if _record_has_id_p(record, recid, rec_oaiid, rec_sysno):
                            return True
                file_.close()
        except IOError:
            continue
    return False
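# A self-contained demonstration of the fast regexp path above: the
# controlfield-001 pattern fires only on an exact recid match, which is what
# makes it cheap but less thorough than the bibrecord path. The sample XML is
# an illustrative assumption.
def _demo_001_regexp():
    import re
    sample = '<controlfield tag="001">42</controlfield>'
    hit = re.search('<controlfield tag="001">%s</controlfield>' % 42, sample)
    miss = re.search('<controlfield tag="001">%s</controlfield>' % 4, sample)
    return hit is not None and miss is None  # True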
def create_oaiharvest_log_str(task_id, oai_src_id, xml_content):
    """
    Function which creates the harvesting logs

    @param task_id: bibupload task id
    """
    try:
        records = create_records(xml_content)
        for record in records:
            oai_id = record_extract_oai_id(record[0])
            my_new_harvest_log = OaiHARVESTLOG()
            my_new_harvest_log.id_oaiHARVEST = oai_src_id
            my_new_harvest_log.oai_id = oai_id
            my_new_harvest_log.date_harvested = datetime.datetime.now()
            my_new_harvest_log.bibupload_task_id = task_id
            db.session.add(my_new_harvest_log)
        db.session.commit()
    except Exception as msg:
        print("Logging exception: %s" % (str(msg),))
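# Illustrative call of the variant above; unlike the first variant, this one
# commits the session itself. The ids and content are made-up assumptions.
#
#   create_oaiharvest_log_str(task_id=1234, oai_src_id=1,
#                             xml_content=harvested_marcxml)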
def recxml2recmarc(xmltext, options, sysno_generator=get_sysno_generator()):
    """The function that processes creating the records from
       an XML string, and prints these records to the
       standard output stream.

     @param xmltext: An XML MARC record in string form.
     @param options: Various options about the record to be
      created, as passed from the command line.
     @param sysno_generator: A static parameter to act as an Aleph
      system number generator. Do not provide a value for this - it
      will be assigned upon first call to this function.
    """
    rec_count = 0  ## Counter used to record the number of the rec
                   ## that is being processed. Used in error messages
                   ## for the user, when a record cannot be processed

    ## create internal records structure from xmltext:
    records = create_records(xmltext, 1, 1)

    ## now loop through each record, get its sysno, and convert it:
    for rec_tuple in records:
        rec_count += 1
        ## Get the record-dictionary itself from the record-tuple:
        record = rec_tuple[0]
        if record is None:
            ## if the record is None, there was probably a problem
            ## with the MARC XML. Display a warning message on stderr and
            ## move past this record:
            sys.stderr.write("E: Unable to process record number %s; the XML "
                             "may be broken for this record.\n"
                             % str(rec_count))
            continue

        ## From the record, get the SYS if running in ALEPH-MARC mode, or
        ## the recid (001) if running in TEXT-MARC mode:
        sysno = get_sysno_from_record(record, options)
        if sysno is None:
            ## No 'sysno' was found in the record:
            if options["text-marc"] == 1:
                ## 'sysno' (001) (which is actually the recid) is mandatory
                ## for the creation of TEXT-MARC. Report the error and skip
                ## past the record:
                sys.stderr.write("E: Record number %s has no 'recid' (001). "
                                 "This field is mandatory for the "
                                 "creation of TEXT MARC. The record has been "
                                 "skipped.\n" % str(rec_count))
                continue
            elif options["aleph-marc"] == 1 and \
                 1 in (options["append-mode"], options["delete-mode"],
                       options["correct-mode"], options["replace-mode"]):
                ## When creating ALEPH-MARC that will be used to manipulate
                ## a record in some way (i.e. correct, append, delete, replace),
                ## the ALEPH SYS (970__a in MARC XML) is mandatory. Report the
                ## error and skip past the record:
                sys.stderr.write("E: Record number %s has no ALEPH 'SYS' "
                                 "(970__a). This field is mandatory for the "
                                 "creation of ALEPH MARC that is used for the"
                                 " manipulation of records (i.e. replace, "
                                 "correct, append, delete). The record has "
                                 "been skipped.\n" % str(rec_count))
                continue
        elif options["aleph-marc"] == 1 and type(sysno) in (list, tuple):
            ## multiple values for SYS (970__a) in ALEPH-MARC mode are not
            ## permitted. Report the error and skip past the record:
            sys.stderr.write("E: Multiple values have been found for the "
                             "ALEPH SYS (970__a) in record number %s. This "
                             "is not permitted when running in ALEPH-MARC "
                             "mode. The record has been skipped.\n"
                             % str(rec_count))
            continue

        if options["aleph-marc"] == 1 and options["insert-mode"] == 1:
            ## Creating an ALEPH "insert" record. Since the resulting record
            ## should be treated as a new insert into ALEPH, any 'sysno' that
            ## may have been found in the MARC XML record cannot be used -
            ## that would be dangerous. Therefore, set 'sysno' to None and
            ## create a random sysno:
            sysno = None
            try:
                sysno = sysno_generator.next()
            except StopIteration:
                ## generator counter has overstepped the MAX ALEPH SYS!
                ## Without a SYS, we cannot create ALEPH MARC:
                sys.stderr.write("E: Maximum ALEPH SYS has been "
                                 "reached - unable to continue.\n")
                sys.exit(1)

        ## No problems were encountered with SYS or recid. Display the
        ## translated record:
        rec_out = create_marc_record(record, sysno, options)
        sys.stdout.write(rec_out)
        sys.stdout.flush()
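# A hedged driver for recxml2recmarc() above: text-MARC mode with all Aleph
# manipulation modes switched off, mirroring the options dict used elsewhere
# in this section. The input file name is an illustrative assumption.
def _example_recxml2recmarc():
    options = {"aleph-marc": 0, "correct-mode": 0, "append-mode": 0,
               "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
               "text-marc": 1}
    xmltext = open("records.xml", "r").read()  # assumed input file
    recxml2recmarc(xmltext, options)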
def main():
    cmdusage = """Usage: %s [options] <marcxmlfile>
 General options:
   -h, --help            Print this help.
   -V, --version         Print version information.
   -v, --verbose=LEVEL   Verbose level (from 0 to 9, default 0).
 Description: checks the validity of MARCXML file.
""" % (sys.argv[0])

    verbose = 0
    badrecords = []
    listofrecs = []

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hVv:",
                                   ["help", "version", "verbose="])
    except getopt.GetoptError:
        print(cmdusage)
        sys.exit(2)

    for opt in opts:
        if opt[0] in ("-V", "--version"):
            print(__revision__)
            sys.exit(0)
        elif opt[0] in ("-h", "--help"):
            sys.stderr.write(cmdusage)
            sys.exit(0)
        elif opt[0] in ("-v", "--verbose"):
            try:
                verbose = string.atoi(opt[1])
            except ValueError:
                print("[ERROR] verbose must be an integer.")
                sys.exit(2)

    try:
        xmlfile = args[0]
    except IndexError:
        sys.stderr.write(cmdusage)
        sys.exit(0)

    try:
        xmltext = open(xmlfile, 'r').read()
    except IOError:
        print("[ERROR] File %s not found." % xmlfile)
        sys.exit(1)

    listofrecs = create_records(xmltext, 0, 1)
    badr = filter((lambda x: x[1] == 0), listofrecs)
    badrecords = map((lambda x: x[0]), badr)

    s = ''
    errors = []
    if xmltext and not listofrecs:
        print("[ERROR] No valid record detected.")
        sys.exit(1)
    if verbose:
        if verbose <= 3:
            errors.extend(map((lambda x: x[2]), listofrecs))
        else:
            s = print_recs(badrecords)
            errors.extend(map((lambda x: x[2]), listofrecs))
    else:
        if badrecords:
            print("[ERROR] Bad records detected. For more information, "
                  "increase verbosity.")
            print("\n[INFO] You may also want to run `xmllint %s' to help "
                  "localise errors in the input file." % xmlfile)
            sys.exit(1)

    errors = [error for error in errors if error]
    if s or errors:
        if s:
            print(s)
        for error in errors:
            print("[ERROR]", error)
        print("[INFO] You may also want to run `xmllint %s' to help "
              "localise errors in the input file." % xmlfile)
        sys.exit(1)
def validate_matches(bibmatch_recid, record, server, result_recids,
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    the validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and a
    similar list for fuzzy matches that have fewer fields matching than the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: 2-tuple of lists: (exact match record IDs, fuzzy match record IDs)
    @rtype: tuple
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError("Bad configuration rule-set. "
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS"
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])
    if collections:
        search_params = dict(p=query, of="xm", c=collections)
    else:
        search_params = dict(p=query, of="xm")
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)
    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting."
                                          % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n"
                             % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of matching record IDs
    current_index = 1
    for matched_record in found_record_list:
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" %
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" %
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset,
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" %
                                     (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # Some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" %
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))
        current_index += 1

    # Return lists of exact and fuzzy matching record IDs
    return matches_found, fuzzy_matches_found
def get_number_of_records_found(self):
    """Returns the number of records in the result"""
    records = bibrecord.create_records(self._result)
    records_count = len(records)
    return records_count
def _author_list(obj, eng):
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    from ..utils import find_matching_files

    identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        xml_file_fd = open(xml_file, "r")
        xml_content = xml_file_fd.read()
        xml_file_fd.close()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                    + record_xml_output(authorlist_record) + '</collection>'
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors",
                    [{"name": "authors",
                      "results": new_dict_representation["authors"]}]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{"name": "number_of_authors",
                      "results": new_dict_representation["number_of_authors"]}]
                )
            break
def author_list(obj, eng): """Perform the special authorlist extraction step. :param obj: Bibworkflow Object to process :param eng: BibWorkflowEngine processing the object """ from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex, find_matching_files) from invenio.legacy.bibrecord import create_records, record_xml_output from invenio.legacy.bibconvert.xslt_engine import convert from invenio.utils.plotextractor.cli import get_defaults from invenio.modules.workflows.utils import convert_marcxml_to_bibfield from invenio.utils.plotextractor.getter import harvest_single from invenio.modules.workflows.errors import WorkflowError from invenio.utils.plotextractor.converter import untar from invenio.utils.shell import Timeout identifiers = obj.data["system_control_number"]["value"] if "_result" not in obj.extra_data: obj.extra_data["_result"] = {} if "tarball" not in obj.extra_data["_result"]: extract_path = os.path.join( cfg['CFG_TMPSHAREDDIR'], str(eng.uuid) ) if not os.path.exists(extract_path): os.makedirs(extract_path) tarball, pdf = harvest_single( obj.data["system_control_number"]["value"], extract_path, ["tarball"]) tarball = str(tarball) if tarball is None: raise WorkflowError(str( "Error harvesting tarball from id: %s %s" % ( identifiers, extract_path)), eng.uuid, id_object=obj.id) obj.extra_data["_result"]["tarball"] = tarball sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"], cfg['CFG_TMPDIR'], "") try: untar(obj.extra_data["_result"]["tarball"], sub_dir) obj.log.info("Extracted tarball to: {0}".format(sub_dir)) except Timeout: eng.log.error('Timeout during tarball extraction on %s' % ( obj.extra_data["_result"]["tarball"])) xml_files_list = find_matching_files(sub_dir, ["xml"]) obj.log.info("Found xmlfiles: {0}".format(xml_files_list)) authors = "" for xml_file in xml_files_list: xml_file_fd = open(xml_file, "r") xml_content = xml_file_fd.read() xml_file_fd.close() match = REGEXP_AUTHLIST.findall(xml_content) if match: obj.log.info("Found a match for author extraction") a_stylesheet = obj.extra_data["repository"]["arguments"].get( "a_stylesheet" ) or "authorlist2marcxml.xsl" authors = convert(xml_content, a_stylesheet) authorlist_record = create_records(authors) if len(authorlist_record) == 1: if authorlist_record[0][0] is None: eng.log.error("Error parsing authorlist record for id: %s" % ( identifiers,)) authorlist_record = authorlist_record[0][0] # Convert any LaTeX symbols in authornames translate_fieldvalues_from_latex(authorlist_record, '100', code='a') translate_fieldvalues_from_latex(authorlist_record, '700', code='a') updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \ + record_xml_output(authorlist_record) + '</collection>' if not None == updated_xml: # We store the path to the directory the tarball contents live # Read and grab MARCXML from plotextractor run new_dict_representation = convert_marcxml_to_bibfield(updated_xml) obj.data['authors'] = new_dict_representation["authors"] obj.data['number_of_authors'] = new_dict_representation[ "number_of_authors"] obj.add_task_result("authors", new_dict_representation["authors"]) obj.add_task_result("number_of_authors", new_dict_representation["number_of_authors"]) break
def author_list(obj, eng): """ Performs the special authorlist extraction step (Mostly INSPIRE/CERN related). :param obj: Bibworkflow Object to process :param eng: BibWorkflowEngine processing the object """ from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex, find_matching_files) from invenio.legacy.bibrecord import create_records, record_xml_output from invenio.legacy.bibconvert.xslt_engine import convert from invenio.utils.plotextractor.cli import get_defaults identifiers = obj.data["system_number_external"]["value"] bibtask.task_sleep_now_if_required() if "_result" not in obj.extra_data: obj.extra_data["_result"] = {} if "tarball" not in obj.extra_data["_result"]: extract_path = plotextractor_getter.make_single_directory(cfg['CFG_TMPSHAREDDIR'], eng.uuid) tarball, pdf = plotextractor_getter.harvest_single(obj.data["system_number_external"]["value"], extract_path, ["tarball"]) tarball = str(tarball) if tarball is None: raise workflows_error.WorkflowError(str("Error harvesting tarball from id: %s %s" % (identifiers, extract_path)), eng.uuid, id_object=obj.id) obj.extra_data["_result"]["tarball"] = tarball sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"], cfg['CFG_TMPDIR'], "") try: untar(obj.extra_data["_result"]["tarball"], sub_dir) except Timeout: eng.log.error('Timeout during tarball extraction on %s' % (obj.extra_data["_result"]["tarball"])) xml_files_list = find_matching_files(sub_dir, ["xml"]) authors = "" for xml_file in xml_files_list: xml_file_fd = open(xml_file, "r") xml_content = xml_file_fd.read() xml_file_fd.close() match = REGEXP_AUTHLIST.findall(xml_content) if not match == []: authors += match[0] # Generate file to store conversion results if authors is not '': authors = convert(authors, "authorlist2marcxml.xsl") authorlist_record = create_records(authors) if len(authorlist_record) == 1: if authorlist_record[0][0] is None: eng.log.error("Error parsing authorlist record for id: %s" % (identifiers,)) authorlist_record = authorlist_record[0][0] # Convert any LaTeX symbols in authornames translate_fieldvalues_from_latex(authorlist_record, '100', code='a') translate_fieldvalues_from_latex(authorlist_record, '700', code='a') updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' + record_xml_output(authorlist_record) \ + '</collection>' if not None == updated_xml: # We store the path to the directory the tarball contents live # Read and grab MARCXML from plotextractor run new_dict_representation = records_api.create_record(updated_xml, master_format="marc").dumps() obj.data['authors'] = new_dict_representation["authors"] obj.data['number_of_authors'] = new_dict_representation["number_of_authors"] obj.add_task_result("authors", new_dict_representation["authors"]) obj.add_task_result("number_of_authors", new_dict_representation["number_of_authors"])
def main(): """Execute script.""" import sys cmdusage = """Usage: %s [options] <marcxmlfile> General options: -h, --help Print this help. -v, --verbose=LEVEL Verbose level (from 0 to 9, default 0). Description: checks the validity of MARCXML file. """ % (sys.argv[0]) verbose = 0 badrecords = [] listofrecs = [] try: opts, args = getopt.getopt(sys.argv[1:], "hv:", ["help", "verbose="]) except getopt.GetoptError: print(cmdusage) sys.exit(2) for opt in opts: if opt[0] in ("-h", "--help"): sys.stderr.write(cmdusage) sys.exit(0) elif opt[0] in ("-v", "--verbose"): try: verbose = string.atoi(opt[1]) except ValueError: print("[ERROR] verbose must be an integer.") sys.exit(2) try: xmlfile = args[0] except IndexError: sys.stderr.write(cmdusage) sys.exit(0) try: xmltext = open(xmlfile, 'r').read() except IOError: print("[ERROR] File %s not found." % xmlfile) import sys sys.exit(1) listofrecs = create_records(xmltext, 0, 1) badr = filter((lambda x: x[1] == 0), listofrecs) badrecords = map((lambda x: x[0]), badr) s = '' errors = [] if xmltext and not listofrecs: print("[ERROR] No valid record detected.") sys.exit(1) if verbose: if verbose <= 3: errors.extend(map((lambda x: x[2]), listofrecs)) else: s = print_recs(badrecords) errors.extend(map((lambda x: x[2]), listofrecs)) else: if badrecords: print( "[ERROR] Bad records detected. For more information, increase verbosity." ) print("\n[INFO] You may also want to run `xmllint %s' to help " "localise errors in the input file." % xmlfile) sys.exit(1) errors = [error for error in errors if error] if s or errors: if s: print(s) for error in errors: print("[ERROR]", error) print("[INFO] You may also want to run `xmllint %s' to help " "localise errors in the input file." % xmlfile) sys.exit(1)