def compare_references(test, a, b):
    ## Let's normalize records to remove the Invenio refextract signature
    a = create_record(a)[0]
    b = create_record(b)[0]
    record_delete_field(a, '999', 'C', '6')
    a = record_xml_output(a)
    b = record_xml_output(b)
    test.assertXmlEqual(a, b)

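All of these examples index into the tuple returned by create_record and into
BibRecord field instances, so it helps to keep the data shapes in mind. The
following minimal sketch is illustrative only: the sample XML and printed
values are hypothetical, but the tuple layout is the one the examples rely on.

from invenio.bibrecord import create_record

sample_xml = """<record>
  <controlfield tag="001">123456</controlfield>
  <datafield tag="100" ind1=" " ind2=" ">
    <subfield code="a">Smith, J.</subfield>
  </datafield>
</record>"""

# create_record returns (record, status, errors); status is 1 on success
# and 0 on a parse error, in which case errors lists the problems.
record, status, errors = create_record(sample_xml)
assert status == 1 and not errors

# record maps MARC tags to lists of field instances, each a tuple:
#   (subfields, ind1, ind2, controlfield_value, global_field_position)
print record['001'][0][3]   # '123456' - controlfield value at index 3
print record['100'][0][0]   # [('a', 'Smith, J.')] - subfield list at index 0
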
def _get_record_slave(recid, result, mode=None, uid=None):
    """Check if record exists and return it in dictionary format.
    If any kind of error occurs, return None.
    If mode == 'revision', the recid parameter is considered a revid."""
    record = None
    if recid == 'none':
        mode = 'none'
    if mode == 'recid':
        record_status = record_exists(recid)
        # check for errors
        if record_status == 0:
            result['resultCode'], result['resultText'] = 1, 'Non-existent record: %s' % recid
        elif record_status == -1:
            result['resultCode'], result['resultText'] = 1, 'Deleted record: %s' % recid
        elif record_locked_by_queue(recid):
            result['resultCode'], result['resultText'] = 1, 'Record %s locked by queue' % recid
        else:
            record = create_record(print_record(recid, 'xm'))[0]
            record_order_subfields(record)
    elif mode == 'tmpfile':
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
        if not os.path.isfile(file_path):
            # check if the file exists
            result['resultCode'], result['resultText'] = 1, "Temporary file doesn't exist"
        else:
            # open the file
            tmpfile = open(file_path, 'r')
            record = create_record(tmpfile.read())[0]
            tmpfile.close()
    elif mode == 'revision':
        if revision_format_valid_p(recid):
            marcxml = get_marcxml_of_revision_id(recid)
            if marcxml:
                record = create_record(marcxml)[0]
            else:
                result['resultCode'], result['resultText'] = 1, 'The specified revision does not exist'
        else:
            result['resultCode'], result['resultText'] = 1, 'Invalid revision id'
    elif mode == 'none':
        return {}
    else:
        result['resultCode'], result['resultText'] = 1, 'Invalid record mode for record2'
    return record

def get_templates(templatesDir, tmpl_name, tmpl_description, extractContent=False):
    """Return a list of templates [filename, name, description, content*].
    The extractContent flag indicates whether the parsed content should be
    included."""
    template_fnames = fnmatch.filter(os.listdir(templatesDir), "*.xml")
    templates = []
    for fname in template_fnames:
        filepath = "%s%s%s" % (templatesDir, os.sep, fname)
        template_file = open(filepath, "r")
        template = template_file.read()
        template_file.close()
        fname_stripped = os.path.splitext(fname)[0]
        mo_name = tmpl_name.search(template)
        mo_description = tmpl_description.search(template)
        date_modified = time.ctime(os.path.getmtime(filepath))
        if mo_name:
            name = mo_name.group(1)
        else:
            name = fname_stripped
        if mo_description:
            description = mo_description.group(1)
        else:
            description = ""
        if extractContent:
            parsedTemplate = create_record(template)[0]
            if parsedTemplate is not None:
                # If the template was correct
                templates.append([fname_stripped, name, description,
                                  parsedTemplate])
            else:
                raise ValueError("Problem when parsing the template %s" % (fname,))
        else:
            templates.append([fname_stripped, name, description,
                              date_modified])
    return templates

def doilookup(self, req, form):
    """
    Returns the metadata from the crossref website based on the DOI.
    """
    args = wash_urlargd(form, {'doi': (str, '')})
    response = defaultdict(list)
    if args['doi']:
        doi = args['doi']
        try:
            marcxml_template = get_marcxml_for_doi(doi)
        except CrossrefError:
            # Just ignore Crossref errors
            pass
        else:
            record = create_record(marcxml_template)[0]
            if record:
                # We need to convert this record structure to a simple
                # dictionary
                for key, value in record.items():
                    # key, value = (773, [([('0', 'PER:64142'), ...], ' ', ' ', '', 47)])
                    for val in value:
                        # val = ([('0', 'PER:64142'), ...], ' ', ' ', '', 47)
                        ind1 = val[1].replace(" ", "_")
                        ind2 = val[2].replace(" ", "_")
                        for (k, v) in val[0]:
                            # k, v = ('0', 'PER:5409')
                            response[key + ind1 + ind2 + k].append(v)
    # The output dictionary is something like:
    # {"100__a": ['Smith, J.'],
    #  "700__a": ['Anderson, J.', 'Someoneelse, E.'],
    #  "700__u": ['University1', 'University2']}
    # return the dictionary as JSON
    return json.dumps(response)

def get_templates(templatesDir, tmpl_name, tmpl_description, extractContent=False):
    """Return a list of templates [filename, name, description, content*].
    The extractContent flag indicates whether the parsed content should be
    included."""
    template_fnames = fnmatch.filter(os.listdir(templatesDir), '*.xml')
    templates = []
    for fname in template_fnames:
        template_file = open('%s%s%s' % (templatesDir, os.sep, fname), 'r')
        template = template_file.read()
        template_file.close()
        fname_stripped = os.path.splitext(fname)[0]
        mo_name = tmpl_name.search(template)
        mo_description = tmpl_description.search(template)
        if mo_name:
            name = mo_name.group(1)
        else:
            name = fname_stripped
        if mo_description:
            description = mo_description.group(1)
        else:
            description = ''
        if extractContent:
            parsedTemplate = create_record(template)[0]
            if parsedTemplate is not None:
                # If the template was correct
                templates.append([fname_stripped, name, description,
                                  parsedTemplate])
            else:
                raise ValueError("Problem when parsing the template %s" % (fname,))
        else:
            templates.append([fname_stripped, name, description])
    return templates

def merge_record_with_template(rec, template_name, is_hp_record=False):
    """Extend the record rec with the contents of the template and return it."""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record, make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(field_instance, code,
                                               field_get_subfield_values(template_field_instance,
                                                                         code)[0])
    return rec

def perform_request_holdingpen(request_type, recId, changeId=None):
    """
    A method performing the holdingPen ajax request. The following types of
    requests can be made:
       getHoldingPenUpdates - retrieving the holding pen updates pending
                              for a given record
    """
    response = {}
    if request_type == 'getHoldingPenUpdates':
        changeSet = get_related_hp_changesets(recId)
        changes = []
        for change in changeSet:
            changes.append((str(change[0]), str(change[1])))
        response["changes"] = changes
    elif request_type == 'getHoldingPenUpdateDetails':
        # returning the list of changes related to the holding pen update
        # the format is based on what the record difference xtool returns
        assert changeId is not None
        hpContent = get_hp_update_xml(changeId)
        holdingPenRecord = create_record(hpContent[0], "xm")[0]
        databaseRecord = get_record(hpContent[1])
        response['record'] = holdingPenRecord
        response['changeset_number'] = changeId
    elif request_type == 'deleteHoldingPenChangeset':
        assert changeId is not None
        delete_hp_change(changeId)
    return response

def replace_references(recid):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document with
    updated references is returned.

    Parameters:
    * recid: the id of the record
    """
    # Parse references
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode("utf-8"))
    # Record marc xml
    record = get_record(recid)
    if references[0]:
        fields_to_add = record_get_field_instances(references[0],
                                                   tag="999",
                                                   ind1="%",
                                                   ind2="%")
        # Replace 999 fields
        record_delete_fields(record, "999")
        record_add_fields(record, "999", fields_to_add)
        # Update record references
        out_xml = record_xml_output(record)
    else:
        out_xml = None
    return out_xml

def merge_record_with_template(rec, template_name):
    """Extend the record rec with the contents of the template and return it."""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    return rec

def fetch_remote_record(remote_url):
    """
    Gets MARCXML from a server instance of Invenio and returns a single
    BibRecord structure.

    Raises ValueError if the returned data is not MARCXML, and URLError if
    the page is still unreachable after DOWNLOAD_ATTEMPTS tries.
    """
    url = "%s/export/xm" % (remote_url)
    for cnt in xrange(DOWNLOAD_ATTEMPTS):
        try:
            handle = urlopen(url)
            xml = handle.read()
            handle.close()
            record_creation = create_record(xml)
            if record_creation[1] == 0:
                print "Error: Could not parse record %s" % (url,)
                raise ValueError(str(record_creation[2]))
            return record_creation[0]
        except URLError as exc:
            if cnt < DOWNLOAD_ATTEMPTS - 1:
                print "Timeout #%d: waiting %d seconds..." % (cnt, TIMEOUT_WAIT)
                sleep(TIMEOUT_WAIT)
            else:
                print ("ERROR: Could not download %s (tried %d times)"
                       % (url, DOWNLOAD_ATTEMPTS))
                raise exc

def cli_clean_revisions(recid, dry_run=True, verbose=True):
    """Clean revisions of the given recid, by removing duplicate revisions
    that do not change the content of the record."""
    if recid == '*':
        recids = intbitset(run_sql("SELECT DISTINCT id_bibrec FROM hstRECORD"))
    else:
        try:
            recids = [int(recid)]
        except ValueError:
            print 'ERROR: record ID must be integer, not %s.' % recid
            sys.exit(1)
    for recid in recids:
        all_revisions = run_sql("SELECT marcxml, job_id, job_name, job_person, job_date"
                                " FROM hstRECORD WHERE id_bibrec=%s ORDER BY job_date ASC",
                                (recid,))
        previous_rec = {}
        deleted_revisions = 0
        for marcxml, job_id, job_name, job_person, job_date in all_revisions:
            try:
                current_rec = create_record(zlib.decompress(marcxml))[0]
            except Exception:
                print >> sys.stderr, "ERROR: corrupted revisions found. Please run %s --fix-revisions '*'" % sys.argv[0]
                sys.exit(1)
            if records_identical(current_rec, previous_rec):
                deleted_revisions += 1
                if not dry_run:
                    run_sql("DELETE FROM hstRECORD WHERE id_bibrec=%s AND job_id=%s"
                            " AND job_name=%s AND job_person=%s AND job_date=%s",
                            (recid, job_id, job_name, job_person, job_date))
            previous_rec = current_rec
        if verbose and deleted_revisions:
            print "record %s: deleted %s duplicate revisions out of %s" % (
                recid, deleted_revisions, len(all_revisions))
    if verbose:
        print "DONE"

def test_copy_245_fields_add_caption(self):
    """Test adding a completely new caption."""
    rec_string = """<record>
  <controlfield tag="001">123456</controlfield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="a">Some caption</subfield>
  </datafield>
  <datafield tag="245" ind1=" " ind2=" ">
    <subfield code="z">Some ridiculous caption</subfield>
  </datafield>
  <!--some other fields-->
  <datafield tag="520" ind1="" ind2=" ">
    <subfield code="9">HEPDATA</subfield>
  </datafield>
  <datafield tag="245" ind1="z" ind2=" ">
    <subfield code="z">Some ridiculous caption</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="z">Some other entry not following even the semantics 2</subfield>
    <subfield code="3">ANOTHER</subfield>
  </datafield>
</record>"""
    rec = bibrecord.create_record(rec_string)[0]
    paper = hepdatautils.Paper.create_from_record(rec)
    self.assertEqual(None, paper.get_diff_marcxml(rec),
                     "There should be no need for a patch on the same record")
    paper.comment = "azerty"
    diff_xml = paper.get_diff_marcxml(rec)
    self.assertTrue(diff_xml.find(">Some caption") == -1,
                    "An existing caption should not appear in the diff")
    self.assertTrue(diff_xml.find(">Some ridiculous caption") == -1,
                    "An existing caption should not appear in the diff")
    self.assertTrue(diff_xml.find(">azerty") != -1,
                    "New caption not found")

def get_remote_record(recid):
    """
    For a given remote record ID, we download the record XML and return
    the record in a BibRecord structure.

    Parameter: (int) recid - record ID for remote record
    Returns: BibRecord
    """
    url = "%s/record/%d/export/xm?ot=001,035" % (REMOTE_URL, recid)
    tmp_file = ''
    try:
        bibrec = None
        tmp_file = download_url(url, retry_count=10, timeout=61.0)
        with open(tmp_file, 'r') as temp:
            content = temp.read()
            bibrec, code, errors = create_record(content)
            if code != 1 or errors:
                _print("Warning: There were errors creating BibRec structure "
                       + "from remote record #%d" % recid, 4)
        os.remove(tmp_file)
        return bibrec
    except (StandardError, InvenioFileDownloadError, HTTPError) as err:
        _print("Error: Could not download remote record #%d" % recid, 4)
        _print(str(err), 4)
        _print(traceback.format_exc(), 4)

def merge_record_with_template(rec, template_name, is_hp_record=False):
    """Extend the record rec with the contents of the template and return it."""
    template = get_record_template(template_name)
    if not template:
        return
    template_bibrec = create_record(template)[0]
    # if the record is a holding pen record, make all subfields volatile
    if is_hp_record:
        record_make_all_subfields_volatile(template_bibrec)
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(
                    template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(
                                    template_field_instance, code)[0])
    record_order_subfields(rec)
    return rec

def record_collect_oai_identifiers(record_xml):
    """
    Collects all OAI identifiers from given MARCXML.

    Returns a list of found values in the tag
    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG.

    @param record_xml: string containing MARCXML to parse
    @return: list of identifiers
    """
    result = None
    (record, status_code, list_of_errors) = create_record(record_xml)
    if not status_code:
        # Error happened
        write_message("Error collecting OAI identifier from record: %s"
                      % ("\n".join(list_of_errors),))
    else:
        # All OK! We can get the IDs
        result = record_get_field_values(record,
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4],
                                         CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5])
        if not result:
            # No IDs found
            write_message("No OAI IDs found in record")
    return result

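The slicing above relies on the Invenio convention of packing a full MARC
field address into a six-character string. A short sketch of how the pieces
map; the value shown here is an assumption (the usual default, '035__a') and
may differ per installation:

# hypothetical default; the real value comes from the installation's config
CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG = '035__a'
tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[:3]   # '035' - MARC tag
ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]   # '_'   - first indicator (blank)
ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]   # '_'   - second indicator (blank)
code = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]   # 'a'   - subfield code
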
def _prepare_blob(self, *args, **kwargs):
    # FIXME: stop using recstruct!
    from invenio.bibrecord import create_record

    class SaveDict(dict):
        __getitem__ = dict.get

    def dict_extend_helper(d, key, value):
        """If the key is present inside the dictionary, create a list
        (if not present) and extend it with the new value. Almost as in
        C{list.extend}."""
        if key in d:
            current_value = d.get(key)
            if not isinstance(current_value, list):
                current_value = [current_value]
            current_value.append(value)
            value = current_value
        d[key] = value

    self.rec_tree = SaveDict()
    tmp = create_record(self.blob)[0]
    for key, values in tmp.iteritems():
        if key < '010' and key.isdigit():
            self.rec_tree[key] = [value[3] for value in values]
        else:
            for value in values:
                field = SaveDict()
                for subfield in value[0]:
                    dict_extend_helper(field, subfield[0], subfield[1])
                dict_extend_helper(self.rec_tree,
                                   (key + value[1] + value[2]).replace(' ', '_'),
                                   field)

def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    spec_name=''):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields, write
    it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string instead of reading the cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_file_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            # record_strip_empty_fields(record) # now performed for every record after removing unfilled volatile fields
            xml_record = record_xml_output(record)
            delete_cache_file(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]
    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)
    # order subfields alphabetically before saving the record
    # TP: we don't want this: record_order_subfields(record)
    xml_to_write = wash_for_xml(record_xml_output(record))
    # Write XML file.
    if not to_merge:
        file_path = '%s.xml' % _get_file_path(recid, uid)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
    xml_file = open(file_path, 'w')
    xml_file.write(xml_to_write)
    xml_file.close()
    user_name = get_user_info(uid)[1]
    if to_upload:
        # TP: check whether to add spec name
        if spec_name == '':
            # Pass XML file to BibUpload.
            task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                      file_path, '-u', user_name)
        else:
            task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                      file_path, '-u', user_name,
                                      '-N', spec_name)
    return True

def get_bibrec_for_record(marcxml, opt_mode):
    '''
    A record is uploaded to the system using mainly the functionality of the
    bibupload module. A bibrec is then returned for the record.
    '''
    recs = create_record(marcxml, parser='lxml')
    _, recid, _ = bibupload(recs[0], opt_mode=opt_mode)
    return recid

def get_bibrecord(recid):
    """Return record in BibRecord wrapping."""
    if record_exists(recid):
        record_revision_ids = get_record_revision_ids(recid)
        if record_revision_ids:
            return create_record(
                get_marcxml_of_revision_id(max(record_revision_ids)))[0]
        else:
            return get_record(recid)

def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields, write
    it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string instead of reading the cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]
    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)
    # order subfields alphabetically before saving the record
    record_order_subfields(record)
    xml_to_write = wash_for_xml(record_xml_output(record))
    # Write XML file.
    if not to_merge:
        fd, file_path = tempfile.mkstemp(dir=CFG_BIBEDIT_CACHEDIR,
                                         prefix="%s_" % CFG_BIBEDIT_FILENAME,
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()
    user_name = get_user_info(uid)[1]
    if to_upload:
        args = ['bibupload', user_name, '-P', '5', '-r', file_path,
                '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True

def get_legacy_recstruct(self):
    """
    Creates the recstruct representation using the legacy rules defined in
    the configuration file.

    #CHECK: it might be a bit of overkill
    """
    from invenio.bibrecord import create_record
    return create_record(self.legacy_export_as_marc())[0]

def rollback_record(recid):
    print 'id', recid
    for rev in get_record_revision_ids(recid):
        old_record = create_record(get_marcxml_of_revision_id(rev))
        fields_to_add = record_get_field_instances(old_record[0], tag='520')
        if fields_to_add:
            print 'reverting to', rev
            return create_our_record(recid, fields_to_add)
    print 'FAILED', recid

def _get_record_slave(recid, result, mode=None, uid=None):
    """Check if record exists and return it in dictionary format.
    If any kind of error occurs, return None.
    If mode == 'revision', the recid parameter is considered a revid."""
    record = None
    if recid == "none":
        mode = "none"
    if mode == "recid":
        record_status = record_exists(recid)
        # check for errors
        if record_status == 0:
            result["resultCode"], result["resultText"] = 1, "Non-existent record: %s" % recid
        elif record_status == -1:
            result["resultCode"], result["resultText"] = 1, "Deleted record: %s" % recid
        elif record_locked_by_queue(recid):
            result["resultCode"], result["resultText"] = 1, "Record %s locked by queue" % recid
        else:
            record = create_record(print_record(recid, "xm"))[0]
    elif mode == "tmpfile":
        file_path = "%s_%s.xml" % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
        if not os.path.isfile(file_path):
            # check if the file exists
            result["resultCode"], result["resultText"] = 1, "Temporary file doesn't exist"
        else:
            # open the file
            tmpfile = open(file_path, "r")
            record = create_record(tmpfile.read())[0]
            tmpfile.close()
    elif mode == "revision":
        if revision_format_valid_p(recid):
            marcxml = get_marcxml_of_revision_id(recid)
            if marcxml:
                record = create_record(marcxml)[0]
            else:
                result["resultCode"], result["resultText"] = 1, "The specified revision does not exist"
        else:
            result["resultCode"], result["resultText"] = 1, "Invalid revision id"
    elif mode == "none":
        return {}
    else:
        result["resultCode"], result["resultText"] = 1, "Invalid record mode for record2"
    return record

def get_rn(revision):
    rns = set()
    record = create_record(get_marcxml_of_revision_id(revision))[0]
    fields = record_get_field_instances(record, tag='999', ind1='C', ind2='5')
    for f in fields:
        subfields = field_get_subfield_instances(f)
        for index, s in enumerate(subfields):
            if s[0] == 'r':
                rns.add(tag_arxiv_more(s[1]))
    return rns

def _get_record_linking_fields(recid_b, recid_a, tag, ind1, ind2):
    """
    Returns the fields (defined by tag, ind1, ind2) in record (given by
    recid_b) that do not link to another given record (recid_a).
    """
    fields = []
    rec = create_record(format_record(recid_b, "xm"))[0]
    for field_instance in record_get_field_instances(rec, tag=tag,
                                                     ind1=ind1, ind2=ind2):
        if not ('w', str(recid_a)) in field_instance[0]:
            fields.append(field_instance)
    return fields

def record_collect_recid(record_xml):
    """Return the recid found in the given MARCXML."""
    result = None
    (record, status_code, list_of_errors) = create_record(record_xml)
    if not status_code:
        # Error happened
        write_message("Error collecting recid from record: %s"
                      % ("\n".join(list_of_errors),))
        return
    if "001" in record:
        return record['001'][0][3]

def test_update_the_same_record(self):
    """Test parsing a Paper from a record and diffing with the same
    hepdata entry."""
    rec_string = """<record>
  <controlfield tag="001">123456</controlfield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="z">Some other entry not following even the semantics 2</subfield>
    <subfield code="3">ANOTHER</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="u">http://google.com</subfield>
    <subfield code="y">1 This is the link text</subfield>
    <subfield code="3">ADDITIONAL HEPDATA</subfield>
  </datafield>
  <datafield tag="856" ind1="4" ind2=" ">
    <subfield code="u">http://invenio-software.org</subfield>
    <subfield code="y">2 This is some other completely unrelated field</subfield>
    <subfield code="3">ADDITIONAL HEPDATA</subfield>
  </datafield>
  <datafield tag="856" ind1=" " ind2=" ">
    <subfield code="u">http://invenio-software.net</subfield>
    <subfield code="y">This should not be copied</subfield>
    <subfield code="3">Different type</subfield>
  </datafield>
  <datafield tag="520" ind1=" " ind2=" ">
    <subfield code="9">HEPDATA</subfield>
  </datafield>
</record>
"""
    rec = bibrecord.create_record(rec_string)[0]
    paper = hepdatautils.Paper.create_from_record(rec)
    diff_xml = paper.get_diff_marcxml(rec)
    self.assertTrue(diff_xml is None,
                    "Expecting empty XML in the case of the same dataset. "
                    "Produced XML: %s" % (diff_xml,))
    self.assertEqual(len(paper.additional_data_links), 2,
                     "Incorrect number of recognised additional data links")
    if paper.additional_data_links[0]["description"][0] > \
            paper.additional_data_links[1]["description"][0]:
        l1 = paper.additional_data_links[1]
        l2 = paper.additional_data_links[0]
    else:
        l1 = paper.additional_data_links[0]
        l2 = paper.additional_data_links[1]
    self.assertEqual(l1["description"], "1 This is the link text",
                     "Incorrect first parsed link")
    self.assertEqual(l1["href"], "http://google.com",
                     "Incorrect first parsed link")
    self.assertEqual(l2["description"],
                     "2 This is some other completely unrelated field",
                     "Incorrect second parsed link")
    self.assertEqual(l2["href"], "http://invenio-software.org",
                     "Incorrect second parsed link")

def move_drafts_articles_to_ready(journal_name, issue):
    """
    Move draft articles to their final "collection".

    To do so we rely on the convention that an admin-chosen keyword
    must be removed from the metadata.
    """
    protected_datafields = ['100', '245', '246', '520', '590', '700']
    keyword_to_remove = get_journal_draft_keyword_to_remove(journal_name)
    collections_to_refresh = {}
    categories = get_journal_categories(journal_name, issue)
    for category in categories:
        articles = get_journal_articles(journal_name, issue, category)
        for order, recids in articles.iteritems():
            for recid in recids:
                record_xml = format_record(recid, of='xm')
                if not record_xml:
                    continue
                new_record_xml_path = os.path.join(CFG_TMPDIR,
                                                   'webjournal_publish_' +
                                                   str(recid) + '.xml')
                if os.path.exists(new_record_xml_path):
                    # Do not modify twice
                    continue
                record_struc = create_record(record_xml)
                record = record_struc[0]
                new_record = update_draft_record_metadata(record,
                                                          protected_datafields,
                                                          keyword_to_remove)
                new_record_xml = print_rec(new_record)
                if new_record_xml.find(keyword_to_remove) >= 0:
                    new_record_xml = new_record_xml.replace(keyword_to_remove, '')
                    # Write to file
                    new_record_xml_file = file(new_record_xml_path, 'w')
                    new_record_xml_file.write(new_record_xml)
                    new_record_xml_file.close()
                    # Submit
                    task_low_level_submission('bibupload', 'WebJournal', '-c',
                                              new_record_xml_path)
                    task_low_level_submission('bibindex', 'WebJournal', '-i',
                                              str(recid))
                    for collection in get_all_collections_of_a_record(recid):
                        collections_to_refresh[collection] = ''
    # Refresh collections
    collections_to_refresh.update(
        [(c, '') for c in
         get_journal_collection_to_refresh_on_release(journal_name)])
    for collection in collections_to_refresh.keys():
        task_low_level_submission('webcoll', 'WebJournal', '-f', '-p', '2',
                                  '-c', collection)

def _prepare_blob(self):
    """
    Transforms the blob into the rec_tree structure to use it in the
    standard translation phase inside C{JsonReader}.
    """
    self.rec_tree = CoolDict()
    try:
        if self.blob_wrapper.schema.lower().startswith('file:'):
            self.blob_wrapper.blob = open(self.blob_wrapper.blob_file_name, 'r').read()
        if self.blob_wrapper.schema.lower() in ['recstruct']:
            self.__create_rectree_from_recstruct()
        elif self.blob_wrapper.schema.lower() in ['xml', 'file:xml']:
            # TODO: Implement translation directly from XML
            from invenio.bibrecord import create_record
            self.blob_wrapper.blob = create_record(self.blob_wrapper.blob)[0]
            self.__create_rectree_from_recstruct()
    except AttributeError:
        # Assume MARCXML
        from invenio.bibrecord import create_record
        self.blob_wrapper.blob = create_record(self.blob_wrapper.blob)[0]
        self.__create_rectree_from_recstruct()

def crossref_process_template(template, change=False):
    """
    Creates a record from the given XML template.

    @param change: if set to True, makes changes to the record (translating
        the title, unifying author names etc.); if not, returns the record
        without any changes
    @return: record
    """
    record = create_record(template)[0]
    if change:
        crossref_translate_title(record)
        crossref_normalize_name(record)
    return record

def modify_record_timestamp(revision_xml, last_revision_ts):
    """
    Modify tag 005 to add the revision passed as parameter.

    @param revision_xml: MARCXML representation of the record to modify
    @type revision_xml: string
    @param last_revision_ts: timestamp to add to the 005 tag
    @type last_revision_ts: string
    @return: MARCXML with the 005 tag modified
    """
    recstruct = create_record(revision_xml)[0]
    record_modify_controlfield(recstruct, "005", last_revision_ts,
                               field_position_local=0)
    return record_xml_output(recstruct)

def rollback_record(recid, weight):
    print 'id', recid, 'weight', weight
    for rev in get_record_revision_ids(recid):
        if weight == 0:
            break
        if 'refextract' in get_info_of_revision_id(rev):
            weight -= 1
            print 'rev', rev
            old_record = create_record(get_marcxml_of_revision_id(rev))
            fields_to_add = record_get_field_instances(old_record[0],
                                                       tag='999',
                                                       ind1='%', ind2='%')
            submit_xml(create_our_record(recid, fields_to_add))

def _next_value(self, recid=None, xml_record=None, start_date=None):
    """
    Returns the next cnum for the given recid.

    @param recid: id of the record where the cnum will be generated
    @type recid: int
    @param xml_record: record in XML format
    @type xml_record: string
    @param start_date: use the given start date
    @type start_date: string
    @return: next cnum for the given recid. Format is Cyy-mm-dd[.n].
    @rtype: string
    @raises ConferenceNoStartDateError: no date information found in the
        given recid
    """
    bibrecord = None
    if recid is None and xml_record is not None:
        bibrecord = create_record(xml_record)[0]
    elif recid is not None:
        bibrecord = get_bibrecord(recid)
    if start_date is None and bibrecord is not None:
        start_date = record_get_field_value(bibrecord, tag="111",
                                            ind1="", ind2="", code="x")
    if not start_date:
        raise ConferenceNoStartDateError
    base_cnum = "C" + start_date[2:]
    record_cnums = self._get_record_cnums(base_cnum)
    if not record_cnums:
        new_cnum = base_cnum
    elif len(record_cnums) == 1:
        new_cnum = base_cnum + '.' + '1'
    else:
        # Get the max current revision; cnums are in the format Cyy-mm-dd,
        # Cyy-mm-dd.1, Cyy-mm-dd.2
        highest_revision = max([int(rev[0].split('.')[1])
                                for rev in record_cnums[1:]])
        new_cnum = base_cnum + '.' + str(highest_revision + 1)
    return new_cnum

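Illustrative only, assuming a record whose 111__x start date is 2012-06-18
and that _get_record_cnums returns the cnums already assigned for that date:
the branches above then produce the following progression.

# existing cnums for the date      -> next cnum returned
# []                               -> 'C12-06-18'
# ['C12-06-18']                    -> 'C12-06-18.1'
# ['C12-06-18', 'C12-06-18.1']     -> 'C12-06-18.2'  (max suffix + 1)
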
def create_objects(path_to_file):
    from invenio.bibworkflow_model import BibWorkflowObject
    list_of_bwo = []
    f = open(path_to_file, "r")
    records = f.read()
    f.close()
    record_xmls = REGEXP_RECORD.findall(records)
    for record_xml in record_xmls:
        rec = "<record>"
        rec += record_xml
        rec += "</record>"
        rec = create_record(rec)[0]
        # check for errors, if the record is empty
        bwo = BibWorkflowObject(rec, "bibrecord")
        list_of_bwo.append(bwo)
    return list_of_bwo

def replace_references(recid, uid=None, txt=None, url=None):
    """Replace references for a record.

    The record itself is not updated; the MARC XML of the document with
    updated references is returned.

    Parameters:
    * recid: the id of the record
    * txt: references in text mode
    * url: the URL to extract references from
    """
    # Parse references
    if txt is not None:
        references_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        references_xml = extract_references_from_url_xml(url)
    else:
        references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode('utf-8'))
    dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = \
        get_cache_file_contents(recid, uid)
    out_xml = None
    references_to_add = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C', ind2='5')
    refextract_status = record_get_field_instances(references[0],
                                                   tag='999',
                                                   ind1='C', ind2='6')
    if references_to_add:
        # Replace 999 fields
        record_delete_fields(record, '999')
        record_add_fields(record, '999', references_to_add)
        record_add_fields(record, '999', refextract_status)
        # Update record references
        out_xml = record_xml_output(record)
    return out_xml

def match_record(obj, eng):
    """
    Will try to find matches in stored records.
    """
    from invenio.bibrecord import create_record
    from invenio.bibmatch_engine import match_records

    obj.db_obj.last_task_name = 'match_record'
    rec = create_record(obj.data['data'])
    matches = match_records(records=[rec], qrystrs=[("title", "[245__a]")])
    obj.db_obj.extra_data['tasks_results']['match_record'] = matches
    if matches[2] or matches[3]:
        # we have ambiguous or fuzzy results
        # render the corresponding holding pen template
        eng.halt("Match resolution needed")
    elif matches[0]:
        eng.log.info("Matching: new record")
    else:
        results = matches[1][0][1]
        eng.log.info("Matching: existing record %s" % (results,))

def element_tree_collection_to_records(tree, header_subs=None):
    """
    Takes an ElementTree and converts the nodes into BibRecord records so
    they can be worked with. This function expects a tree root of
    collection, such as:

    <collection>
        <record>
            <!-- MARCXML -->
        </record>
        <record> ... </record>
    </collection>
    """
    records = []
    collection = tree.getroot()
    for record_element in collection.getchildren():
        marcxml = ET.tostring(record_element, encoding="utf-8")
        record, status, errors = create_record(marcxml)
        if errors:
            _print(str(status))
        records.append(record)
    return records, []

def merge_record_with_template(rec, template_name):
    """Extend the record rec with the contents of the template and return it."""
    template = get_record_template(template_name)
    template_bibrec = create_record(template)[0]
    for field_tag in template_bibrec:
        if not record_has_field(rec, field_tag):
            for field_instance in template_bibrec[field_tag]:
                record_add_field(rec, field_tag, field_instance[1],
                                 field_instance[2], subfields=field_instance[0])
        else:
            for template_field_instance in template_bibrec[field_tag]:
                subfield_codes_template = field_get_subfield_codes(template_field_instance)
                for field_instance in rec[field_tag]:
                    subfield_codes = field_get_subfield_codes(field_instance)
                    for code in subfield_codes_template:
                        if code not in subfield_codes:
                            field_add_subfield(
                                field_instance, code,
                                field_get_subfield_values(template_field_instance,
                                                          code)[0])
    return rec

def bst_labssync():
    """
    Synchronizes from Labs via redis.
    """
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    user_agent = make_user_agent_string('labssync')
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = 'application/marcxml+xml'
    tot = r.scard(CFG_REDIS_KEY)
    if tot == 0:
        write_message("Nothing to do")
        return
    else:
        write_message("At least %s records to synchronize from labs" % tot)
    errors = []
    final_total = 0
    uploader = ChunkedBibUpload(mode='r', user='******')
    while True:
        elem = r.spop(CFG_REDIS_KEY)
        if not elem:
            break
        final_total += 1
        try:
            record = s.get("https://%s/api/%s" % (CFG_LABS_HOSTNAME, elem)).text
            # Let's strip the collection/XML header
            record = record_xml_output(create_record(record)[0])
            uploader.add(record)
            task_sleep_now_if_required()
        except Exception as err:
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (elem, err),
                          stream=sys.stderr)
            errors.append(elem)
    write_message("Finally synced %s records from labs" % final_total)
    if errors:
        write_message("All those %s records had errors and might need to be "
                      "resynced: %s" % (len(errors), ', '.join(errors)))

def _prepare_blob(self, *args, **kwargs):
    # FIXME: stop using recstruct!
    from invenio.bibrecord import create_record

    class SaveDict(dict):
        __getitem__ = dict.get

    def dict_extend_helper(d, key, value):
        """If the key is present inside the dictionary, create a list
        (if not present) and extend it with the new value. Almost as in
        C{list.extend}."""
        if key in d:
            current_value = d.get(key)
            if not isinstance(current_value, list):
                current_value = [current_value]
            current_value.append(value)
            value = current_value
        d[key] = value

    self.rec_tree = SaveDict()
    record, status_code, errors = create_record(self.blob)
    if status_code == 0:
        # There was an error
        if isinstance(errors, list):
            errors = "\n".join(errors)
        raise ReaderException(
            "There was an error while parsing MARCXML: %s" % (errors,))
    for key, values in record.iteritems():
        if key < '010' and key.isdigit():
            self.rec_tree[key] = [value[3] for value in values]
        else:
            for value in values:
                field = SaveDict()
                for subfield in value[0]:
                    dict_extend_helper(field, subfield[0], subfield[1])
                dict_extend_helper(self.rec_tree,
                                   (key + value[1] + value[2]).replace(' ', '_'),
                                   field)

def output_record(xml_file):
    """
    Returns a record representation from an XML file.

    @param xml_file: the file in XML format
    @return: the record
    """
    list_of_words = []
    f = open(xml_file)
    try:
        for line in f:
            words = line.split()
            for word in words:
                list_of_words.append(word)
    finally:
        f.close()
    xml_to_string = ' '.join(list_of_words)
    # create_record takes a string representation of an XML record and
    # returns a (record dictionary, status code, list of errors) tuple
    (record, status_code, list_of_errors) = create_record(xml_to_string)
    return record

def salvage_deleted_record_from_history(recid):
    return create_record(
        decompress(
            run_sql("SELECT marcxml FROM hstRECORD WHERE id_bibrec=%s"
                    " ORDER BY job_date DESC LIMIT 1", (recid,))[0][0]))[0]