def main():
    """Rewrite 8564_u file links from the old OpenAIRE base URL to the
    configured site URL.

    Prints a MARCXML <collection> of correction records (001 plus the new
    856 4_ $u fields) to stdout; only records whose links changed are output.
    """
    from_base = 'http://openaire.cern.ch'
    to_base = config.CFG_SITE_URL
    # All records that carry at least one file link (8564_u).
    recids = search_pattern(p="0->Z", f="8564_u")
    print "<collection>"
    for recid in recids:
        # Get record information.
        touched = False
        file_links = get_fieldvalues(recid, "8564_u")
        # NOTE(review): replace_link_func is not defined in this chunk --
        # presumably a factory returning a link-rewriting callable taking
        # one URL; confirm against the rest of the module.
        new_file_links = map(replace_link_func(from_base, to_base), file_links)
        # Build the correcting record: 001 + one 856 per changed link.
        rec = {}
        record_add_field(rec, "001", controlfield_value=str(recid))
        for old_link, new_link in zip(file_links, new_file_links):
            if old_link != new_link:
                touched = True
                record_add_field(rec, '856', ind1='4',
                                 subfields=[('u', new_link)])
        if touched:
            print record_xml_output(rec)
    print "</collection>"
def main(): # from_base = 'http://openaire.cern.ch/' to_base = 'http://localhost:4000/' # All records recids = search_pattern(p="0->Z", f="8564_u") print "<collection>" for recid in recids: # Get record information touched = False file_links = get_fieldvalues(recid, "8564_u") def replace_link(x): if x.startswith(from_base): return x.replace(from_base, to_base) else: return x new_file_links = map(replace_link, file_links) # Print correcting to record rec = {} record_add_field(rec, "001", controlfield_value=str(recid)) for old_link,new_link in zip(file_links, new_file_links): if old_link != new_link: touched = True record_add_field(rec, '856', ind1='4', subfields=[('u', new_link)]) if touched: print record_xml_output(rec) print "</collection>"
def bibupload(record=None, collection=None, file_prefix="", mode="-c"):
    """
    General purpose function that will write a MARCXML file and call
    bibupload on it.

    Either a single record or an iterable of records is accepted; when a
    collection exceeds MAX_RECORDS, it is flushed and submitted in chunks.
    """
    if collection is None and record is None:
        return
    (file_out, filename) = open_temp_file(file_prefix)
    if collection is not None:
        file_out.write("<collection>")
        tot = 0
        for rec in collection:
            file_out.write(record_xml_output(rec))
            tot += 1
            if tot == MAX_RECORDS:
                # Chunk limit reached: close and submit this file, then
                # start a fresh temporary file for the remaining records.
                file_out.write("</collection>")
                file_out.close()
                logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
                task_low_level_submission('bibupload', 'openaire', mode, filename, '-n')
                (file_out, filename) = open_temp_file(file_prefix)
                file_out.write("<collection>")
                tot = 0
        file_out.write("</collection>")
    elif record is not None:
        tot = 1
        file_out.write(record_xml_output(record))
    file_out.close()
    # Submit whatever remains in the last (or only) file; tot == 0 means the
    # final chunk is empty and must not be uploaded.
    if tot > 0:
        logger.debug("Submitting bibupload %s -n %s" % (mode, filename))
        task_low_level_submission('bibupload', 'openaire', mode, filename, '-n')
def convert_record(record, response_date, request):
    """Convert one OAI-PMH <record> DOM node (Hindawi) into MARCXML.

    @param record: xml.dom node of the harvested OAI record
    @param response_date: OAI responseDate of the harvest
    @param request: the OAI request URL (stored for provenance)
    @return: tuple (marcxml_or_None, new) -- *new* is True when the record
        was not yet known under its OAI identifier; a deleted record that
        was never harvested yields (None, True).
    """
    header = record.getElementsByTagName("header")[0]
    oai_identifier = get_value_in_tag(header, "identifier")
    datestamp = get_value_in_tag(header, "datestamp")
    status = header.getAttribute("status").encode('utf8')
    rec = {}
    # Provenance field (035): OAI id plus harvesting metadata.
    record_add_field(rec, tag="035", subfields=[('a', oai_identifier),
                                                ('u', request),
                                                ('9', 'Hindawi'),
                                                ('d', datestamp),
                                                ('h', response_date),
                                                ('m', 'marc21'),
                                                ('t', 'false')])
    new = True
    if find_records_from_extoaiid(oai_identifier, 'Hindawi'):
        new = False
    if status == 'deleted':
        if new:
            ## deleting a record we didn't have? Who cares :-)
            return None, True
        else:
            # Known record withdrawn upstream: mark it DELETED.
            record_add_field(rec, tag="980", subfields=[('a', 'SCOAP3'),
                                                        ('b', 'Hindawi'),
                                                        ('c', 'DELETED')])
            return record_xml_output(rec), False
    # Copy every datafield verbatim, defaulting blank indicators to spaces.
    for datafield in record.getElementsByTagName("datafield"):
        tag = datafield.getAttribute("tag").encode('utf-8')
        ind1 = datafield.getAttribute("ind1").encode('utf-8') or ' '
        ind2 = datafield.getAttribute("ind2").encode('utf-8') or ' '
        subfields = []
        for subfield in datafield.getElementsByTagName("subfield"):
            code = subfield.getAttribute("code").encode('utf-8')
            value = xml_to_text(subfield)
            subfields.append((code, value))
        record_add_field(rec, tag=tag, ind1=ind1, ind2=ind2, subfields=subfields)
    return record_xml_output(rec), new
def compare_references(test, a, b):
    """Assert that two reference MARCXML buffers are equivalent.

    Both buffers are parsed and re-serialised so formatting differences
    disappear; the refextract signature subfield (999 C $6) is removed from
    the first record before comparing.
    """
    record_a = create_record(a)[0]
    record_b = create_record(b)[0]
    # Normalise: drop the Invenio refextract signature from the candidate.
    record_delete_field(record_a, '999', 'C', '6')
    xml_a = record_xml_output(record_a)
    xml_b = record_xml_output(record_b)
    test.assertXmlEqual(xml_a, xml_b)
def _prepare_marcxml(recid_a, rn_a, recid_b, rn_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True):
    """Build the MARCXML that cross-links two records.

    Each record receives one relationship field (CFG_OTHER_RELATIONSHIP_ENTRY)
    pointing at the other record; ind1 is "0" when the link should be
    displayed, "1" otherwise.
    """
    record_a = {}
    record_b = {}
    record_add_field(record_a, "001", controlfield_value=str(recid_a))
    record_add_field(record_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1="0" if display_in_a else "1",
                     subfields=[('i', what_is_b_for_a),
                                ('r', rn_b),
                                ('w', str(recid_b))])
    record_add_field(record_b, "001", controlfield_value=str(recid_b))
    record_add_field(record_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                     ind1="0" if display_in_b else "1",
                     subfields=[('i', what_is_a_for_b),
                                ('r', rn_a),
                                ('w', str(recid_a))])
    xml_a = record_xml_output(record_a)
    xml_b = record_xml_output(record_b)
    return "<collection>\n%s\n%s</collection>" % (xml_a, xml_b)
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @param task_name: scheduler name under which the bibupload task runs
    @param sequence_id: optional task sequence id forwarded to bibupload
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]
    # Clean the record from unfilled volatile fields.
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)
    # Order subfields alphabetically before saving the record.
    record_order_subfields(record)
    xml_to_write = wash_for_xml(record_xml_output(record))
    # Write XML file.
    if not to_merge:
        # mkstemp avoids clobbering concurrent saves of the same record.
        fd, file_path = tempfile.mkstemp(dir=CFG_BIBEDIT_CACHEDIR,
                                         prefix="%s_" % CFG_BIBEDIT_FILENAME,
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()
    user_name = get_user_info(uid)[1]
    if to_upload:
        # Replace-mode upload (-r) of the file we just wrote.
        args = ['bibupload', user_name, '-P', '5', '-r', file_path,
                '-u', user_name]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True
def test_for_special_delete_field(self):
    """ BibUpload Revision Verifier - Rev1-100/300, Modified 100 in Rev1-Mod, Deleted 300 in Rev1-Mod (100/300), Patch for DELETE generated"""
    upload_rec = xml_marc_to_records(self.rev1_mod)
    orig_rec = xml_marc_to_records(self.rev1)
    rev_verifier = RevisionVerifier()
    (opt_mode, final_patch, dummy_affected_tags) = rev_verifier.verify_revision(
        upload_rec[0], orig_rec[0], 'replace')
    self.assertEqual('correct', opt_mode)
    # The generated patch dict is unordered, so accept either serialisation.
    # failUnless is a deprecated unittest alias (since Python 2.7); use
    # assertTrue like the other assertions in this suite.
    self.assertTrue(
        (compare_xmbuffers(self.patch_1, record_xml_output(final_patch)) != '') or
        (compare_xmbuffers(self.patch_2, record_xml_output(final_patch)) != ''))
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    spec_name=''):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @param spec_name: optional task name (-N) forwarded to bibupload
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_file_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            # record_strip_empty_fields(record) # now performed for every
            # record after removing unfilled volatile fields
            xml_record = record_xml_output(record)
            delete_cache_file(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]
    # Clean the record from unfilled volatile fields.
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)
    # Order subfields alphabetically before saving the record.
    # NOTE(TP, translated from Czech "nechceme"): "we don't want this" --
    # the author flagged the alphabetical reordering as undesired here;
    # the call is kept as in the sibling save_xml_record variant. Confirm
    # whether it was meant to be disabled.
    record_order_subfields(record)
    xml_to_write = wash_for_xml(record_xml_output(record))
    # Write XML file.
    if not to_merge:
        file_path = '%s.xml' % _get_file_path(recid, uid)
    else:
        file_path = '%s_%s.xml' % (_get_file_path(recid, uid),
                                   CFG_BIBEDIT_TO_MERGE_SUFFIX)
    xml_file = open(file_path, 'w')
    xml_file.write(xml_to_write)
    xml_file.close()
    user_name = get_user_info(uid)[1]
    if to_upload:
        # TP: check whether to add spec name
        if spec_name == '':
            # Pass XML file to BibUpload.
            task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                      file_path, '-u', user_name)
        else:
            task_low_level_submission('bibupload', 'bibedit', '-P', '5', '-r',
                                      file_path, '-u', user_name,
                                      '-N', spec_name)
    return True
def apply_hepnames_updates(hepname_updates):
    """Append external identifiers (035) to HepNames records via bibupload.

    @param hepname_updates: dict mapping recid -> dict of identifier
        entries; only ORCID / ORIGINAL_BAI / INSPIRE / KAKEN keys are used
        (ORIGINAL_BAI is stored under the 'BAI' label).
    """
    bibupload = ChunkedBibUpload(mode='a', user='******')
    for recid, entry in hepname_updates.iteritems():
        record = {}
        record_add_field(record, '001', controlfield_value=str(recid))
        for key, value in entry.iteritems():
            if key in ('ORCID', 'ORIGINAL_BAI', 'INSPIRE', 'KAKEN'):
                if key == 'ORIGINAL_BAI':
                    key = 'BAI'
                record_add_field(record, '035',
                                 subfields=[('a', value), ('9', key)])
        # Serialise once; the original called record_xml_output twice per
        # record (once for logging, once for upload).
        marcxml = record_xml_output(record)
        write_message(marcxml)
        bibupload.add(marcxml)
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", checked=True, displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page

    @returns: record formated to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)
    textmarc_options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                        "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                        "text-marc": 1}
    # Records outside the current page are not formatted at all.
    if record_id not in displayed_records:
        return
    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(
        old_record, sysno="", options=textmarc_options)
    if "hm" == output_format:
        if update_commands and checked:
            # Show a coloured diff between the stored and the updated record.
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(
                updated_record, sysno="", options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc,
                                      outputTags, record_id)
        else:
            # Plain textmarc listing, optionally restricted to outputTags.
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or line.split()[0].replace('_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None, of=output_format,
                                         xml_record=xml_record, ln=language)
    return result
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags=""):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: tags to restrict the textmarc display to
    """
    if update_commands:
        updated_record = _get_updated_record(record_id, update_commands)
    old_record = search_engine.get_record(recid=record_id)
    xml_record = bibrecord.record_xml_output(old_record)
    if "hm" == output_format:
        result = "<pre>\n"
        if ("All tags" not in outputTags) and outputTags:
            # Restricted tag view: diff lines carry the tag in column 1,
            # plain textmarc lines in column 0.
            if update_commands:
                marc_record = _get_record_diff(record_id, old_record,
                                               updated_record)
                tag_position = 1
            else:
                marc_record = _create_marc(xml_record)
                tag_position = 0
            for line in marc_record.split('\n')[:-1]:
                if line.split()[tag_position][:3] in outputTags:
                    if update_commands:
                        result += line.strip() + '\n'
                    else:
                        result += "%09d " % record_id + line.strip() + '\n'
                elif '<strong' in line:
                    # Highlighted (modified) diff lines embed the tag inside
                    # HTML markup at token 3.
                    if line.split()[3][5:8] in outputTags:
                        result += line.strip() + '\n'
        else:
            # "All tags" view.
            if update_commands:
                result += _get_record_diff(record_id, old_record,
                                           updated_record)
            else:
                marc_record = _create_marc(xml_record)
                for line in marc_record.split('\n')[:-1]:
                    result += "%09d " % record_id + line.strip() + '\n'
        result += "</pre>"
        return result
    if update_commands:
        xml_record = bibrecord.record_xml_output(updated_record)
    result = bibformat.format_record(recID=None, of=output_format,
                                     xml_record=xml_record, ln=language)
    return result
def test_correcting_del_field_add_field_diff_ind(self):
    """ BibUpload Revision Verifier - Rev3-100/970__/888, Deleted 970__ and Added 970CP in Rev2(100/970__), Patch Generated for 970__/970CP"""
    upload_recs = xml_marc_to_records(self.rev2_mod_del_one_add_one)
    orig_recs = xml_marc_to_records(self.data["rev3"][0])
    rev_verifier = RevisionVerifier()
    (opt_mode, patch, dummy_affected_tags) = rev_verifier.verify_revision(
        upload_recs[0], orig_recs[0], "replace")
    self.assertEqual("correct", opt_mode)
    # NOTE: for multiple fields in a patch it is better to compare against
    # the different possible patch strings, because the generated patch
    # dictionary has no guaranteed key order.
    # failUnless is a deprecated unittest alias (since Python 2.7); use
    # assertTrue like the other assertions in this suite.
    self.assertTrue(
        (compare_xmbuffers(record_xml_output(patch),
                           self.patch_del_one_add_one) != "") or
        (compare_xmbuffers(record_xml_output(patch),
                           self.patch_del_one_add_one_2) != ""))
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags=""):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: tags to restrict the textmarc display to
    """
    updated_record = _get_updated_record(record_id, update_commands)
    xml_record = bibrecord.record_xml_output(updated_record)
    old_record = search_engine.get_record(recid=record_id)
    if "hm" == output_format:
        result = "<pre>\n"
        # NOTE(review): this condition is true also when outputTags is empty,
        # in which case the inner `for tag in outputTags` loop never runs and
        # nothing is emitted -- compare with the sibling variants; confirm
        # the intended logic.
        if "All tags" not in outputTags or not outputTags:
            diff_result = _get_record_diff(record_id, old_record,
                                           updated_record)
            for line in diff_result.split('\n')[:-1]:
                for tag in outputTags:
                    if tag in line.split()[1]:
                        result += line.strip() + '\n'
                    elif '<strong' in line:
                        if tag in line.split()[3]:
                            result += line.strip() + '\n'
        else:
            result += _get_record_diff(record_id, old_record, updated_record)
        result += "</pre>"
        return result
    result = bibformat.format_record(recID=None, of=output_format,
                                     xml_record=xml_record, ln=language)
    return result
def replace_references(recid):
    """Return the record's MARCXML with its 999 reference fields regenerated.

    The stored record itself is not updated; the MARCXML of the document
    with refreshed references is returned, or ``None`` when reference
    extraction produced nothing.

    Parameters:

    * recid: the id of the record
    """
    # Run refextract on the record and parse the resulting MARCXML.
    references_xml = extract_references_from_record_xml(recid)
    references = create_record(references_xml.encode("utf-8"))
    record = get_record(recid)
    if not references[0]:
        return None
    extracted_fields = record_get_field_instances(references[0], tag="999",
                                                  ind1="%", ind2="%")
    # Swap the old 999 block for the freshly extracted one.
    record_delete_fields(record, "999")
    record_add_fields(record, "999", extracted_fields)
    return record_xml_output(record)
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags=""):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: tags to restrict the textmarc display to
    """
    updated_record = _get_updated_record(record_id, update_commands)
    xml_record = bibrecord.record_xml_output(updated_record)
    if "hm" == output_format:
        result = "<pre>\n"
        marc_record = _create_marc(xml_record)
        # NOTE(review): this condition is true also when outputTags is empty,
        # in which case the inner `for tag in outputTags` loop never runs and
        # nothing is listed -- confirm the intended branch logic.
        if "All tags" not in outputTags or not outputTags:
            for line in marc_record.split('\n')[:-1]:
                for tag in outputTags:
                    if tag in line.split()[0]:
                        result += "%09d " % record_id + line.strip() + '\n'
        else:
            for line in marc_record.split('\n')[:-1]:
                result += "%09d " % record_id + line.strip() + '\n'
        result += "</pre>"
        return result
    result = bibformat.format_record(recID=None, of=output_format,
                                     xml_record=xml_record, ln=language)
    return result
def main():
    """Filter a MARCXML file, keeping only the requested tags per record."""
    usage = """ Usage: $ %s [tags_csv] [marcxml_in] [marcxml_out]
    tags_csv     Tags to preserve as CSVs
    marcxml_in   MARCXML file to read from
    marcxml_out  MARCXML file to write""" % (PROGRAM_NAME,)
    if len(argv) == 4:
        tags = argv[1].split(',')
        fin = argv[2]
        fout = argv[3]
    else:
        print(usage)
        return
    with open(fin) as handle:
        records = create_records(handle.read())
    xmlout = ('<?xml version="1.0"?>\n' +
              '<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
    for record, err, reason in records:
        # NOTE(review): bibrecord's create_records reports the per-record
        # status -- confirm whether it is the string '0' or the int 0 in the
        # targeted version; an int would make this comparison never match.
        if err == '0':
            print('Error: Could not create record\n' + reason)
        else:
            xmlout += record_xml_output(record, tags=tags) + '\n'
    with open(fout, 'w') as handle:
        handle.write(xmlout + '</collection>\n')
def get_record(self, xml_file):
    """ Reads a xml file in JATS format and returns
        a xml string in marc format """
    self.document = parse(xml_file)
    rec = {}
    title = self._get_title()
    if title:
        record_add_field(rec, '245', subfields=[('a', title)])
    journal, volume, issue, year, start_date, doi,\
        article_id = self._get_publition_information()
    if start_date:
        record_add_field(rec, '260', subfields=[('c', start_date)])
    if doi:
        record_add_field(rec, '024', ind1='7', subfields=[('a', doi),
                                                          ('2', 'DOI')])
    # First author goes to 100, the rest to 700.
    authors = self._get_authors()
    first_author = True
    for author in authors:
        subfields = [('a', author[0]), ('v', author[1])]
        if first_author:
            record_add_field(rec, '100', subfields=subfields)
            first_author = False
        else:
            record_add_field(rec, '700', subfields=subfields)
    abstract = self._get_abstract()
    if abstract:
        record_add_field(rec, '520', subfields=[('a', abstract),
                                                ('9', 'APS')])
    copyrightt = self._get_copyright()
    if copyrightt:
        year = ''
        # NOTE(review): assumes UTF-8 byte strings -- '©' occupies two
        # bytes, hence the [2:] slices below; under unicode this would
        # need [1:]. Confirm the statement encoding.
        if copyrightt.startswith('©'):
            year = copyrightt[2:].strip()
            year = year.split()[0]
        if year.isdigit():
            copyrightt = copyrightt[2:].strip()
            copyrightt = " ".join(copyrightt.split()[1:])
            record_add_field(rec, '542', subfields=[('d', copyrightt),
                                                    ('g', year),
                                                    ('3', 'Article')])
        else:
            year = start_date[:4]
            record_add_field(rec, '542', subfields=[('f', copyrightt),
                                                    ('g', year),
                                                    ('3', 'Article')])
    # NOTE(review): `year` may have been reassigned from the copyright
    # statement above, so 773 $y is not necessarily the journal year from
    # _get_publition_information -- confirm this is intended.
    record_add_field(rec, '773', subfields=[('p', journal), ('v', volume),
                                            ('n', issue), ('y', year),
                                            ('c', article_id)])
    record_add_field(rec, '980', subfields=[('a', 'HEP')])
    record_add_field(rec, '980', subfields=[('a', 'Citeable')])
    record_add_field(rec, '980', subfields=[('a', 'Published')])
    self._add_references(rec)
    try:
        return record_xml_output(rec)
    except UnicodeDecodeError:
        sys.stderr.write("""Found a bad char in the file for the article """ + doi)
        return ""
def upload_amendments(records, holdingpen): """ Upload a modified record """ if task_get_option("no_upload", False) or len(records) == 0: return xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">' for record in records: xml += record_xml_output(record) xml += "</collection>" tmp_file_fd, tmp_file = mkstemp( suffix='.xml', prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"), dir=CFG_TMPSHAREDDIR ) os.write(tmp_file_fd, xml) os.close(tmp_file_fd) os.chmod(tmp_file, 0644) if holdingpen: flag = "-o" else: flag = "-r" task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file) write_message("Submitted bibupload task %s" % task)
def _get_formated_record(record_id, output_format, update_commands, language):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    """
    updated_record = _get_updated_record(record_id, update_commands)
    xml_record = bibrecord.record_xml_output(updated_record)
    # FIXME: Remove this as soon as the formatting for MARC is
    # implemented in bibformat
    if output_format == "hm":
        return _create_marc(xml_record)
    return bibformat.format_record(recID=None, of=output_format,
                                   xml_record=xml_record, ln=language)
def get_keywords_body(keywords, req, recid, argd):
    """Returns the body associated with the keywords.

    Renders the keywords of a record as a list, tag cloud or raw MARCXML
    (653 fields), depending on argd['type'], and writes the result to req.
    """
    body = []
    rec = get_record(recid)
    extend_argd(argd)
    if keywords:
        # A weight of 0 on any keyword means weights cannot be used.
        weights_available = 0 not in zip(*keywords)[1]
    else:
        req.write('There are no keywords associated with this document.<br>' \
                  '<form action="" method="get">' \
                  ' <input type="hidden" name="generate" value="yes">' \
                  ' <input type="submit" value="Generate keywords">' \
                  '</form>')
        return
    if argd['type'] == 'tagcloud' and not weights_available:
        # No weight is specified for at least one of the keywords.
        # Display the keywords as a list.
        argd['type'] = 'list'
    if argd['type'] == 'tagcloud':
        body.append('<div style="text-align: center; color: red; '
                    'font-size: 80%; margin-top: 15px">Single keywords in grey, '
                    'composite keywords in blue.</div>')
    if argd['type'] == 'list':
        # Display keywords as a list.
        body.append(_get_keywords_list(keywords, argd))
    elif argd['type'] == 'tagcloud':
        if argd['sort'] == 'related' and not keywords:
            # NOTE(review): this prints to the server's stdout, not to the
            # client via req.write/body.append -- looks like a bug; confirm.
            print 'No similar document was found.'
        # Separate single and composite keywords.
        single_keywords, composite_keywords = [], []
        for keyword in keywords:
            if ': ' in keyword[0]:
                composite_keywords.append(keyword)
            else:
                single_keywords.append(keyword)
        # Display keywords as a tag cloud.
        single_levels = _get_font_levels(single_keywords)
        composite_levels = _get_font_levels(composite_keywords)
        body.append(_get_html_tag_cloud(single_levels + composite_levels,
                                        argd))
    elif argd['type'] == 'xml':
        body.append('<pre><code>%s</code></pre>' %
                    escape_html(record_xml_output(rec, ['653'])))
    else:
        body = 'Unknown type: ' + argd['type']
    out = ''
    for element in body:
        out += '<br>' + element.encode('utf-8')
    req.write(out)
    return
def test_correcting_added_field_with_diff_ind(self):
    """ BibUpload Revision Verifier - Rev3-100/970__/888, Added 970CP in Rev2(100/970__), Patch Generated for 970CP"""
    uploaded = xml_marc_to_records(self.rev2_mod_field_diff_ind)
    original = xml_marc_to_records(self.data["rev3"][0])
    verifier = RevisionVerifier()
    # Verifying the revision should downgrade 'replace' to 'correct' and
    # emit a patch containing only the added 970CP field.
    opt_mode, patch, dummy_affected_tags = verifier.verify_revision(
        uploaded[0], original[0], "replace")
    self.assertEqual("correct", opt_mode)
    self.assertEqual(
        compare_xmbuffers(record_xml_output(patch), self.patch_diff_ind), "")
def test_add_new_field(self):
    """ BibUpload Revision Verifier - Rev3-100/970/888, Added 300 to Rev2(100/970), Patch Generated for 300"""
    uploaded = xml_marc_to_records(self.rev2_add_field)
    original = xml_marc_to_records(self.data["rev3"][0])
    verifier = RevisionVerifier()
    # Verifying the revision should downgrade 'replace' to 'correct' and
    # emit a patch containing only the newly added 300 field.
    opt_mode, patch, dummy_affected_tags = verifier.verify_revision(
        uploaded[0], original[0], "replace")
    self.assertEqual("correct", opt_mode)
    self.assertEqual(compare_xmbuffers(record_xml_output(patch), self.patch),
                     "")
def _prepare_marcxml(recid_a, rn_a, recids_and_rns_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True,
                     marc_for_a=None, marc_for_b=None, upload_mode='append',
                     consider_empty_p=False):
    """Build the MARCXML collection that links record A with records B.

    @param recids_and_rns_b: list of (recid, report-number) pairs to link to
    @param what_is_a_for_b / what_is_b_for_a: relationship labels ($i);
        None suppresses that direction of the linking entirely
    @param marc_for_a / marc_for_b: optional MARC tag override for the link
    @param upload_mode: 'append' or 'correct' (correct re-emits existing links)
    @param consider_empty_p: also emit empty fields so that removed links
        are deleted on upload
    @return: '<collection>...</collection>' MARCXML string
    """
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
            _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b \
               and consider_empty_p:
            # Add empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a),
                                        ('r', rn_b),
                                        ('w', str(recid_b))])
        output += record_xml_output(record_a)
    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
            _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                          display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                # Keep the pre-existing links of the remote record, since a
                # correct-mode upload would otherwise wipe them.
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
            record_add_field(record_b, marc_tag_for_b, ind1=marc_ind1_for_b,
                             ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b),
                                        ('r', rn_a),
                                        ('w', str(recid_a))])
            output += record_xml_output(record_b)
        # Remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b,
                                                   display_in_b, upload_mode,
                                                   recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b, "001",
                                 controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                if not original_linking_fields:
                    # Add empty field in order to account for cases where all
                    # linkings are removed by the submitter
                    record_add_field(record_b, marc_tag_for_b,
                                     ind1=marc_ind1_for_b,
                                     ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
                output += record_xml_output(record_b)
    output += '</collection>'
    return output
def generate_marc_to_append(local, remote):
    """ Generates MarcXML to append an 035 remote ID to a record """
    rec = {}
    # 001 points at the local record; the remote id goes into a new 035.
    record_add_field(rec, '001', controlfield_value=str(local))
    position = record_add_field(rec, '035')
    record_add_subfield_into(rec, '035', '9', REMOTE_INSTANCE,
                             field_position_global=position)
    record_add_subfield_into(rec, '035', 'a', str(remote),
                             field_position_global=position)
    return record_xml_output(rec)
def test_interchanged_fields(self):
    """ BibUpload Revision Verifier - Rev1--100-1/100-2/100-3/970/888, Rev1-Up--100-2/100-3/100-1/970/888, Patch Generated for 100"""
    uploaded = xml_marc_to_records(self.rev1_mod)
    original = xml_marc_to_records(self.rev1)
    verifier = RevisionVerifier()
    # Reordered 100 fields should still be detected and patched correctly.
    opt_mode, patch, dummy_affected_tags = verifier.verify_revision(
        uploaded[0], original[0], "replace")
    self.assertEqual("correct", opt_mode)
    self.assertEqual(compare_xmbuffers(record_xml_output(patch), self.patch),
                     "")
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", run_diff=True):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param run_diff: determines if we want to run _get_recodr_diff function, which sometimes takes too much time
    """
    if update_commands:
        # Modify the bibrecord object with the appropriate actions.
        updated_record = _get_updated_record(record_id, update_commands)
    textmarc_options = {"aleph-marc": 0, "correct-mode": 1, "append-mode": 0,
                        "delete-mode": 0, "insert-mode": 0, "replace-mode": 0,
                        "text-marc": 1}
    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(
        old_record, sysno="", options=textmarc_options)
    if "hm" == output_format:
        if update_commands and run_diff:
            # Show a coloured diff between the stored and the updated record.
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(
                updated_record, sysno="", options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc,
                                      outputTags, record_id)
        else:
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            # NOTE(review): [:-1] drops the last textmarc line; the sibling
            # variant iterates all splitlines() -- confirm which is intended.
            for line in old_record_textmarc.splitlines()[:-1]:
                if not filter_tags or line.split()[0].replace('_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None, of=output_format,
                                         xml_record=xml_record, ln=language)
    return result
def create_xml(recs_to_change, subfields):
    """
    Create xmls for upload.

    @param recs_to_change: affected tagiic in recid
    @type recs_to_change: dict
    @param subfields: VOLATILE content in tagiic
    @type subfields: dict

    @return: (string, string) xml's for correct and delete
    """
    xml_correct = ''
    xml_delete = ''
    for recid in recs_to_change.keys():
        tags_correct = []
        tags_delete = []
        tags4update = []
        record_old = get_record(recid)
        record = deepcopy(record_old)
        for tagiic in recs_to_change[recid]:
            tag = tagiic[:3]
            for value in subfields[tagiic]:
                if record.has_key(tag):
                    for field_position, field in enumerate(record[tag]):
                        # Collect matching subfield positions first: the
                        # original deleted while enumerating the live
                        # subfield list, which shifts positions and can skip
                        # or mis-target subsequent deletions.
                        matches = [position
                                   for position, subfield in enumerate(field[0])
                                   if subfield[1] == value
                                   and subfield[0] == tagiic[5]]
                        # Delete from the highest position down so earlier
                        # deletions cannot invalidate remaining indices.
                        for subfield_position in reversed(matches):
                            record_delete_subfield_from(
                                record, tag, subfield_position,
                                field_position_local=field_position)
                            tags4update.append(tag)
        # A tag that vanished entirely goes into the delete XML; otherwise
        # the remaining content is uploaded as a correction.
        for tag in set(tags4update):
            if record.has_key(tag):
                tags_correct.append(tag)
            else:
                tags_delete.append(tag)
        if tags_correct:
            xml_correct += record_xml_output(record,
                                             ['001', '005'] + tags_correct) + '\n'
        if tags_delete:
            xml_delete += record_xml_output(record_old,
                                            ['001', '005'] + tags_delete) + '\n'
    return xml_correct, xml_delete
def test_add_identical_field(self):
    """ BibUpload Revision Verifier - Rev3-100/970/888, Added 100 to Rev2(100/970), Patch Generated for 100"""
    uploaded = xml_marc_to_records(self.rev2_add_sim_field)
    original = xml_marc_to_records(self.data['rev3'][0])
    verifier = RevisionVerifier()
    # Adding a second, identical 100 field should yield a correct-mode
    # patch for the 100 tag.
    opt_mode, patch, dummy_affected_tags = verifier.verify_revision(
        uploaded[0], original[0], 'replace')
    self.assertEqual('correct', opt_mode)
    self.assertEqual(
        compare_xmbuffers(record_xml_output(patch),
                          self.patch_identical_field), '')
def bibupload_record(record=None, collection=None,
                     file_prefix="bibuploadutils", mode="-c",
                     alias='bibuploadutils', opts=[]):
    """
    General purpose function that will write a MARCXML file and call
    bibupload on it.

    Either a single record or an iterable of records is accepted; when a
    collection exceeds CFG_MAX_RECORDS it is flushed and submitted in
    chunks. Returns the task id of the final submission, or None when
    nothing was uploaded.

    NOTE(review): `opts=[]` is a mutable default argument; it is only
    unpacked here, never mutated, so it is harmless -- but `opts=None`
    with a local default would be safer.
    """
    if collection is None and record is None:
        return
    (file_out, filename) = open_temp_file(file_prefix)
    if collection is not None:
        file_out.write("<collection>")
        tot = 0
        for rec in collection:
            file_out.write(record_xml_output(rec))
            tot += 1
            if tot == CFG_MAX_RECORDS:
                # Chunk limit reached: close and submit this file, then
                # start a fresh temporary file for the remaining records.
                file_out.write("</collection>")
                close_temp_file(file_out, filename)
                task_low_level_submission(
                    'bibupload', alias, mode, filename, *opts
                )
                (file_out, filename) = open_temp_file(file_prefix)
                file_out.write("<collection>")
                tot = 0
        file_out.write("</collection>")
    elif record is not None:
        tot = 1
        file_out.write(record_xml_output(record))
    close_temp_file(file_out, filename)
    # Submit whatever remains in the last (or only) file.
    if tot > 0:
        return task_low_level_submission(
            'bibupload', alias, mode, filename, *opts
        )
    return None
def write_records_to_file(output_dir, name, records, dry_run):
    """ Writes a new MARCXML file to specified path from a list of records.

    @param records: dict of recid -> record structure; empty records skipped
    @param dry_run: when true, only report how many entries would be written
    """
    if len(records) > 0:
        lines = ["<collection>"]
        for rec in records.itervalues():
            if rec != {}:
                lines.extend(record_xml_output(rec).split('\n'))
        lines.append("</collection>")
        if dry_run:
            _print_out("DRY: Ready to write " + str(len(records)) + " entries to file.")
        else:
            _print_out("-> Writing " + str(len(records)) + " entries to file...")
            write_list_to_file(output_dir, name, lines)
def write_record_to_file(filename, record_list):
    """Write non-empty records (duplicate fields dropped) to filename as MARCXML."""
    if not record_list:
        return
    chunks = ["<collection>"]
    for entry in record_list:
        if entry != {}:
            cleaned = record_drop_duplicate_fields(entry)
            chunks.append(record_xml_output(cleaned))
    chunks.append("</collection>")
    # Only touch the filesystem if at least one record survived filtering.
    if len(chunks) > 2:
        out_fd = open(filename, 'w')
        out_fd.write("\n".join(chunks))
        out_fd.close()
def write_record_to_file(filename, record_list):
    """Write a list of BibRecord structures to filename as MARCXML."""
    from invenio.bibrecord import record_xml_output
    if not record_list:
        return
    parts = ["<collection>"]
    for rec in record_list:
        if rec != {}:
            parts.append(record_xml_output(rec))
    parts.append("</collection>")
    # Skip the write entirely when nothing but the wrapper tags remain.
    if len(parts) > 2:
        out_fd = open(filename, 'w')
        out_fd.write("\n".join(parts))
        out_fd.close()
    return True
def bst_hepdata():
    """Harvest HEPData records, upload them, and append a HEPDATA 035
    identifier to any INSPIRE record that does not carry one yet."""
    uploader = ChunkedHepDataUpload()
    dumper = HepDataDumper()
    for hepdata_record in dumper:
        uploader.add(hepdata2marcxml(hepdata_record))
    already_tagged = intbitset(perform_request_search(p='035__9:HEPDATA'))
    missing = dumper.inspire_ids - already_tagged
    appender = ChunkedBibUpload(mode='a', user='******')
    for recid in missing:
        amendment = {}
        record_add_field(amendment, tag="001",
                         controlfield_value=str(recid))
        record_add_field(amendment, tag="035",
                         subfields=[('a', 'ins%s' % recid),
                                    ('9', 'HEPDATA')])
        appender.add(record_xml_output(amendment))
def modify_record_timestamp(revision_xml, last_revision_ts):
    """Return revision_xml with controlfield 005 set to last_revision_ts.

    @param revision_xml: MARCXML of the record revision to stamp
    @type revision_xml: string
    @param last_revision_ts: timestamp to store in tag 005
    @type last_revision_ts: string
    @return: the stamped MARCXML
    """
    record = create_record(revision_xml)[0]
    if "005" not in record:
        record_add_field(record, '005',
                         controlfield_value=last_revision_ts)
    else:
        record_modify_controlfield(record, "005", last_revision_ts,
                                   field_position_local=0)
    return record_xml_output(record)
def approve_record(self, recid):
    """Approve a record to make it publicly available.

    Builds a correction record (001 + 980 OPENAIRE), writes it to a
    temporary MARCXML file and schedules bibupload/bibindex/webcoll.

    @param recid: id of the record to approve
    """
    # Make MARCXML to approve record
    rec = {}
    record_add_field(rec, '001', controlfield_value=str(recid))
    record_add_field(rec, '980', subfields=[('a', 'OPENAIRE')])
    output = "<collection>%s</collection>" % record_xml_output(rec)
    # Upload MARCXML
    run_sql("TRUNCATE schTASK")  # Ensures we run bibupload
    (hdl, marcxml_path) = mkstemp(suffix=".xml", text=True)
    # BUGFIX: the old code leaked the mkstemp file descriptor and never
    # closed the separately re-opened file. Wrap the fd and close it.
    marcxml_file = os.fdopen(hdl, 'w')
    try:
        marcxml_file.write(output)
    finally:
        marcxml_file.close()
    task_low_level_submission('bibupload', 'openaire', '-c', marcxml_path,
                              '-P5')
    task_low_level_submission('bibindex', 'openaire')
    task_low_level_submission('webcoll', 'openaire')
    os.system("%s/bin/bibupload 1 > /dev/null" % CFG_PREFIX)
    os.system("%s/bin/bibindex 2 > /dev/null" % CFG_PREFIX)
    os.system("%s/bin/webcoll 3 > /dev/null" % CFG_PREFIX)
def replace_references(recid, uid=None, txt=None, url=None): """Replace references for a record The record itself is not updated, the marc xml of the document with updated references is returned Parameters: * recid: the id of the record * txt: references in text mode * inspire: format of ther references """ # Parse references if txt is not None: references_xml = extract_references_from_string_xml( txt, is_only_references=True) elif url is not None: references_xml = extract_references_from_url_xml(url) else: references_xml = extract_references_from_record_xml(recid) references = create_record(references_xml.encode('utf-8')) dummy1, dummy2, record, dummy3, dummy4, dummy5, dummy6 = get_cache_file_contents( recid, uid) out_xml = None references_to_add = record_get_field_instances(references[0], tag='999', ind1='C', ind2='5') refextract_status = record_get_field_instances(references[0], tag='999', ind1='C', ind2='6') if references_to_add: # Replace 999 fields record_delete_fields(record, '999') record_add_fields(record, '999', references_to_add) record_add_fields(record, '999', refextract_status) # Update record references out_xml = record_xml_output(record) return out_xml
def main(): """ TODO: Fix file download of funny URLS ?conf... No keywords - check presentation pubype not shown for some reason. """ f = codecs.open(os.path.expanduser("~/Desktop/emi.csv"), "r", "utf-8") fout = open(os.path.expanduser("~/Desktop/emi.xml"), "w") fout.write("<collection>") for (i, row) in enumerate(unicode_csv_reader(f)): if i == 0: continue print i try: fout.write(record_xml_output(handle_row(i, row))) except Exception, e: print e print "Couldn't handle row:", i
def bst_hal():
    """Match INSPIRE records against HAL by DOI/arXiv id and append the
    matched HAL identifier as an 035 field via bibupload.

    @return: True on completion (tasklet convention)
    """
    doi_map, arxiv_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    tot_records = matchable_records - hal_records
    write_message("Records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            # Periodic progress report and cooperative sleep point.
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [
            arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map
        ]
        # We only proceed when exactly one HAL document matched.
        # (FIX: dropped the unused 'matched_hal_id' set of object ids that
        # was computed here and never read.)
        if len(matched_hal) > 1:
            write_message(
                "WARNING: record %s matches more than 1 HAL record: %s" %
                (recid, matched_hal),
                stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']
        rec = {}
        record_add_field(rec, '001', controlfield_value=str(recid))
        record_add_field(rec, '035', subfields=[('a', hal_id), ('9', 'HAL')])
        write_message("Record %s matched HAL record %s" % (recid, hal_id))
        bibupload.add(record_xml_output(rec))
    return True
def bst_labssync():
    """ Synchronizes from Labs via redis.

    Pops record ids from the CFG_REDIS_KEY set, fetches each record's
    MARCXML from the Labs API and replays it locally through a
    replace-mode ChunkedBibUpload. Failed ids are reported at the end.
    """
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    user_agent = make_user_agent_string('labssync')
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = 'application/marcxml+xml'
    tot = r.scard(CFG_REDIS_KEY)
    if tot == 0:
        write_message("Nothing to do")
        return
    else:
        write_message("At least %s records to synchronize from labs" % tot)
    errors = []
    final_total = 0
    # mode='r': each fetched record replaces the local copy.
    uploader = ChunkedBibUpload(mode='r', user='******')
    while True:
        # spop both fetches and removes the element, so each id is
        # processed at most once even across concurrent runs.
        elem = r.spop(CFG_REDIS_KEY)
        if not elem:
            break
        final_total += 1
        try:
            record = s.get("https://%s/api/%s" %
                           (CFG_LABS_HOSTNAME, elem)).text
            # Let's strip collection/XML header
            record = record_xml_output(create_record(record)[0])
            uploader.add(record)
            task_sleep_now_if_required()
        except Exception as err:
            # Keep going on individual failures; collect ids for the
            # summary message below.
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (elem, err),
                          stream=sys.stderr)
            errors.append(elem)
    write_message("Finally synced %s records from labs" % final_total)
    if errors:
        write_message(
            "All those %s records had errors and might need to be resynced: %s"
            % (len(errors), ', '.join(errors)))
def hepdata2marcxml(record):
    """Render one HEPData record dict as a MARCXML string."""
    marc = {}
    record_add_field(marc, '024', '7',
                     subfields=[('a', record['doi']), ('2', 'DOI')])
    if record.get('title'):
        title_template = 'Data from {title} from: {paper_title}'
    else:
        title_template = 'Additional data from: {paper_title}'
    full_title = title_template.format(title=record.get('title'),
                                       paper_title=record['paper_title'])
    record_add_field(marc, '245',
                     subfields=[('a', full_title), ('9', 'HEPDATA')])
    record_add_field(marc, '336', subfields=[('t', 'DATASET')])
    record_add_field(marc, '520',
                     subfields=[('h', record['abstract']),
                                ('9', 'HEPDATA')])
    for keyword in record['keywords']:
        value = keyword['value']
        if keyword['name'] in ('observables', 'cmenergies'):
            # Qualify these keyword kinds with their name.
            value = '%s: %s' % (keyword['name'], value)
        record_add_field(marc, '695',
                         subfields=[('a', value), ('9', 'HEPDATA')])
    for collaboration in record['collaborations']:
        record_add_field(marc, '710', subfields=[('g', collaboration)])
    record_add_field(marc, '786',
                     subfields=[('q', str(record['position'])),
                                ('w', str(record['inspire_id']))])
    record_add_field(marc, '980', subfields=[('a', 'DATA')])
    return record_xml_output(marc)
def upload_amendments(records, holdingpen): """ Upload a modified record """ if task_get_option("no_upload", False) or len(records) == 0: return xml = '<collection xmlns="http://www.loc.gov/MARC21/slim">' for record in records: xml += record_xml_output(record) xml += "</collection>" tmp_file_fd, tmp_file = mkstemp(suffix='.xml', prefix="bibcheckfile_%s" % time.strftime("%Y-%m-%d_%H:%M:%S"), dir=CFG_TMPSHAREDDIR) os.write(tmp_file_fd, xml) os.close(tmp_file_fd) os.chmod(tmp_file, 0644) if holdingpen: flag = "-o" else: flag = "-r" task = task_low_level_submission('bibupload', 'bibcheck', flag, tmp_file) write_message("Submitted bibupload task %s" % task)
def bst_scoap3_importer():
    """Import from SCOAP3."""
    try:
        request = requests.get(
            'http://repo.scoap3.org/ffts_for_inspire.py/csv')
    except (HTTPError, ConnectionError, Timeout):
        register_exception()
        return
    task_sleep_now_if_required(can_stop_too=True)
    # Two output files: one for corrections of existing files (update)
    # and one for newly attached files (new).
    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"
    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    # We strip the first line.
    for line in request.text.split("\n")[1:]:
        if not line.strip():
            continue
        task_sleep_now_if_required(can_stop_too=True)
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in line.split(',')
        ]
        write_message(line.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        # Match the SCOAP3 row to exactly one INSPIRE record by arXiv id
        # and/or DOI.
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        # Compare against already attached SCOAP3 files to decide between
        # doing nothing, updating a revision, or appending a new file.
        for doc in record.list_latest_files('SCOAP3'):
            if doc.format == file_format:
                if doc.checksum == checksum:
                    write_message(
                        "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                else:
                    write_message(
                        "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                        % (inspire_record, doc.checksum, checksum))
                    action = "UPDATE"
                break
        else:
            write_message("... OK: need to add new file to INSPIRE record %s"
                          % inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            record_add_field(rec, '001',
                             controlfield_value=str(inspire_record))
            if action == "UPDATE":
                line_count_update += 1
                print >> out_update, record_xml_output(rec)
            elif action == "APPEND":
                line_count_new += 1
                print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()
    if line_count_new:
        # We use correct here instead of append to deal with potential sync issues.
        # Basically BibUpload should handle "new" corrections as "append" if it is not there.
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_new)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, id))
    else:
        remove(name_update)
# NOTE(review): fragment — the enclosing function's start (where
# match_results, records, recs_out, noprocess, textmarc_output and
# batch_output are bound) is not visible in this chunk; indentation of
# the leading statements is reconstructed and should be confirmed.
(len(match_results[2]), ))
sys.stderr.write("\n Fuzzy records : %d\n" % (len(match_results[3]), ))
sys.stderr.write("=" * 35)
sys.stderr.write("\n Total records : %d\n" % (len(records), ))
if not noprocess:
    options = {'text-marc': 1, 'aleph-marc': 0}
    for record, results in recs_out:
        if textmarc_output:
            # FIXME: textmarc output does not print matching results
            sysno = get_sysno_from_record(record, options)
            print create_marc_record(record, sysno, options)
        else:
            print results
            print record_xml_output(record)
if batch_output:
    i = 0
    options = {'text-marc': 1, 'aleph-marc': 0}
    # One output file per match category.
    outputs = ['new', 'matched', 'ambiguous', 'fuzzy']
    for result in match_results:
        filename = "%s.%s" % (batch_output, outputs[i])
        file_fd = open(filename, "w")
        for record, results in result:
            out = []
            if textmarc_output:
                # FIXME: textmarc output does not print matching results
                sysno = get_sysno_from_record(record, options)
                out.append(create_marc_record(record, sysno, options))
            else:
# NOTE(review): fragment — older variant of the bibmatch result printer;
# the enclosing function's start and the tail of the batch-output loop
# fall outside this chunk. Records here are tuples: record[0] is the
# bibrecord structure, record[3] the match result text (per the uses
# below) — confirm against the full function.
sys.stderr.write("\n New records : %d" % len(match_results[0]))
sys.stderr.write("\n Matched records : %d" % len(match_results[1]))
sys.stderr.write("\n Ambiguous records : %d" % len(match_results[2]))
sys.stderr.write("\n Fuzzy records : %d\n" % len(match_results[3]))
sys.stderr.write("=" * 35)
sys.stderr.write("\n Total records : %d\n" % len(records))
if not noprocess:
    options = {'text-marc':1, 'aleph-marc':0}
    for record in recs_out:
        if textmarc_output:
            sysno = get_sysno_from_record(record[0], options)
            print create_marc_record(record[0], sysno, options)
        else:
            print record[3]
            print record_xml_output(record[0])
if batch_output:
    i = 0
    options = {'text-marc':1, 'aleph-marc':0}
    for result in match_results:
        # Output files are numbered by category index here (.0/.1/...),
        # unlike the newer variant which uses named suffixes.
        filename = "%s.%i" % (batch_output, i)
        file_fd = open(filename,"w")
        for record in result:
            out = ""
            if textmarc_output:
                sysno = get_sysno_from_record(record[0], options)
                out += create_marc_record(record[0], sysno, options)
            else:
                out += record[3]
                out += record_xml_output(record[0])
def save_xml_record(recid, uid, xml_record='', to_upload=True, to_merge=False,
                    task_name="bibedit", sequence_id=None):
    """Write XML record to file. Default behaviour is to read the record from
    a BibEdit cache file, filter out the unchanged volatile subfields,
    write it back to an XML file and then pass this file to BibUpload.

    @param xml_record: give XML as string in stead of reading cache file
    @param to_upload: pass the XML file to BibUpload
    @param to_merge: prepare an XML file for BibMerge to use
    @return: True on completion
    """
    if not xml_record:
        # Read record from cache file.
        cache = get_cache_contents(recid, uid)
        if cache:
            record = cache[2]
            used_changes = cache[4]
            xml_record = record_xml_output(record)
            # The cache entry is consumed once read.
            delete_cache(recid, uid)
            delete_disabled_changes(used_changes)
    else:
        record = create_record(xml_record)[0]
    # clean the record from unfilled volatile fields
    record_strip_empty_volatile_subfields(record)
    record_strip_empty_fields(record)
    # order subfields alphabetically before saving the record
    record_order_subfields(record)
    xml_to_write = wash_for_xml(record_xml_output(record))
    # Write XML file.
    if not to_merge:
        # Private temp file for BibUpload.
        fd, file_path = tempfile.mkstemp(dir=CFG_BIBEDIT_CACHEDIR,
                                         prefix="%s_" %
                                         CFG_BIBEDIT_FILENAME,
                                         suffix="_%s_%s.xml" % (recid, uid))
        f = os.fdopen(fd, 'w')
        f.write(xml_to_write)
        f.close()
    else:
        # Deterministic path so BibMerge can find the file later.
        file_path = '%s_%s.xml' % (_get_file_path(
            recid, uid), CFG_BIBEDIT_TO_MERGE_SUFFIX)
        xml_file = open(file_path, 'w')
        xml_file.write(xml_to_write)
        xml_file.close()
    user_name = get_user_info(uid)[1]
    if to_upload:
        # -r: replace mode; -P 5: priority; -u: user attribution.
        args = [
            'bibupload', user_name, '-P', '5', '-r', file_path, '-u',
            user_name
        ]
        if task_name == "bibedit":
            args.extend(['--name', 'bibedit'])
        if sequence_id:
            # Sequence id keeps related uploads ordered.
            args.extend(["-I", sequence_id])
        args.extend(['--email-logs-on-error'])
        task_low_level_submission(*args)
    return True
def bst_scoap3_importer():
    """Import fulltext links from the SCOAP3 repository into INSPIRE.

    Reads the SCOAP3 CSV feed, matches each row to exactly one INSPIRE
    record by arXiv id and/or DOI, and schedules bibupload tasks that
    append new SCOAP3 fulltext files or correct changed ones.
    """
    task_sleep_now_if_required(can_stop_too=True)
    f = urllib.urlopen('http://repo.scoap3.org/ffts_for_inspire.py/csv')
    # Two output files: corrections for existing files and new attachments.
    fd_update, name_update = mkstemp(suffix='.xml',
                                     prefix='bibupload_scoap3_',
                                     dir=CFG_TMPSHAREDDIR)
    out_update = fdopen(fd_update, 'w')
    fd_new, name_new = mkstemp(suffix='.xml',
                               prefix='bibupload_scoap3_',
                               dir=CFG_TMPSHAREDDIR)
    out_new = fdopen(fd_new, 'w')
    print >> out_update, "<collection>"
    print >> out_new, "<collection>"
    line_count_new = 0  # to avoid empty bibupload
    line_count_update = 0  # to avoid empty bibupload
    f.readline()  ## Let's strip the header line
    for d in f:
        task_sleep_now_if_required(can_stop_too=True)
        # NOTE: renamed the former 'type' column variable to 'file_format'
        # to stop shadowing the builtin.
        recid, arxiv_id, cr_date, checksum, link, file_format, doi = [
            x.strip() for x in d.split(',')
        ]
        write_message(d.strip())
        if checksum == "None":
            write_message("... no PDF. Skipping")
            continue
        if arxiv_id == "None":
            inspire_record = perform_request_search(p="doi:%s" % (doi, ),
                                                    cc="HEP")
        else:
            inspire_record = perform_request_search(p="037:%s or doi:%s" %
                                                    (arxiv_id, doi),
                                                    cc="HEP")
        if len(inspire_record) > 1:
            write_message(
                "ERROR: more than one INSPIRE record matched %s and %s for SCOAP3 record %s: %s"
                % (arxiv_id, doi, recid, list(inspire_record)),
                stream=sys.stderr)
            continue
        elif not inspire_record:
            write_message(
                "WARNING: no INSPIRE record matched %s or %s for SCOAP3 record %s"
                % (arxiv_id, doi, recid),
                stream=sys.stderr)
            continue
        action = None  # do nothing
        rec = {}
        inspire_record = inspire_record[0]
        record = BibRecDocs(inspire_record)
        # Decide between doing nothing, updating a revision, or appending.
        for doc in record.list_latest_files():
            if doc.format in ('.pdf', '.pdf;pdfa'):
                if doc.bibdoc.doctype == 'SCOAP3':
                    if doc.checksum == checksum:
                        write_message(
                            "... OK: file alredy attached to INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                    else:
                        write_message(
                            "... OK: new revision available for INSPIRE record %s (doc.checksum=%s, checksum=%s)"
                            % (inspire_record, doc.checksum, checksum))
                        action = "UPDATE"
                    break
        else:
            write_message("... OK: need to add new file to INSPIRE record %s"
                          % inspire_record)
            action = "APPEND"
        if action:
            if file_format == '.pdf;pdfa':
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('f', '.pdf;pdfa'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            else:
                record_add_field(rec, 'FFT',
                                 subfields=[('a', link),
                                            ('n', 'scoap3-fulltext'),
                                            ('t', 'SCOAP3'),
                                            ('d', 'Article from SCOAP3')])
            record_add_field(rec, '001',
                             controlfield_value=str(inspire_record))
            if action == "UPDATE":
                line_count_update += 1
                print >> out_update, record_xml_output(rec)
            elif action == "APPEND":
                line_count_new += 1
                print >> out_new, record_xml_output(rec)
    print >> out_update, "</collection>"
    print >> out_new, "</collection>"
    out_new.close()
    out_update.close()
    if line_count_new:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-a", name_new)
        write_message("Scheduled bibupload --append %s with ID #%s" %
                      (name_new, id))
    else:
        remove(name_new)
    if line_count_update:
        id = task_low_level_submission("bibupload", "admin", "-N",
                                       "SCOAP3-import", "-c", name_update)
        # BUGFIX: this message previously reported name_new although the
        # submitted file is name_update.
        write_message("Scheduled bibupload --correct %s with ID #%s" %
                      (name_update, id))
    else:
        remove(name_update)
def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.

    Handle retrieving, submitting or cancelling the merging session.

    @param requestType: one of 'submit', 'cancel', 'getRecordCompare',
        'recCopy', 'recMerge', 'recMergeNC'
    @param uid: id of the user performing the request
    @param data: request payload; must carry 'recID1' and, depending on
        the request, 'recID2', 'record2Mode', 'duplicate', 'additional_data'
    @return: dict with 'resultCode'/'resultText' and possibly 'resultHtml'
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    result = {'resultCode': 0, 'resultText': ''}
    recid1 = data["recID1"]
    record1 = _get_record(recid1, uid, result)
    if result[
            'resultCode'] != 0:  #if record not accessible return error information
        return result
    if requestType == 'submit':
        if data.has_key('duplicate'):
            # Duplicate-merge: record2 is marked deleted and pointed at
            # record1, record1 gets a back-reference in 981.
            recid2 = data['duplicate']
            record2 = _get_record_slave(recid2, result, 'recid', uid)
            if result['resultCode'] != 0:  #return in case of error
                return result
            (errcode, message) = check_doi_status_after_merge(
                data["recID1"],
                data['duplicate'],
                record1,
                record2,
                record2_marked_as_duplicate_p=data.has_key('duplicate'),
                submit_confirmed_p=data.get('additional_data', {
                    'confirmed_submit': False
                }).get('confirmed_submit', False))
            if errcode:
                result['resultCode'] = errcode
                result['resultText'] = message
                return result
            # mark record2 as deleted
            record_add_field(record2, '980', ' ', ' ', '',
                             [('c', 'DELETED')])
            # mark record2 as duplicate of record1
            record_add_field(record2, '970', ' ', ' ', '',
                             [('d', str(recid1))])
            # add recid of deleted record to master record
            record_add_field(record1, '981', ' ', ' ', '',
                             [('a', str(recid2))])
            # To ensure updates happen in order, use a seq id
            sequence_id = str(random.randrange(1, 2147483648))
            # submit record2 to be deleted
            xml_record2 = record_xml_output(record2)
            save_xml_record(recid2,
                            uid,
                            xml_record2,
                            task_name="bibmerge",
                            sequence_id=sequence_id)
            # submit record1
            xml_record1 = record_xml_output(record1)
            save_xml_record(recid1,
                            uid,
                            xml_record1,
                            task_name="bibmerge",
                            sequence_id=sequence_id)
            # Delete cache file if it exists
            if cache_exists(recid1, uid):
                delete_cache(recid1, uid)
            result['resultText'] = 'Records submitted'
            return result
        # Plain submit (no duplicate): only record1 goes out, from cache.
        (errcode, message) = check_doi_status_after_merge(
            data["recID1"],
            data["recID2"],
            record1,
            None,
            submit_confirmed_p=data.get('additional_data', {
                'confirmed_submit': False
            }).get('confirmed_submit', False))
        if errcode:
            result['resultCode'] = errcode
            result['resultText'] = message
            return result
        #submit record1 from cache
        save_xml_record(recid1, uid, task_name="bibmerge")
        # Delete cache file if it exists
        if cache_exists(recid1, uid):
            delete_cache(recid1, uid)
        result['resultText'] = 'Record submitted'
        return result
    elif requestType == 'cancel':
        delete_cache(recid1, uid)
        result['resultText'] = 'Cancelled'
        return result
    # Remaining request types need the slave record loaded as well.
    recid2 = data["recID2"]
    mode = data['record2Mode']
    record2 = _get_record_slave(recid2, result, mode, uid)
    if result[
            'resultCode'] != 0:  #if record not accessible return error information
        return result
    if requestType == 'getRecordCompare':
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records compared'
    elif requestType == 'recCopy':
        copy_R2_to_R1(record1, record2)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Record copied'
    elif requestType == 'recMerge':
        merge_record(record1, record2, merge_conflicting_fields=True)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records merged'
    elif requestType == 'recMergeNC':
        merge_record(record1, record2, merge_conflicting_fields=False)
        result['resultHtml'] = bibmerge_templates.BM_html_all_diff(
            record1, record2)
        result['resultText'] = 'Records merged'
    else:
        result['resultCode'], result['resultText'] = 1, 'Wrong request type'
    return result
def add_other_id(other_id=None, doi="", eprint="", recid=None,
                 system_number=None, reportnumbers=None, all_recids=None):
    """Search and match using given identifiers.

    Tries, in order: the claimed recid, the arXiv eprint, the DOI, the
    report numbers and (on CERN sites) the system number. On a unique
    match, returns MARCXML that rewrites the record's 035/595 fields to
    include the other site's id; on an ambiguous match, returns
    [other_id, matched recids...]; otherwise returns None.
    """
    query = ""
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None:
        # Trust the remote site's claimed recid only if it exists here.
        query = "existing recid"
        try:
            recid = int(recid)
        except ValueError:
            recid = None
        if recid and recid not in all_recids:
            write_message(
                "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
                % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
                stream=sys.stderr)
            recid = None
    if recid is None and eprint:
        query = 'oai:arXiv.org:%s' % (eprint, )
        arxiv_ids = search_pattern(p=query, f='035__a', m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query,
                   arxiv_ids),
                stream=sys.stderr)
            return [other_id] + list(arxiv_ids)
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        query = 'doi:"%s"' % doi
        doi_ids = search_pattern(p=query) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query, doi_ids),
                stream=sys.stderr)
            return [other_id] + list(doi_ids)
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        # query string is only used for the log message below; the actual
        # search is per report number, OR-ed via set union.
        query = "037__a:" + " OR 037__a:".join(reportnumbers)
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query,
                   reportnumbers_ids),
                stream=sys.stderr)
            return [other_id] + list(reportnumbers_ids)
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid is None and system_number and CFG_CERN_SITE:
        query = "035:%s 035:SPIRES" % (system_number, )
        system_number_ids = search_pattern(p=query)
        system_number_ids &= all_recids
        if len(system_number_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via %s: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, query,
                   system_number_ids),
                stream=sys.stderr)
            return [other_id] + list(system_number_ids)
        elif len(system_number_ids) == 1:
            recid = system_number_ids[0]
    if recid:
        recid = int(recid)
        record = get_record(recid)
        # Check whether the record already points back via 035; bail out
        # (return None) if it points to the same id and needs no change.
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                stored_recid = subfields.get('a', 0)
                try:
                    stored_recid = int(stored_recid)
                except ValueError:
                    # Not an integer, we move on and add the new ID.
                    continue
                if stored_recid and int(stored_recid) != int(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid,
                           stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                if CFG_INSPIRE_SITE and int(other_id) not in CERN_IDS:
                    write_message(
                        "INFO: ID was found in 035 but the record is not core CERN hence it should be moved into 595"
                    )
                else:
                    return
        if CFG_INSPIRE_SITE:
            # Same back-pointer check for the non-public 595 CDS-xxx form.
            fields = record_get_field_instances(record, '595')
            for field in fields:
                subfields = dict(field_get_subfield_instances(field))
                if "CDS" in subfields.get('a', '').upper():
                    stored_recid = subfields.get('a', 0).split("-")[-1]
                    try:
                        stored_recid = int(stored_recid)
                    except ValueError:
                        # Not an integer, we move on and add the new ID.
                        continue
                    if stored_recid and int(stored_recid) != int(other_id):
                        write_message(
                            "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                            % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE,
                               recid, stored_recid, CFG_OTHER_SITE),
                            stream=sys.stderr)
                    if int(other_id) in CERN_IDS:
                        write_message(
                            "INFO: ID was found in 595 but the record is core CERN hence it should be moved into 035"
                        )
                    else:
                        return
        write_message("Matched {1}/{0} to {3}/{2} with {4}".format(
            other_id, CFG_OTHER_URL, recid, CFG_THIS_URL, query))
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)
        # Let's filter out previous values in 035/595
        for field in record_get_field_instances(record, '035'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != str(
                    other_id) or subfields_dict.get('9') != CFG_OTHER_SITE:
                record_add_field(rec, '035', subfields=subfields)
        for field in record_get_field_instances(record, '595'):
            subfields = field_get_subfield_instances(field)
            subfields_dict = dict(subfields)
            if subfields_dict.get('a') != "CDS-{0}".format(
                    other_id) or subfields_dict.get('9') != 'CERN':
                record_add_field(rec, '595', subfields=subfields)
        if CFG_INSPIRE_SITE:
            if int(other_id) in CERN_IDS:
                write_message("CERN relevant paper: adding 035")
                record_add_field(rec, '035', ind1=' ', ind2=' ',
                                 subfields=(('9', CFG_OTHER_SITE),
                                            ('a', other_id)))
            else:
                write_message("Non-CERN relevant paper: adding 595")
                record_add_field(rec, '595', ind1=' ', ind2=' ',
                                 subfields=(('9', "CERN"),
                                            ('a',
                                             "CDS-{0}".format(other_id))))
        else:
            record_add_field(rec, '035', ind1=' ', ind2=' ',
                             subfields=(('9', CFG_OTHER_SITE),
                                        ('a', other_id)))
        return record_xml_output(rec)
def _get_formated_record(record_id, output_format, update_commands, language,
                         outputTags="", checked=True,
                         displayed_records=None):
    """Returns a record in a given format

    @param record_id: the ID of record to format
    @param output_format: an output format code (or short identifier for the output format)
    @param update_commands: list of commands used to update record contents
    @param language: the language to use to format the record
    @param outputTags: the tags to be shown to the user
    @param checked: is the record checked by the user?
    @param displayed_records: records to be displayed on a given page

    @returns: record formated to be displayed or None
    """
    if update_commands and checked:
        # Modify the bibrecord object with the appropriate actions
        updated_record = _get_updated_record(record_id, update_commands)
    textmarc_options = {
        "aleph-marc": 0,
        "correct-mode": 1,
        "append-mode": 0,
        "delete-mode": 0,
        "insert-mode": 0,
        "replace-mode": 0,
        "text-marc": 1
    }
    # NOTE(review): with the default displayed_records=None this
    # membership test raises TypeError — callers appear to always pass a
    # container; confirm and consider guarding against None.
    if record_id not in displayed_records:
        return
    old_record = search_engine.get_record(recid=record_id)
    old_record_textmarc = xmlmarc2textmarc.create_marc_record(
        old_record, sysno="", options=textmarc_options)
    if "hm" == output_format:
        if update_commands and checked:
            # Show a diff between the stored and the updated textmarc.
            updated_record_textmarc = xmlmarc2textmarc.create_marc_record(
                updated_record, sysno="", options=textmarc_options)
            result = _get_record_diff(old_record_textmarc,
                                      updated_record_textmarc, outputTags,
                                      record_id)
        else:
            # Plain textmarc view, optionally filtered to outputTags.
            filter_tags = "All tags" not in outputTags and outputTags
            result = ['<pre>']
            for line in old_record_textmarc.splitlines():
                if not filter_tags or line.split()[0].replace(
                        '_', '') in outputTags:
                    result.append("%09d " % record_id + line.strip())
            result.append('</pre>')
            result = '\n'.join(result)
    else:
        if update_commands and checked:
            # No coloring of modifications in this case
            xml_record = bibrecord.record_xml_output(updated_record)
        else:
            xml_record = bibrecord.record_xml_output(old_record)
        result = bibformat.format_record(recID=None,
                                         of=output_format,
                                         xml_record=xml_record,
                                         ln=language)
    return result
def perform_request_record(requestType, uid, data):
    """Handle 'major' record related requests.

    Handles retrieving records for comparison, submitting or cancelling
    the merging session, and the copy/merge operations between the master
    record (recID1) and the slave record (recID2 or a duplicate).
    """
    #TODO add checks before submission and cancel, replace get_bibrecord call
    response = {'resultCode': 0, 'resultText': ''}

    master_recid = data["recID1"]
    master_record = _get_record(master_recid, uid, response)
    if response['resultCode'] != 0:
        # master record not accessible: propagate the error information
        return response

    if requestType == 'submit':
        if data.has_key('duplicate'):
            slave_recid = data['duplicate']
            slave_record = _get_record_slave(slave_recid, response, 'recid', uid)
            if response['resultCode'] != 0:
                # slave record not accessible: propagate the error information
                return response
            # mark the slave record as deleted ...
            record_add_field(slave_record, '980', ' ', ' ', '', [('c', 'DELETED')])
            # ... and as a duplicate of the master record
            record_add_field(slave_record, '970', ' ', ' ', '', [('d', str(master_recid))])
            # submit the slave record
            save_xml_record(slave_recid, uid, record_xml_output(slave_record))
        # submit the master record
        save_xml_record(master_recid, uid)
        response['resultText'] = 'Record submitted'
        return response

    if requestType == 'cancel':
        delete_cache_file(master_recid, uid)
        response['resultText'] = 'Cancelled'
        return response

    # All remaining request types also need the slave record.
    slave_recid = data["recID2"]
    slave_record = _get_record_slave(slave_recid, response, data['record2Mode'], uid)
    if response['resultCode'] != 0:
        # slave record not accessible: propagate the error information
        return response

    if requestType == 'getRecordCompare':
        response['resultText'] = 'Records compared'
    elif requestType == 'recCopy':
        copy_R2_to_R1(master_record, slave_record)
        response['resultText'] = 'Record copied'
    elif requestType == 'recMerge':
        merge_record(master_record, slave_record, merge_conflicting_fields=True)
        response['resultText'] = 'Records merged'
    elif requestType == 'recMergeNC':
        merge_record(master_record, slave_record, merge_conflicting_fields=False)
        response['resultText'] = 'Records merged'
    else:
        response['resultCode'], response['resultText'] = 1, 'Wrong request type'
        return response

    # Every successful compare/copy/merge renders the side-by-side diff.
    response['resultHtml'] = bibmerge_templates.BM_html_all_diff(master_record,
                                                                slave_record)
    return response
def format_element(bfo, oai=0):
    """Produce MARCXML with enhanced fields.

    Adds 100/700 $x with Record ID of linked HepName,
         701/702 $y with True/False if the signature is claimed
                 $z with Record ID of institution
                 $w with BAI of linked Profile
         371/110 $z with Record ID of institution
         119/502 $z with Record ID of institution
         999C5 $0 with on the fly discovered Record IDs (not for books)
         773 $0 with Record ID of corresponding Book or Proceeding or Report
             $1 with Record ID of corresponding Journal
             $2 with Record ID of corresponding Conference
         693/710 $0 with Record ID of corresponding experiment

    @param bfo: BibFormat object for the record being formatted
    @param oai: when true, the output is namespaced for OAI-PMH
        (marc: prefixes and a leader are injected by string replacement)
    """
    # Only users allowed to run bibedit get hidden fields and FFT info.
    can_see_hidden_stuff = not acc_authorize_action(bfo.user_info,
                                                    'runbibedit')[0]
    recid = bfo.recID
    if can_see_hidden_stuff and is_record_deleted(bfo):
        # Deleted records are recovered from the history tables.
        record = salvage_deleted_record_from_history(recid)
    else:
        record = bfo.get_record()

    # Let's filter hidden fields
    if can_see_hidden_stuff:
        # Let's add bibdoc info
        bibrecdocs = BibRecDocs(recid)
        for bibdocfile in bibrecdocs.list_latest_files():
            fft = [
                ('a', bibdocfile.fullpath),
                ('d', bibdocfile.description or ''),
                ('f', bibdocfile.format or ''),
                ('n', bibdocfile.name or ''),
                ('r', bibdocfile.status or ''),
                ('s', bibdocfile.cd.strftime('%Y-%m-%d %H:%M:%S')),
                ('t', bibdocfile.bibdoc.doctype),
                ('v', str(bibdocfile.version)),
                ('z', bibdocfile.comment or ''),
            ]
            for flag in bibdocfile.flags:
                fft.append(('o', flag))
            record_add_field(record, 'FFT', subfields=fft)
    else:
        # not authorized
        for tag in CFG_BIBFORMAT_HIDDEN_TAGS:
            if tag in record:
                del record[tag]

    # Is this record itself an institution record? (drives 110 handling below)
    is_institution = 'INSTITUTION' in [
        collection.upper() for collection in bfo.fields('980__a')
    ]

    signatures = {}
    if '100' in record or '700' in record:
        # Map author name -> (personid, claim flag) for this record.
        signatures = dict((
            name, (personid, flag)
        ) for name, personid, flag in run_sql(
            "SELECT name, personid, flag FROM aidPERSONIDPAPERS WHERE bibrec=%s AND flag>-2",
            (recid, )))

    # Let's add signatures
    for field in record_get_field_instances(
            record, '100') + record_get_field_instances(
                record, '700') + record_get_field_instances(
                    record, '701') + record_get_field_instances(record, '702'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            author_name = subfield_dict['a']
            personid, flag = signatures.get(author_name, (None, None))
            bai = get_personid_canonical_id().get(personid)
            # NOTE(review): $x (HepName id) and $y (claimed flag) are only
            # appended when a BAI exists — confirm this nesting is intended.
            if bai:
                subfields.append(('w', bai))
                hepname_id = get_hepname_id(personid)
                if hepname_id:
                    subfields.append(('x', '%i' % hepname_id))
                # flag == 2 means the signature is claimed
                subfields.append(('y', '%i' % (flag == 2)))
        # And matched affiliations
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    # Only annotate unambiguous (single) matches.
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Thesis institution
    for field in record_get_field_instances(record, '502'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'c' in subfield_dict:
            for code, value in subfields:
                if code == 'c':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Related institution
    for field in record_get_field_instances(record, '510'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Related journal
    for field in record_get_field_instances(record, '530'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict and not '0' in subfield_dict:
            ids = get_institution_ids(subfield_dict['a'])
            if len(ids) == 1:
                subfields.append(('0', '%i' % ids[0]))

    # Enhance affiliation in Experiments
    for field in record_get_field_instances(record, '119'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'u' in subfield_dict:
            for code, value in subfields:
                if code == 'u':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    # Enhance affiliation in HepNames and Jobs and Institutions and
    # naked affiliations in HEP
    for field in record_get_field_instances(
            record, '371') + record_get_field_instances(record, '902'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if 'a' in subfield_dict:
            for code, value in subfields:
                if code == 'a':
                    ids = get_institution_ids(value)
                    if len(ids) == 1:
                        subfields.append(('z', '%i' % ids[0]))

    for field in record_get_field_instances(record, '110'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if is_institution:
            # We try to resolve obsolete ICNs
            if 'x' in subfield_dict:
                for code, value in subfields:
                    if code == 'x':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))
        else:
            # In other collections institution is in a
            if 'a' in subfield_dict:
                for code, value in subfields:
                    if code == 'a':
                        ids = get_institution_ids(value)
                        if len(ids) == 1:
                            subfields.append(('z', '%i' % ids[0]))

    # Enhance citation
    for field in record_get_field_instances(record, '999', ind1='C',
                                            ind2='5'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        if '0' in subfield_dict:
            # Already available recid
            subfields.append(('z', '1'))
        else:
            matched_id = get_matched_id(subfields)
            if matched_id:
                subfields.append(('0', str(matched_id)))

    # Enhance related records
    for field in (
            record_get_field_instances(record, '780', ind1='0', ind2='2') +
            record_get_field_instances(record, '785', ind1='0', ind2='2') +
            record_get_field_instances(record, '787', ind1='0', ind2='8')):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        subfield_citation = []
        if subfield_dict.get('r'):
            # Reportnumber
            subfield_citation.append(('r', subfield_dict['r']))
        if subfield_dict.get('z'):
            # ISBN
            subfield_citation.append(('i', subfield_dict['z']))
        if 'w' not in subfield_dict and subfield_citation:
            matched_id = get_matched_id(subfield_citation)
            if matched_id:
                subfields.append(('w', str(matched_id)))

    # Enhance CNUMs and Journals
    for field in record_get_field_instances(record, '773'):
        subfields = field_get_subfield_instances(field)
        subfield_dict = dict(subfields)
        for code, value in subfields:
            if code == 'w':
                # Conference CNUMs
                recids = perform_request_search(p='111__g:"%s"' % value,
                                                cc='Conferences')
                if len(recids) == 1:
                    subfields.append(('2', str(recids.pop())))
                if '0' not in subfield_dict:
                    recids = perform_request_search(
                        p='773__w:"%s" 980:PROCEEDINGS' % value)
                    if recid in recids:
                        # We remove this very record, since it can be a proceedings
                        recids.remove(recid)
                    if len(recids) == 1:
                        subfields.append(('0', str(recids.pop())))
            elif code == 'p':
                # Journal title
                recids = perform_request_search(p='711__a:"%s"' % value,
                                                cc='Journals')
                if len(recids) == 1:
                    subfields.append(('1', str(recids.pop())))
            elif code == 'z' and '0' not in subfield_dict:
                # ISBN
                recids = find_isbn({'ISBN': value})
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'r' and '0' not in subfield_dict:
                # Report
                recids = perform_request_search(p='reportnumber:"%s"' % value)
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '693'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'e':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))
            elif code == 'a':
                recids = perform_request_search(p='119__b:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Enhance Experiments
    for field in record_get_field_instances(record, '710'):
        subfields = field_get_subfield_instances(field)
        for code, value in subfields:
            if code == 'g':
                recids = perform_request_search(p='119__a:"%s"' % value,
                                                cc='Experiments')
                if len(recids) == 1:
                    subfields.append(('0', str(recids.pop())))

    # Add Creation date:
    if '961' in record:
        del record['961']
    creation_date, modification_date = run_sql(
        "SELECT creation_date, modification_date FROM bibrec WHERE id=%s",
        (recid, ))[0]
    record_add_field(record, '961',
                     subfields=[('x', creation_date.strftime('%Y-%m-%d')),
                                ('c', modification_date.strftime('%Y-%m-%d'))])

    formatted_record = record_xml_output(record)
    if oai:
        # Rewrite the plain MARCXML into marc:-prefixed, schema-located XML
        # suitable for OAI-PMH output, injecting a leader.
        formatted_record = formatted_record.replace(
            "<record>",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>"
        )
        formatted_record = formatted_record.replace(
            "<record xmlns=\"http://www.loc.gov/MARC21/slim\">",
            "<marc:record xmlns:marc=\"http://www.loc.gov/MARC21/slim\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd\" type=\"Bibliographic\">\n <marc:leader>00000coc 2200000uu 4500</marc:leader>"
        )
        formatted_record = formatted_record.replace("</record",
                                                    "</marc:record")
        formatted_record = formatted_record.replace("<controlfield",
                                                    "<marc:controlfield")
        formatted_record = formatted_record.replace("</controlfield",
                                                    "</marc:controlfield")
        formatted_record = formatted_record.replace("<datafield",
                                                    "<marc:datafield")
        formatted_record = formatted_record.replace("</datafield",
                                                    "</marc:datafield")
        formatted_record = formatted_record.replace("<subfield",
                                                    "<marc:subfield")
        formatted_record = formatted_record.replace("</subfield",
                                                    "</marc:subfield")
    return formatted_record
def add_other_id(other_id=None, doi="", eprint="", recid=None,
                 reportnumbers=None, all_recids=None):
    """Return MARCXML linking a local record to its counterpart on the
    other site.

    Tries to resolve the local ``recid`` (validating a claimed one first,
    then matching by arXiv eprint, DOI and report numbers, in that order)
    and, on an unambiguous match, returns the MARCXML of a correction
    record carrying an 035 field ``$9 CFG_OTHER_SITE / $a other_id``.

    @param other_id: record id on the other site (CFG_OTHER_SITE)
    @param doi: DOI to match against the local 'doi' index
    @param eprint: arXiv eprint id to match against 035__a
    @param recid: local record id claimed by the other site, if any
    @param reportnumbers: report numbers to match against 037__a
    @param all_recids: intbitset of all valid local recids
        (fetched via get_all_recids() when not provided)
    @returns: MARCXML string, or None when no (unambiguous) match exists
        or an inconsistency was detected (logged to stderr).
    """
    if all_recids is None:
        all_recids = get_all_recids()
    if reportnumbers is None:
        reportnumbers = []
    if recid is not None and recid not in all_recids:
        # The claimed recid does not exist locally: distrust it and fall
        # back to matching by identifiers.
        write_message(
            "WARNING: %s thought that their record %s had recid %s in %s but this seems wrong"
            % (CFG_OTHER_SITE, other_id, recid, CFG_THIS_SITE),
            stream=sys.stderr)
        recid = None
    if recid is None and eprint:
        arxiv_ids = search_pattern(
            p='oai:arXiv.org:%s' % (eprint, ), f='035__a',
            m='e') & all_recids
        if len(arxiv_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via arXiv eprint matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, arxiv_ids),
                stream=sys.stderr)
            return
        elif len(arxiv_ids) == 1:
            recid = arxiv_ids[0]
    if recid is None and doi:
        doi_ids = search_pattern(p='doi:"%s"' % doi) & all_recids
        if len(doi_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via DOI matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, doi_ids),
                stream=sys.stderr)
            return
        elif len(doi_ids) == 1:
            recid = doi_ids[0]
    if recid is None and reportnumbers:
        reportnumbers_ids = intbitset()
        for rn in reportnumbers:
            reportnumbers_ids |= search_pattern(p=rn, f='037__a', m='e')
        # BUGFIX: was `&= all_recids()` — all_recids is an intbitset, not a
        # callable; calling it raised TypeError. Intersect with the set
        # itself, as the eprint/DOI branches above do.
        reportnumbers_ids &= all_recids
        if len(reportnumbers_ids) > 1:
            write_message(
                "ERROR: %s record %s matches more than one record in %s via reportnumber matching: %s"
                % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE,
                   reportnumbers_ids),
                stream=sys.stderr)
            return
        elif len(reportnumbers_ids) == 1:
            recid = reportnumbers_ids[0]
    if recid:
        # Check any pre-existing 035 pointing at the other site for
        # consistency before emitting a new one.
        record = get_record(recid)
        fields = record_get_field_instances(record, '035')
        for field in fields:
            subfields = dict(field_get_subfield_instances(field))
            if CFG_OTHER_SITE.upper() == subfields.get('9', '').upper():
                # NOTE(review): assumes 035__a holds a numeric id here —
                # int() would raise on a non-numeric value; confirm inputs.
                stored_recid = int(subfields.get('a', 0))
                # BUGFIX: compare as strings — other_id may arrive as a
                # string while stored_recid is an int, and in Python 2
                # `int != str` is always True, so a correctly-stored link
                # would have been (wrongly) reported as conflicting.
                if stored_recid and str(stored_recid) != str(other_id):
                    write_message(
                        "ERROR: %s record %s matches %s record %s which already points back to a different record %s in %s"
                        % (CFG_OTHER_SITE, other_id, CFG_THIS_SITE, recid,
                           stored_recid, CFG_OTHER_SITE),
                        stream=sys.stderr)
                    return
        rec = {}
        record_add_field(rec, '001', controlfield_value='%s' % recid)
        record_add_field(rec, '035', ind1=' ', ind2=' ',
                         subfields=(('9', CFG_OTHER_SITE), ('a', other_id)))
        return record_xml_output(rec)
## Let's tag this record as a TWEET so that later we can build a collection ## out of these records. record_add_field(rec, '980', subfields=[('a', 'TWEET'), ('b', query)]) ## Some smart manipulations: let's parse out URLs and tags from the body ## of the Tweet. for url in _RE_GET_HTTP.findall(text): url = url[0] record_add_field(rec, '856', '4', subfields=[('u', url)]) for tag in _RE_TAGS.findall(text): ## And here we add the keywords. record_add_field(rec, '653', '1', subfields=[('a', tag), ('9', 'TWITTER')]) ## Finally we shall serialize everything to MARCXML return record_xml_output(rec) def bst_twitter_fetcher(query): """ Fetch the tweets related to the user and upload them into Invenio. @param user: the user """ ## We prepare a temporary MARCXML file to upload. fd, name = tempfile.mkstemp(suffix='.xml', prefix='tweets', dir=CFG_TMPDIR) tweets = get_tweets(query) if tweets: os.write(fd, """<collection>\n""") for i, tweet in enumerate(tweets): ## For every tweet we transform it to MARCXML and we dump it in the file. task_update_progress('DONE: tweet %s out %s' % (i, len(tweets))) os.write(fd, tweet_to_record(tweet, query))
def oairepositoryupdater_task():
    """Main business logic code of oai_archive

    Recomputes which records belong to which OAI sets, assigns missing
    OAI identifiers, writes the resulting corrections as MARCXML to
    temporary files (rotated every CFG_OAI_REPOSITORY_MARCXML_SIZE
    records) and, unless --no_upload was given, schedules bibupload
    correction tasks for them.  Returns True on completion.
    """
    no_upload = task_get_option("no_upload")
    report = task_get_option("report")

    if report > 1:
        # Report-only mode: print the repository status and stop.
        print_repository_status(verbose=report)
        return True

    # Snapshot the set definitions at start (debug aid only).
    initial_snapshot = {}
    for set_spec in all_set_specs():
        initial_snapshot[set_spec] = get_set_definitions(set_spec)
    write_message("Initial set snapshot: %s" % pformat(initial_snapshot),
                  verbose=2)

    task_update_progress("Fetching records to process")

    # Records that already carry an OAI identifier.
    recids_with_oaiid = search_unit_in_bibxxx(p='*', f=CFG_OAI_ID_FIELD,
                                              type='e')
    write_message("%s recids have an OAI ID" % len(recids_with_oaiid),
                  verbose=2)

    # Records currently exported in at least one set.
    all_current_recids = search_unit_in_bibxxx(p='*', f=CFG_OAI_SET_FIELD,
                                               type='e')
    no_more_exported_recids = intbitset(all_current_recids)
    write_message("%s recids are currently exported" %
                  (len(all_current_recids)), verbose=2)

    all_affected_recids = intbitset()
    all_should_recids = intbitset()
    recids_for_set = {}
    for set_spec in all_set_specs():
        if not set_spec:
            set_spec = CFG_OAI_REPOSITORY_GLOBAL_SET_SPEC
        should_recids = get_recids_for_set_spec(set_spec)
        recids_for_set[set_spec] = should_recids
        # Anything that should still be exported is not "no more exported".
        no_more_exported_recids -= should_recids
        all_should_recids |= should_recids
        current_recids = search_unit_in_bibxxx(p=set_spec,
                                               f=CFG_OAI_SET_FIELD, type='e')
        write_message(
            "%s recids should be in %s. Currently %s are in %s" %
            (len(should_recids), set_spec, len(current_recids), set_spec),
            verbose=2)
        to_add = should_recids - current_recids
        write_message("%s recids should be added to %s" %
                      (len(to_add), set_spec), verbose=2)
        to_remove = current_recids - should_recids
        write_message("%s recids should be removed from %s" %
                      (len(to_remove), set_spec), verbose=2)
        affected_recids = to_add | to_remove
        write_message("%s recids should be hence updated for %s" %
                      (len(affected_recids), set_spec), verbose=2)
        all_affected_recids |= affected_recids

    missing_oaiid = all_should_recids - recids_with_oaiid
    write_message("%s recids are missing an oaiid" % len(missing_oaiid))
    write_message("%s recids should no longer be exported" %
                  len(no_more_exported_recids))

    ## Let's add records with missing OAI ID
    all_affected_recids |= missing_oaiid | no_more_exported_recids
    write_message("%s recids should updated" % (len(all_affected_recids)),
                  verbose=2)

    if not all_affected_recids:
        write_message("Nothing to do!")
        return True

    # Prepare to save results in a tmp file
    (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                             prefix='oairepository_' + \
                             time.strftime("%Y%m%d_%H%M%S_",
                                           time.localtime()))
    oai_out = os.fdopen(fd, "w")
    oai_out.write("<collection>")
    tot = 0

    # Iterate over the recids
    for i, recid in enumerate(all_affected_recids):
        task_sleep_now_if_required(can_stop_too=True)
        task_update_progress("Done %s out of %s records." % \
                             (i, len(all_affected_recids)))

        write_message("Elaborating recid %s" % recid, verbose=3)
        record = get_record(recid)
        if not record:
            write_message("Record %s seems empty. Let's skip it." % recid,
                          verbose=3)
            continue
        new_record = {}

        # Check if an OAI identifier is already in the record or
        # not.
        assign_oai_id_entry = False
        oai_id_entry = record_get_field_value(record,
                                              tag=CFG_OAI_ID_FIELD[:3],
                                              ind1=CFG_OAI_ID_FIELD[3],
                                              ind2=CFG_OAI_ID_FIELD[4],
                                              code=CFG_OAI_ID_FIELD[5])
        if not oai_id_entry:
            assign_oai_id_entry = True
            oai_id_entry = "oai:%s:%s" % (CFG_OAI_ID_PREFIX, recid)
            write_message("Setting new oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)
        else:
            write_message("Already existing oai_id %s for record %s" %
                          (oai_id_entry, recid), verbose=3)

        # Get the sets to which this record already belongs according
        # to the metadata
        current_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_SET_FIELD[:3],
                                    ind1=CFG_OAI_SET_FIELD[3],
                                    ind2=CFG_OAI_SET_FIELD[4],
                                    code=CFG_OAI_SET_FIELD[5]))
        write_message("Record %s currently belongs to these oai_sets: %s" %
                      (recid, ", ".join(current_oai_sets)), verbose=3)

        current_previous_oai_sets = set(
            record_get_field_values(record,
                                    tag=CFG_OAI_PREVIOUS_SET_FIELD[:3],
                                    ind1=CFG_OAI_PREVIOUS_SET_FIELD[3],
                                    ind2=CFG_OAI_PREVIOUS_SET_FIELD[4],
                                    code=CFG_OAI_PREVIOUS_SET_FIELD[5]))
        write_message(
            "Record %s currently doesn't belong anymore to these oai_sets: %s"
            % (recid, ", ".join(current_previous_oai_sets)), verbose=3)

        # Get the sets that should be in this record according to
        # settings
        updated_oai_sets = set(_set
                               for _set, _recids in recids_for_set.iteritems()
                               if recid in _recids)
        write_message("Record %s now belongs to these oai_sets: %s" %
                      (recid, ", ".join(updated_oai_sets)), verbose=3)

        # Sets the record used to belong to (kept in the "previous set"
        # field) so harvesters can learn about deletions from a set.
        updated_previous_oai_sets = set(
            _set for _set in (current_previous_oai_sets - updated_oai_sets) |
            (current_oai_sets - updated_oai_sets))
        write_message(
            "Record %s now doesn't belong anymore to these oai_sets: %s" %
            (recid, ", ".join(updated_previous_oai_sets)), verbose=3)

        # Ok, we have the old sets and the new sets. If they are equal
        # and oai ID does not need to be added, then great, nothing to
        # change . Otherwise apply the new sets.
        if current_oai_sets == updated_oai_sets and not assign_oai_id_entry:
            write_message("Nothing has changed for record %s, let's move on!"
                          % recid, verbose=3)
            continue # Jump to next recid

        write_message("Something has changed for record %s, let's update it!"
                      % recid, verbose=3)
        subfields = [(CFG_OAI_ID_FIELD[5], oai_id_entry)]
        for oai_set in updated_oai_sets:
            subfields.append((CFG_OAI_SET_FIELD[5], oai_set))
        for oai_set in updated_previous_oai_sets:
            subfields.append((CFG_OAI_PREVIOUS_SET_FIELD[5], oai_set))

        # Correction record: 001 + the single OAI datafield.
        record_add_field(new_record, tag="001",
                         controlfield_value=str(recid))
        record_add_field(new_record, tag=CFG_OAI_ID_FIELD[:3],
                         ind1=CFG_OAI_ID_FIELD[3],
                         ind2=CFG_OAI_ID_FIELD[4],
                         subfields=subfields)
        oai_out.write(record_xml_output(new_record))
        tot += 1
        if tot == CFG_OAI_REPOSITORY_MARCXML_SIZE:
            # Rotate the output file: close, submit, open a fresh one.
            oai_out.write("</collection>")
            oai_out.close()
            write_message("Wrote to file %s" % filename)
            if not no_upload:
                task_low_level_submission('bibupload', 'oairepository', '-c',
                                          filename, '-n')
            # Prepare to save results in a tmp file
            (fd, filename) = mkstemp(dir=CFG_TMPDIR,
                                     prefix='oairepository_' + \
                                     time.strftime("%Y%m%d_%H%M%S_",
                                                   time.localtime()))
            oai_out = os.fdopen(fd, "w")
            oai_out.write("<collection>")
            tot = 0
            task_sleep_now_if_required(can_stop_too=True)

    oai_out.write("</collection>")
    oai_out.close()
    write_message("Wrote to file %s" % filename)

    if not no_upload:
        task_sleep_now_if_required(can_stop_too=True)
        if tot > 0:
            task_low_level_submission('bibupload', 'oairepository', '-c',
                                      filename, '-n')
        else:
            # Last file ended up empty: nothing to upload, drop it.
            os.remove(filename)

    return True
Run as::

    python fix_8560.py > output.xml
    bibupload -c output.xml
"""

from invenio.search_engine import search_pattern, get_fieldvalues
from invenio.bibrecord import record_add_field, record_xml_output
from invenio.webuser import collect_user_info, get_uid_from_email

# All records having an 8560_f (submitter e-mail) field.
recids = search_pattern(p="0->Z", f="8560_f")

print "<collection>"
for recid in recids:
    # Get record information
    email = get_fieldvalues(recid, "8560_f")[0]
    if "<" in email:
        # Value looks like "Name <email>": keep only the address,
        # stripping the angle brackets.
        email = email.split()[-1][1:-1].strip()
    user_info = collect_user_info(get_uid_from_email(email))
    name = user_info.get("external_fullname", user_info.get("nickname", ""))
    # NOTE(review): external_id is computed but never used below — confirm
    # whether it was meant to be written into the correction as well.
    external_id = user_info.get("external_id", "")
    # Create correction for record: re-emit 8560 with the normalized
    # e-mail ($f) and the resolved full name ($y).
    rec = {}
    record_add_field(rec, "001", controlfield_value=str(recid))
    record_add_field(rec, '856', ind1='0',
                     subfields=[('f', email), ('y', name)])
    print record_xml_output(rec)
print "</collection>"
def _prepare_marcxml(recid_a, rn_a, recids_and_rns_b, what_is_a_for_b,
                     what_is_b_for_a, display_in_a=True, display_in_b=True,
                     marc_for_a=None, marc_for_b=None, upload_mode='append',
                     consider_empty_p=False):
    """Build the MARCXML collection linking record A with records B.

    Emits, inside a single <collection>, a correction for record A
    carrying one linking field per related record B (when
    what_is_b_for_a is given), plus one correction per record B pointing
    back at A (when what_is_a_for_b is given).  Each linking field holds
    $i relationship label, $r report number, $w recid.

    @param recid_a: record id of record A
    @param rn_a: report number of record A
    @param recids_and_rns_b: list of (recid, reportnumber) tuples for
        the related records B
    @param what_is_a_for_b: relationship label stored in B's fields
        (None to skip updating records B)
    @param what_is_b_for_a: relationship label stored in A's fields
        (None to skip updating record A)
    @param display_in_a, display_in_b: control the "hidden" indicator
        ("0" displayed / "1" hidden) chosen by _prepare_marc
    @param marc_for_a, marc_for_b: MARC tag specs for the linking fields
    @param upload_mode: 'append' or 'correct' bibupload mode
    @param consider_empty_p: when True, also emit empty linking fields /
        unlink corrections so 'correct' mode can erase removed links
    @returns: MARCXML string
    """
    output = '<collection>'
    record_a = {}
    record_b = {}
    if what_is_b_for_a is not None:
        marc_tag_for_a, marc_ind1_for_a, marc_ind2_for_a = \
                       _prepare_marc(marc_for_a, CFG_OTHER_RELATIONSHIP_ENTRY,
                                     display_in_a and "0" or "1")
        record_add_field(record_a, "001", controlfield_value=str(recid_a))
        if upload_mode == 'correct' and not recids_and_rns_b \
               and consider_empty_p:
            # Add empty field in order to account for cases where all
            # linkings are removed by the submitter
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a)
        for recid_b, rn_b in recids_and_rns_b:
            record_add_field(record_a, marc_tag_for_a, ind1=marc_ind1_for_a,
                             ind2=marc_ind2_for_a,
                             subfields=[('i', what_is_b_for_a),
                                        ('r', rn_b),
                                        ('w', str(recid_b))])
        output += record_xml_output(record_a)

    if what_is_a_for_b is not None:
        marc_tag_for_b, marc_ind1_for_b, marc_ind2_for_b = \
                       _prepare_marc(marc_for_b, CFG_OTHER_RELATIONSHIP_ENTRY,
                                     display_in_b and "0" or "1")
        for recid_b, rn_b in recids_and_rns_b:
            record_b = {}
            record_add_field(record_b, "001", controlfield_value=str(recid_b))
            if upload_mode == 'correct':
                # In 'correct' mode re-emit B's pre-existing linking
                # fields so bibupload does not wipe them out.
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
            record_add_field(record_b, marc_tag_for_b, ind1=marc_ind1_for_b,
                             ind2=marc_ind2_for_b,
                             subfields=[('i', what_is_a_for_b),
                                        ('r', rn_a),
                                        ('w', str(recid_a))])
            output += record_xml_output(record_b)

        # Remove linking in remote records where adequate
        if consider_empty_p:
            unlinked_recids = get_unlinked_records(recid_a, marc_for_b,
                                                   display_in_b, upload_mode,
                                                   recids_and_rns_b)
            for recid_b in unlinked_recids:
                record_b = {}
                record_add_field(record_b, "001",
                                 controlfield_value=str(recid_b))
                original_linking_fields = _get_record_linking_fields(
                    recid_b, recid_a, marc_tag_for_b, marc_ind1_for_b,
                    marc_ind2_for_b)
                if not original_linking_fields:
                    # Add empty field in order to account for cases where all
                    # linkings are removed by the submitter
                    record_add_field(record_b, marc_tag_for_b,
                                     ind1=marc_ind1_for_b,
                                     ind2=marc_ind2_for_b)
                record_add_fields(record_b, marc_tag_for_b,
                                  original_linking_fields)
                output += record_xml_output(record_b)

    output += '</collection>'
    return output