def start_bibupload_job(id_pairs):
    """
    Submits the append job to bibupload
    id_pairs - {local_recid: remote_recid}
    """
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    for local, remote in id_pairs.iteritems():
        bibupload.add(generate_marc_to_append(local, remote))
    bibupload.cleanup()  # This initiates the job
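The tasklet above only needs the mapping of local to remote record IDs; the chunking and job submission are handled by ChunkedBibUpload. A minimal sketch of driving it is shown below, assuming generate_marc_to_append and SCRIPT_NAME are already available in the module; the record IDs are hypothetical placeholders.

# Hypothetical driver for start_bibupload_job; the record IDs below are
# placeholders, not real records.
id_pairs = {
    1111111: 2222222,   # local_recid: remote_recid
    1111112: 2222223,
}
start_bibupload_job(id_pairs)  # queues one append-mode bibupload job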
def apply_hepnames_updates(hepname_updates):
    bibupload = ChunkedBibUpload(mode='a', user='******')
    for recid, entry in hepname_updates.iteritems():
        record = {}
        record_add_field(record, '001', controlfield_value=str(recid))
        for key, value in entry.iteritems():
            if key in ('ORCID', 'ORIGINAL_BAI', 'INSPIRE', 'KAKEN'):
                if key == 'ORIGINAL_BAI':
                    key = 'BAI'
                record_add_field(record, '035', subfields=[('a', value), ('9', key)])
        write_message(record_xml_output(record))
        bibupload.add(record_xml_output(record))
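Judging from the keys the loop recognizes, hepname_updates maps a HepNames record ID to a dict of external identifiers. A hypothetical input could look like the sketch below; the record ID and identifier values are made up for illustration.

# Hypothetical hepname_updates structure; IDs and values are placeholders.
hepname_updates = {
    1234567: {
        'ORCID': '0000-0002-1825-0097',
        'ORIGINAL_BAI': 'J.Doe.1',  # written to 035__9 as 'BAI'
        'KAKEN': '70123456',
    },
}
apply_hepnames_updates(hepname_updates)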
def bst_hepnames_orcid_sync():
    bai_orcids = run_sql(
        "SELECT bai.data, orcid.data "
        "FROM aidPERSONIDDATA as bai "
        "JOIN aidPERSONIDDATA as orcid ON bai.personid=orcid.personid "
        "WHERE orcid.tag='extid:ORCID' AND bai.tag='canonical_name'")
    recs = []
    not_matched_profiles = 0
    enhanced_records = 0
    conflicting_orcids = 0
    for bai, orcid in bai_orcids:
        recids = perform_request_search(p="035:%s" % bai, cc="HepNames")
        if len(recids) > 1:
            write_message("WARNING: %s/author/profile/%s, %s matches more than one HepNames: %s" % (CFG_SITE_URL, bai, orcid, recids), stream=sys.stderr)
            not_matched_profiles += 1
        elif not recids:
            write_message("WARNING: %s/author/profile/%s, %s does not match any HepName" % (CFG_SITE_URL, bai, orcid), stream=sys.stderr)
            not_matched_profiles += 1
        else:
            recid = recids[0]
            record = get_record(recid)
            for field in record_get_field_instances(record, tag="035"):
                subfields = field_get_subfield_instances(field)
                subfields_dict = dict(subfields)
                if subfields_dict.get('9') == 'ORCID':
                    if subfields_dict.get('a') != orcid:
                        if not subfields_dict.get('a', '').strip():
                            write_message("WARNING: record %s/record/%s has an empty ORCID" % (CFG_SITE_URL, recid), stream=sys.stderr)
                            continue
                        write_message("WARNING: record %s/record/%s matched by BAI %s/author/profile/%s has a different ORCID %s than the profile one: %s" % (CFG_SITE_URL, recid, CFG_SITE_URL, bai, subfields_dict.get('a'), orcid), stream=sys.stderr)
                        conflicting_orcids += 1
                    break
            else:
                new_record = {}
                record_add_field(new_record, tag="001", controlfield_value=str(recid))
                record_add_field(new_record, tag="035", subfields=[('a', orcid), ('9', 'ORCID')])
                recs.append(new_record)
                write_message("INFO: adding ORCID %s to record %s/record/%s matched by BAI %s/author/profile/%s" % (orcid, CFG_SITE_URL, recid, CFG_SITE_URL, bai))
                enhanced_records += 1
    if recs:
        write_message("INFO: initiating uploads")
        bibupload = ChunkedBibUpload(mode="a", user='******')
        for record in recs:
            bibupload.add(record_xml_output(record))
        bibupload.cleanup()
    else:
        write_message("INFO: no modifications are necessary")
    write_message("INFO: not_matched_profiles: %s, enhanced_records: %s, conflicting_orcids: %s" % (not_matched_profiles, enhanced_records, conflicting_orcids))
def bst_hepdata():
    uploader = ChunkedHepDataUpload()
    dumper = HepDataDumper()
    for record in dumper:
        marcxml_record = hepdata2marcxml(record)
        uploader.add(marcxml_record)
    inspire_ids = dumper.inspire_ids
    current_inspire_ids = intbitset(perform_request_search(p='035__9:HEPDATA'))
    records_to_amend = inspire_ids - current_inspire_ids
    id_appender = ChunkedBibUpload(mode='a', user='******')
    for recid in records_to_amend:
        rec = {}
        record_add_field(rec, tag="001", controlfield_value=str(recid))
        record_add_field(rec, tag="035", subfields=[('a', 'ins%s' % recid), ('9', 'HEPDATA')])
        id_appender.add(record_xml_output(rec))
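For each record that is missing the HEPDATA tag, id_appender uploads a small append-mode record carrying only the 001 controlfield and the new 035 field. Roughly, the MARCXML produced by record_xml_output for a hypothetical record ID would look like the sketch below; the exact whitespace and attribute layout depend on the bibrecord version in use.

# Approximate MARCXML sent by id_appender for a hypothetical recid 1234:
#
# <record>
#   <controlfield tag="001">1234</controlfield>
#   <datafield tag="035" ind1=" " ind2=" ">
#     <subfield code="a">ins1234</subfield>
#     <subfield code="9">HEPDATA</subfield>
#   </datafield>
# </record>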
def bst_hal():
    doi_map, arxiv_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    tot_records = matchable_records - hal_records
    write_message("Records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map]
        # Let's assert that we matched at most one HAL document
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal) > 1:
            write_message("WARNING: record %s matches more than 1 HAL record: %s" % (recid, matched_hal), stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']
        rec = {}
        record_add_field(rec, '001', controlfield_value=str(recid))
        record_add_field(rec, '035', subfields=[('a', hal_id), ('9', 'HAL')])
        write_message("Record %s matched HAL record %s" % (recid, hal_id))
        bibupload.add(record_xml_output(rec))
    return True
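get_hal_maps is not shown here, but from the way its results are used, doi_map and arxiv_map map a DOI or arXiv identifier to the HAL metadata dict whose halId_s value ends up in the 035 field. A hypothetical entry might look like this (all identifiers below are placeholders):

# Hypothetical shape of the HAL lookup maps; all identifiers are placeholders.
doi_map = {
    '10.1000/example.123': {'halId_s': 'hal-01234567'},
}
arxiv_map = {
    'arXiv:1234.5678': {'halId_s': 'hal-07654321'},
}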
def bst_labssync():
    """
    Synchronizes from Labs via redis.
    """
    r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
    user_agent = make_user_agent_string('labssync')
    s = requests.Session()
    s.headers['User-Agent'] = user_agent
    s.headers['Accept'] = 'application/marcxml+xml'
    tot = r.scard(CFG_REDIS_KEY)
    if tot == 0:
        write_message("Nothing to do")
        return
    else:
        write_message("At least %s records to synchronize from labs" % tot)
    errors = []
    final_total = 0
    uploader = ChunkedBibUpload(mode='r', user='******')
    while True:
        elem = r.spop(CFG_REDIS_KEY)
        if not elem:
            break
        final_total += 1
        try:
            record = s.get("https://%s/api/%s" % (CFG_LABS_HOSTNAME, elem)).text
            # Let's strip collection/XML header
            record = record_xml_output(create_record(record)[0])
            uploader.add(record)
            task_sleep_now_if_required()
        except Exception as err:
            register_exception()
            write_message("ERROR: when retrieving %s: %s" % (elem, err), stream=sys.stderr)
            errors.append(elem)
    write_message("Finally synced %s records from labs" % final_total)
    if errors:
        write_message("All those %s records had errors and might need to be resynced: %s" % (len(errors), ', '.join(errors)))
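The producing side of this sync is not shown; from the consumer above it presumably just adds identifiers to the shared Redis set, identifiers that the Labs API can resolve under https://CFG_LABS_HOSTNAME/api/. A hypothetical sketch of that producer follows; the element format is an assumption, only the key and the set operation mirror the consumer.

# Hypothetical producer side: push an API path fragment into the shared set.
# The 'literature/1234567' element format is an assumption for illustration.
r = redis.StrictRedis.from_url(CFG_REDIS_HOST_LABS)
r.sadd(CFG_REDIS_KEY, 'literature/1234567')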
def bst_hal():
    doi_map, arxiv_map, recid_map = get_hal_maps()
    matchable_records = get_record_ids_to_export()
    write_message("Total matchable records: %s" % len(matchable_records))
    hal_records = get_hal_records()
    write_message("Already matched records: %s" % len(hal_records))
    new_inspire_ids = intbitset(recid_map.keys()) - hal_records
    write_message("New records pushed from Inspire: %s" % len(new_inspire_ids))
    bibupload = ChunkedBibUpload(mode='a', notimechange=True, user='******')
    for recid in new_inspire_ids:
        hal_id = recid_map[recid]['halId_s']
        update_record(recid, hal_id, bibupload)
    write_message('Added HAL ids to all records pushed from Inspire')
    task_sleep_now_if_required()
    tot_records = matchable_records - hal_records - new_inspire_ids
    write_message("Additional records to be checked: %s" % len(tot_records))
    for i, recid in enumerate(tot_records):
        if i % 1000 == 0:
            write_message("%s records done out of %s" % (i, len(tot_records)))
            task_sleep_now_if_required()
        dois = get_fieldvalues(recid, tag='0247__a', sort=False)
        arxivs = get_fieldvalues(recid, tag='037__a', sort=False)
        matched_hal = [doi_map[doi] for doi in dois if doi in doi_map]
        matched_hal += [arxiv_map[arxiv] for arxiv in arxivs if arxiv in arxiv_map]
        # Let's assert that we matched at most one HAL document
        matched_hal_id = set(id(entry) for entry in matched_hal)
        if len(matched_hal_id) > 1:
            write_message("WARNING: record %s matches more than 1 HAL record: %s" % (recid, matched_hal), stream=sys.stderr)
            continue
        elif not matched_hal:
            continue
        hal_id = matched_hal[0]['halId_s']
        update_record(recid, hal_id, bibupload)
    return True
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Deletes records from a local Invenio instance
"""

from invenio.search_engine import get_record
from invenio.bibtaskutils import ChunkedBibUpload
from invenio.bibrecord import record_xml_output

bibupload = ChunkedBibUpload(mode='d', user='******', notimechange=True)

print "Invenio record deleter!"
print "Enter range of record IDs to be deleted:"
print "Start: "
range_start = int(raw_input())
print "End: "
range_end = int(raw_input()) + 1
print " ========== Let's do this! =========="

for recid in range(range_start, range_end):
    record = get_record(recid)
    if record:
        # Only the 001 controlfield is needed: in delete mode bibupload
        # just has to identify the record.
        marc = record_xml_output(record, tags=['001'])
        bibupload.add(marc)
        print "%s: Got it!" % str(recid)
    else:
        print "%s: No such record, skipping." % str(recid)
def bst_arxiv_doi_update(input_uri=None,
                         log_dir=CFG_TMPSHAREDDIR,
                         logging=True,
                         asana_key=CFG_ASANA_API_KEY,
                         asana_parent_id=ASANA_PARENT_TASK_ID,
                         skip_result_types='missing'):
    """Update DOIs on documents harvested from ArXiv.

    Parameters:
    :param input_uri: Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from
        http://arxiv.org/schemas/doi_feed_test.xml
    :param log_dir: Directory to store log files in
    :param logging: True or False, default True
    :param asana_key: The Asana API key; by default uses the value of
        CFG_ASANA_API_KEY
        NOTE: Passing the value of None for this parameter will skip writing
        to Asana and instead email the instance admin
    :param asana_parent_id: The task ID of the task in Asana to log subtasks to
    :param skip_result_types: Error messages to not bother with during
        reporting, input as comma-separated values (CSV)
        Possible values: missing, ambiguous, incorrect
    """
    skip_results = verify_skip_results(skip_result_types)

    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI: %s" % (input_uri,))

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=False)

    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        return False
    except ExpatError:
        _print("FATAL ERROR: Could not parse XML from: " + input_uri, 1)
        task_update_progress("Failed parsing DOI data")
        return False

    root = tree.getroot()

    try:
        date_el = root.find('date')
        date_str = '%s-%s-%s' % (date_el.get('year'), date_el.get('month'),
                                 date_el.get('day'))
        _print("Processing DOIs last updated on date %s" % date_str)
    except AttributeError:
        _print("Warning: Couldn't get last published date of Arxiv DOI feed.")

    doi_count = 0
    new_count = 0

    # Stores any DOIs we have issues with, in the structure:
    # Missing: (doi, arxiv preprint_id, published date)
    # Ambiguous: (doi, arxiv preprint_id, rec_ids)
    # Incorrect: (rec_id, old-doi, new-doi)
    problem_dois = {'missing': [], 'ambiguous': [], 'incorrect': []}

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            try:
                record_xml = append_to_record(rec_id, doi, published_date)
            except DOIError as ex:
                problem_dois['incorrect'].append((rec_id, ex.message, doi))
                continue
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload for "
                       + "%s record" % rec_id, 5)
                _print("** We will upload the following xml code %s"
                       % repr(record_xml), 9)
                bibupload.add(record_xml)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s'
                   % (len(rec_id), arxiv))
            problem_dois['ambiguous'].append((doi, arxiv, repr(rec_id)))
        else:
            _print('No record found matching arxiv ID: %s' % arxiv, 9)
            problem_dois['missing'].append((doi, arxiv, published_date))

    _print("========================| FINAL SCORE |=======================", 1)
    _print("DOIs found and processed: %d" % doi_count, 1)
    _print("Arxiv IDs without corresponding records: %d"
           % len(problem_dois['missing']), 1)
    _print("Arxiv IDs corresponding to multiple records (duplicates): %d"
           % len(problem_dois['ambiguous']), 1)
    _print("Inspire records with an incorrect DOI: %d"
           % len(problem_dois['incorrect']), 1)
    _print("Records without DOIs requiring appends: %d" % new_count, 1)
    _print("==============================================================", 1)

    bibupload.cleanup()

    notify_on_errors(problem_dois, log_dir, doi_count, new_count, asana_key,
                     asana_parent_id, skip_results)

    return True
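Based on the docstring, a typical manual invocation of this tasklet could look like the sketch below: it uses the test feed URL given in the docstring, skips Asana reporting by passing asana_key=None (which falls back to emailing the instance admin), and silences two of the documented report types.

# Example invocation based on the documented parameters; the feed URL is the
# test feed mentioned in the docstring.
bst_arxiv_doi_update(
    input_uri='http://arxiv.org/schemas/doi_feed_test.xml',
    asana_key=None,                      # email the admin instead of Asana
    skip_result_types='missing,ambiguous',
)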
def bst_arxiv_doi_update(input_uri=None, log_dir=CFG_TMPSHAREDDIR,
                         logging=True):
    """
    bst_arxiv_doi_update
    Updates DOIs on documents harvested from ArXiv.

    Parameters:
    * input_uri - Link to new URI data
        DEFAULT: https://vendor.ridge.aps.org/arXiv/latest_pub.xml
        NOTE: Test data can be taken from
        http://arxiv.org/schemas/doi_feed_test.xml
    * log_dir - Directory to store log files in
    * logging - True or False, default True
    """
    if input_uri is None:
        _print("Notice: No URI specified, defaulting to " + URI_DEFAULT)
        input_uri = URI_DEFAULT

    task_update_progress("Resolving URI...")

    # Testing builds characters
    bibupload = ChunkedBibUpload(mode='a', user=SCRIPT_NAME, notimechange=True)
    bibindex = ChunkedBibIndex(indexes='year,global,journal', user=SCRIPT_NAME)

    # open url and parse xml
    try:
        tree = ET.parse(urllib.urlopen(input_uri))
        _print('Opened DOI file ' + input_uri)
    except IOError:
        _print("FATAL ERROR: Could not open URL: " + input_uri, 1)
        task_update_progress("Failed retrieving DOI data")
        task_update_status("FAILED")
        return False

    root = tree.getroot()

    doi_count = 0
    new_count = 0
    missing_count = 0

    task_update_progress("Processing records...")
    # NB: Element.getiterator() is deprecated since version 2.7: Use
    # method Element.iter() instead.
    for item in root.getiterator('article'):
        doi_count += 1
        doi = item.get('doi')
        arxiv = item.get('preprint_id')
        published_date = item.get('published')
        _print("XML entry #%s: %s" % (str(doi_count), arxiv), 6)
        rec_id = get_record_by_arxiv_id(arxiv)
        if len(rec_id) == 1:
            rec_id = rec_id[0]
            record_xml = append_to_record(rec_id, doi, published_date)
            if record_xml:
                new_count += 1
                _print("* Now we will run the bibupload and bibindex for "
                       + str(rec_id) + " record", 5)
                _print("** We will upload the following xml code "
                       + repr(record_xml), 9)
                bibupload.add(record_xml)
                bibindex.add(rec_id)
        elif len(rec_id) > 1:
            _print('ERROR: %d records found with matching arXiv ID %s'
                   % (len(rec_id), arxiv))
        else:
            missing_count += 1
            _print('No record found matching arxiv ID: ' + arxiv, 9)

    _print("======================== FINAL SCORE ========================", 1)
    _print("DOIs found and processed: " + str(doi_count), 1)
    _print("Arxiv IDs without corresponding records: " + str(missing_count), 1)
    _print("Records requiring appends: " + str(new_count), 1)

    if logging:
        task_update_progress("Logging...")
        write_list_to_file(log_dir, 'errors', ERRORS)
        write_list_to_file(log_dir, 'messages', MESSAGES)

    task_update_progress(SCRIPT_NAME + " finished. %s DOIs processed, %s to add"
                         % (str(doi_count), str(new_count)))
    task_update_status("DONE")

    bibupload.__del__()
    bibindex.__del__()

    return True