def crossref_normalize_name(record):
    """
    Normalize the author names (often given with initials) of a record to
    the unified format produced by the bibauthor_name_utils tools.

    The first subfield of every 100 (main author) and 700 (additional
    author) field is rewritten in place; nothing is returned.

    @param record: record to modify, in BibRecord structure
    """
    # pattern for removing the space between two initials
    pattern_initials = r'([A-Z]\.)\s([A-Z]\.)'

    # main author first, then the additional authors
    for tag in ('100', '700'):
        for field in record_get_field_instances(record, tag):
            # The name is read from the first subfield of the field and is
            # written back to subfield position 0 with code 'a'.
            author = field[0][0][1]
            new_author = create_normalized_name(split_name_parts(author))
            # Matches cannot overlap, so "A. B. C." needs a second pass to
            # remove the space left between "B." and "C.".
            for _ in range(2):
                new_author = re.sub(pattern_initials, r'\g<1>\g<2>',
                                    new_author)
            record_modify_subfield(rec=record, tag=tag, subfield_code='a',
                                   value=new_author, subfield_position=0,
                                   field_position_global=field[4])
def create_ticket(recid, bibcatalog_system, queue=CFG_REFEXTRACT_TICKET_QUEUE):
    """
    Submit a "Refs for #<recid>" ticket for a record of the HEP collection.

    Nothing is submitted when no ticketing system or queue is given, when
    the record has no 980 $a equal to "HEP", or when one of its 037 $c
    values starts with "astro-ph".

    @param recid: id of the record the ticket is about
    @param bibcatalog_system: object providing ticket_submit()
    @param queue: name of the queue the ticket is submitted to
    """
    write_message("bibcatalog_system %s" % bibcatalog_system, verbose=1)
    write_message("queue %s" % queue, verbose=1)
    if not (bibcatalog_system and queue):
        return

    record = get_bibrecord(recid)

    # Only create tickets for HEP
    collections = [
        name
        for collection_field in record_get_field_instances(record, "980")
        for name in field_get_subfield_values(collection_field, "a")
    ]
    if "HEP" not in collections:
        write_message("not in hep", verbose=1)
        return

    # The subject carries the first report number of each 037 field
    subject_parts = ["Refs for #%s" % recid]
    for report_field in record_get_field_instances(record, "037"):
        categories = field_get_subfield_values(report_field, "c")
        if any(category.startswith("astro-ph") for category in categories):
            # We do not curate astro-ph
            write_message("astro-ph", verbose=1)
            return
        subject_parts.extend(
            field_get_subfield_values(report_field, "a")[:1])

    text = "%s/record/edit/#state=edit&recid=%s" % (CFG_SITE_SECURE_URL,
                                                    recid)
    bibcatalog_system.ticket_submit(subject=" ".join(subject_parts),
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def check_records(records):
    """
    Fix the INSPIRE id of the author "Yang, Yi" affiliated with
    "Beijing, Inst. High Energy Phys." in fields 100 and 700.

    For every such field:
      * when there is no $i subfield, $i INSPIRE-00341324 is appended;
      * otherwise every $i whose value contains INSPIRE-00227069 is
        replaced with INSPIRE-00341324.

    Each change is reported through record.set_amended().
    """
    wanted_author = 'Yang, Yi'
    wanted_affiliation = 'Beijing, Inst. High Energy Phys.'
    wrong_id = 'INSPIRE-00227069'
    right_id = 'INSPIRE-00341324'

    for record in records:
        author_fields = (record_get_field_instances(record, tag="100") +
                         record_get_field_instances(record, "700"))
        for field in author_fields:
            subfields = field_get_subfield_instances(field)
            by_code = dict(subfields)
            if (by_code.get('a') != wanted_author or
                    by_code.get('u') != wanted_affiliation):
                continue

            if 'i' not in by_code:
                subfields.append(('i', right_id))
                record.set_amended('Added INSPIRE-00341324 to Yang, Yi')
                continue

            for index, (code, value) in enumerate(subfields):
                if code == 'i' and wrong_id in value:
                    subfields[index] = ('i', right_id)
                    record.set_amended(
                        'Corrected INSPIRE-00227069 with INSPIRE-00341324 for Yang, Yi'
                    )
def crossref_normalize_name(record):
    """
    Rewrite the author names of a record (fields 100 and 700) in the
    unified format given by the bibauthor_name_utils tools.

    The record is modified in place: the first subfield of each author
    field is replaced by the normalized name, written with code 'a'.
    """
    # matches two initials separated by a single whitespace character
    initials_re = '([A-Z]\\.)\\s([A-Z]\\.)'

    def _normalized(raw_name):
        """Return raw_name normalized, without spaces between initials."""
        name = create_normalized_name(split_name_parts(raw_name))
        # a single substitution leaves every other gap untouched, hence
        # the second pass
        name = re.sub(initials_re, r'\g<1>\g<2>', name)
        return re.sub(initials_re, r'\g<1>\g<2>', name)

    # the main author comes first, the additional authors afterwards
    for author_tag in ('100', '700'):
        for instance in record_get_field_instances(record, author_tag):
            record_modify_subfield(rec=record,
                                   tag=author_tag,
                                   subfield_code='a',
                                   value=_normalized(instance[0][0][1]),
                                   subfield_position=0,
                                   field_position_global=instance[4])
def get_signatures_with_orcid(record):
    """
    Map author names to ORCIDs for the author fields of a record.

    Every 100 and 700 field is inspected. A field contributes an entry when
    it has an $a subfield and a $j subfield whose value starts with
    "ORCID:" (compared case-insensitively). Fields lacking either one are
    skipped.

    @param record: record in BibRecord structure
    @return: dictionary mapping the $a value to the $j value with its
        leading "ORCID:" prefix removed; when the same name occurs in
        several fields the last one wins
    @rtype: dict
    """
    prefix_length = len('ORCID:')
    out = {}
    author_fields = (record_get_field_instances(record, '100') +
                     record_get_field_instances(record, '700'))
    for field in author_fields:
        author = None
        orcid = None
        # Scan the subfields directly: a field may carry several $j
        # identifiers and the ORCID is not necessarily the last of them.
        for code, value in field_get_subfield_instances(field):
            if code == 'a':
                author = value
            elif code == 'j' and value.upper().startswith('ORCID:'):
                orcid = value[prefix_length:]
        if author is not None and orcid is not None:
            out[author] = orcid
    return out
def check_records(records):
    """
    Add the INSPIRE id listed in CHANGES to the matching authors.

    For every 100/700 field whose $a is a key of CHANGES:
      * without $i, the id from CHANGES is appended and the record is
        marked as amended;
      * with an $i that differs from CHANGES, the record is marked invalid.
    """
    for record in records:
        author_fields = (record_get_field_instances(record, '100') +
                         record_get_field_instances(record, '700'))
        for field in author_fields:
            subfields = field_get_subfield_instances(field)
            by_code = dict(subfields)
            if 'a' not in by_code:
                continue
            author = by_code['a']
            if author not in CHANGES:
                continue

            expected_id = CHANGES[author]
            if 'i' not in by_code:
                subfields.append(('i', expected_id))
                record.set_amended("Added INSPIRE ID %s to author %s" %
                                   (expected_id, author))
            elif by_code['i'] != expected_id:
                record.set_invalid(
                    "Author %s should have INSPIRE ID %s but has already INSPIRE ID %s"
                    % (author, expected_id, by_code['i']))
def get_ids_from_recid(recid):
    """Get all relevant identifiers from metadata of local record.

    @param recid: id of the local record
    @return: list of strings
        [recid, DOI, arXiv eprint, other-site id, system number]
        followed by the 037 $a report numbers. Missing identifiers are
        empty strings; newlines are replaced by a space and carriage
        returns are removed from every value.
    """
    record = get_record(recid)

    # Retrieving DOI
    # NOTE: the loop variable must not be called `doi`: under Python 2 a
    # list comprehension leaks its variable and would overwrite the default.
    doi = ""
    dois = [value
            for value in record_get_field_values(record, '024', '7', code='a')
            if value.startswith('10.')]
    if len(dois) > 1:
        sys.stderr.write(
            "WARNING: record %s have more than one DOI: %s" % (recid, dois)
            + "\n")
    if dois:
        doi = dois[0]

    # Retrieving arXiv eprint
    eprint = ""
    eprints = [value[len('oai:arXiv.org:'):]
               for value in record_get_field_values(record, '035', code='a')
               if value.lower().startswith('oai:arxiv.org:')]
    if len(eprints) > 1:
        sys.stderr.write(
            "WARNING: record %s have more than one arXiv eprint: %s"
            % (recid, eprints) + "\n")
    if eprints:
        eprint = eprints[0]

    # Retrieving Other service ID
    other_id = ''
    for field in record_get_field_instances(record, '035'):
        subfields = dict(field_get_subfield_instances(field))
        if (subfields.get('9', '').upper() == CFG_OTHER_SITE.upper()
                and subfields.get('a')):
            other_id = subfields['a']
    if CFG_INSPIRE_SITE and not other_id:
        # Fall back on 595 $a notes mentioning CDS: the id is whatever
        # follows the last "-" of the note.
        for field in record_get_field_instances(record, '595'):
            subfields = dict(field_get_subfield_instances(field))
            note = subfields.get('a', '')
            if "CDS" in note.upper():
                other_id = note.split("-")[-1]
                try:
                    int(other_id)
                except ValueError:
                    # Not an integer, we move on
                    other_id = ''

    reportnumbers = record_get_field_values(record, '037', code='a')

    system_number = ""
    if CFG_INSPIRE_SITE:
        for value in record_get_field_values(record, '970',
                                             filter_subfield_code="a",
                                             filter_subfield_value="SPIRES",
                                             filter_subfield_mode="s"):
            system_number = value.split("-")[-1]
            break  # There is typically only one

    out = [str(recid), doi, eprint, other_id, system_number] + reportnumbers
    return [val.replace('\n', ' ').replace('\r', '') for val in out]
def is_published(record):
    """
    Tell whether a record looks already published, based on fields 980
    and 773.

    Parameters:
     * record - dictionary: BibRecord dictionary.

    Returns: True when at least one 980 field has an 'a' subfield and at
             least one 773 field has a 'p' subfield, else False
    """
    collection_fields = record_get_field_instances(record, '980')
    journal_fields = record_get_field_instances(record, '773')
    return (
        any('a' in field_get_subfields(f980) for f980 in collection_fields)
        and
        any('p' in field_get_subfields(f773) for f773 in journal_fields)
    )
def print_essentials(record, tag_list):
    """
    Print the control field values of a record, then one line per
    requested datafield, followed by an empty line.

    @param record: record in BibRecord structure
    @param tag_list: dictionary with a 'control' list of tags and a
        'datafld' list of (tag, ind1, ind2, subfield codes) tuples
    """
    # Control fields come first
    for control_tag in tag_list['control']:
        for instance in record_get_field_instances(record, tag=control_tag):
            print(" %s: %s" % (control_tag, instance[3]))

    # Datafields follow, one output line per requested tag
    for tag, ind1, ind2, subs in tag_list['datafld']:
        instances = record_get_field_instances(record, tag, ind1, ind2)
        line = format_field_vals(get_fields_vals(instances, subs))
        print(" %s:%s" % (tag, line))

    print("")
def _create_ticket(recid, bibcatalog_system, queue):
    """
    Submit a "Refs for #<recid>" ticket to the given queue.

    On an INSPIRE site no ticket is created for arXiv papers, for user
    submissions (541 $c "submission"), for records outside the CORE
    collection, or for records created more than 30 * 4 days ago; the
    first report number of each 037 field is added to the subject.
    """
    subject_parts = ["Refs for #%s" % recid]

    if CFG_INSPIRE_SITE:
        record = get_bibrecord(recid)

        collections = [
            name
            for collection_field in record_get_field_instances(record, "980")
            for name in field_get_subfield_values(collection_field, 'a')
        ]
        if 'arXiv' in collections:
            # Tickets for arxiv papers are created in bibcatalog instead
            write_message("arXiv paper", verbose=1)
            return

        # Do not create tickets for user submissions
        sources = [
            source
            for source_field in record_get_field_instances(record, "541")
            for source in field_get_subfield_values(source_field, "c")
        ]
        if "submission" in sources:
            write_message("User submitted paper", verbose=1)
            return

        # Only create tickets for CORE papers
        if 'CORE' not in collections:
            write_message("not in core papers", verbose=1)
            return

        # Do not create tickets for old records
        creation_date = run_sql(
            """SELECT creation_date FROM bibrec WHERE id = %s""",
            [recid])[0][0]
        oldest_allowed = datetime.now() - timedelta(days=30 * 4)
        if creation_date < oldest_allowed:
            return

        # Add the first report number of each 037 field to the subject
        for report_field in record_get_field_instances(record, "037"):
            subject_parts.extend(
                field_get_subfield_values(report_field, 'a')[:1])

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL,
                                                    recid)
    bibcatalog_system.ticket_submit(subject=" ".join(subject_parts),
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def check_record(record, source_field, new_field, subfield_filter):
    """
    Move the fields of source_field that pass the filter to new_field.

    @param record: record to amend; must provide set_amended()
    @param source_field: 5-character field spec (tag + two indicators),
        with "_" standing for a blank indicator
    @param new_field: 5-character spec of the destination field
    @param subfield_filter: (code, value) pair. A field is moved when the
        code is None, or when one of its subfields has that value and the
        code is either '%' or the code of that subfield.
    """
    from collections import namedtuple
    from invenio.bibrecord import (record_add_field, record_delete_field,
                                   record_get_field_instances)

    assert len(source_field) == 5
    assert len(new_field) == 5
    source_field = source_field.replace("_", " ")
    new_field = new_field.replace("_", " ")

    assert len(subfield_filter) == 2
    SubfieldFilter = namedtuple('SubfieldFilter', ['code', 'value'])
    subfield_filter = SubfieldFilter(*subfield_filter)

    def filter_passes(subfield_code, result):
        if subfield_filter.code is None:
            return True
        if subfield_filter.code not in ('%', subfield_code):
            return False
        return subfield_filter.value == result

    source_tag = source_field[:3]
    target_tag = new_field[:3]

    # First pass: remove the matching fields, remembering their subfields
    moved = []
    for subfields, ind1, ind2, _, pos in record_get_field_instances(
            record, source_tag, source_field[3], source_field[4]):
        if not any(filter_passes(*subfield) for subfield in subfields):
            continue
        moved.append(subfields)
        record_delete_field(record, source_tag, ind1, ind2, pos)

    # Second pass: re-create them under the new tag and indicators
    for subfields in moved:
        record_add_field(record, target_tag, new_field[3], new_field[4],
                         subfields=subfields)
        record.set_amended('move from %s to %s: %s' %
                           (source_field.replace(" ", "_"),
                            new_field.replace(" ", "_"),
                            subfields))
def create_xml(recid, IDs, tags):
    """
    Build a correction record in which the $i subfields whose value is
    listed in IDs are dropped from the given fields.

    @param recid: id of the record to correct
    @param IDs: collection of $i values to remove
    @param tags: 5-character field specs (tag + two indicators)
    @return: output of print_rec() for the corrected fields
    """
    if VERBOSE:
        print("Working on %s" % recid)
    record = get_record(int(recid))
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=recid)
    for tag in tags:
        instances = record_get_field_instances(record,
                                               tag[0:3], tag[3], tag[4])
        for instance in instances:
            kept_subfields = []
            for code, value in instance[0]:
                if code == 'i' and value in IDs:
                    if VERBOSE:
                        print("Getting rid of %s from %s!" % (value, recid))
                    continue
                kept_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=kept_subfields)
    return print_rec(correct_record)
def create_xml(recid):
    """
    Build a correction record that labels Fermilab thesis PDFs as fulltext.

    Every 8564_ field of the record is copied; a ('y', 'Fulltext')
    subfield is inserted in front of each subfield whose value matches an
    inspirehep.net fermilab-thesis PDF URL.

    @param recid: id of the record to correct
    @return: output of print_rec() for the correction record (001 plus the
        8564_ fields), or None when no subfield matched
    """
    tag = '8564_'
    thesis_pdf = re.compile(
        r'inspirehep\.net/record/\d+/files/fermilab-thesis-.*?\.pdf',
        flags=re.IGNORECASE)

    record = get_record(recid)
    correct_record = {}
    # The record id belongs in the correction record, not in the record
    # that was just fetched.
    record_add_field(correct_record, '001', controlfield_value=str(recid))

    flag = False
    for field_instance in record_get_field_instances(record, tag[0:3],
                                                     tag[3], tag[4]):
        correct_subfields = []
        for code, value in field_instance[0]:
            if thesis_pdf.search(value):
                print('yes')
                flag = True
                correct_subfields.append(('y', 'Fulltext'))
            correct_subfields.append((code, value))
        record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                         subfields=correct_subfields)

    if flag:
        return print_rec(correct_record)
    return None
def create_xml(recid):
    """
    Searches for duplicate instances of 773 and keeps the good one.

    Only records having exactly two 773__p values that are equal are
    handled. For those, the subfields of all 773__ fields are merged into
    a single field: repeated (code, value) pairs are kept once, values
    equal to 'To appear in the proceedings of' are dropped, and of the $c
    subfields only the longest value is kept.

    @param recid: id of the record to check
    @return: output of print_rec() for the merged field, or None when the
        record is not handled
    """
    tag = '773__'
    tag_value = tag + 'p'
    journal = get_fieldvalues(recid, tag_value)
    # act only on a journal name that appears exactly twice
    if len(journal) == 2 and journal[0] == journal[1]:
        record = get_record(recid)
        correct_record = {}
        record_add_field(correct_record, '001', \
            controlfield_value=str(recid))
        field_instances = record_get_field_instances(record, \
            tag[0:3], tag[3], tag[4])
        correct_subfields = []
        # longest $c value seen so far; False until one is found
        c_value = False
        for field_instance in field_instances:
            for code, value in field_instance[0]:
                if value == 'To appear in the proceedings of':
                    pass
                elif (code, value) not in correct_subfields:
                    if code == 'c':
                        # $c is not appended right away: only the longest
                        # value is emitted, after the loop
                        if c_value:
                            if len(value) > len(c_value):
                                c_value = value
                        else:
                            c_value = value
                    else:
                        correct_subfields.append((code, value))
        # NOTE(review): nesting reconstructed from a flattened source; the
        # $c subfield is assumed to be added once, after all fields -- confirm
        if c_value:
            correct_subfields.append(('c', c_value))
        record_add_field(correct_record, tag[0:3], tag[3], tag[4], \
            subfields=correct_subfields)
        return print_rec(correct_record)
    return None
def create_xml(recid, tags):
    """Create xml file to replace to 100, 700 block.

    Every field matching one of the tags is copied into a correction
    record. In front of each $v subfield, one $u subfield is inserted per
    value returned by get_aff() for that $v value; results are cached in
    AFFILIATIONS_DONE under a key made of the upper-cased value with runs
    of non-word characters replaced by a space.

    @param recid: id of the record to correct
    @param tags: 5-character field specs (tag + two indicators)
    @return: output of print_rec() for the correction record when at least
        one $v subfield was processed without error, otherwise None
    """
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    flag = None
    for tag in tags:
        field_instances = record_get_field_instances(record, tag[0:3], \
            tag[3], tag[4])
        correct_subfields = []
        for field_instance in field_instances:
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'v':
                    try:
                        if VERBOSE:
                            print len(AFFILIATIONS_DONE)
                        affiliation_key = re.sub(r'\W+', ' ', value).upper()
                        # look the affiliation up only once per key
                        if not affiliation_key in AFFILIATIONS_DONE:
                            new_values = get_aff(value)
                            AFFILIATIONS_DONE[affiliation_key] = new_values
                        for new_value in AFFILIATIONS_DONE[affiliation_key]:
                            correct_subfields.append(('u', \
                                new_value.lstrip(' ')))
                        # NOTE(review): nesting reconstructed from a
                        # flattened source; flag is assumed to be set after
                        # the loop over the new values -- confirm
                        flag = True
                    except TypeError:
                        # presumably raised when get_aff() gives nothing
                        # iterable; the $v is then kept as is -- verify
                        pass
                correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4], \
                subfields=correct_subfields)
    if flag:
        return print_rec(correct_record)
def check_record(record, source_field, new_field, subfield_filter):
    """
    Re-tag the fields of a record: every source_field instance accepted by
    subfield_filter is deleted and added again as new_field.

    source_field and new_field are 5-character specs (tag followed by two
    indicators, "_" meaning blank). subfield_filter is a (code, value)
    pair: with a code of None every field is accepted, otherwise a field
    is accepted when one of its subfields holds the value under that code
    (or under any code when the code is '%').
    """
    from collections import namedtuple
    from invenio.bibrecord import (record_add_field, record_delete_field,
                                   record_get_field_instances)

    assert len(source_field) == 5
    assert len(new_field) == 5
    source_field = source_field.replace("_", " ")
    new_field = new_field.replace("_", " ")

    assert len(subfield_filter) == 2
    SubfieldFilter = namedtuple('SubfieldFilter', ['code', 'value'])
    subfield_filter = SubfieldFilter(*subfield_filter)

    def filter_passes(subfield_code, result):
        accepts_everything = subfield_filter.code is None
        code_matches = subfield_filter.code in ('%', subfield_code)
        return accepts_everything or (
            code_matches and subfield_filter.value == result)

    candidates = record_get_field_instances(
        record, source_field[:3], source_field[3], source_field[4])

    # Delete the accepted fields, keeping their subfields aside
    subfields_list = []
    for subfields, ind1, ind2, _, pos in candidates:
        accepted = any(filter_passes(*pair) for pair in subfields)
        if accepted:
            subfields_list.append(subfields)
            record_delete_field(record, source_field[:3], ind1, ind2, pos)

    # Add them back under the new tag, reporting each move
    for subfields in subfields_list:
        record_add_field(record, new_field[:3], new_field[3], new_field[4],
                         subfields=subfields)
        message = 'move from %s to %s: %s' % (
            source_field.replace(" ", "_"),
            new_field.replace(" ", "_"),
            subfields)
        record.set_amended(message)
def get_ids_from_recid(recid):
    """
    Get the identifiers of a local record from its metadata.

    @param recid: id of the local record
    @return: list of strings [recid, DOI, arXiv eprint, other-site id]
        followed by the 037 $a report numbers; missing identifiers are
        empty strings. When several DOIs or eprints are found a warning is
        written to stderr and the first one is used.
    """
    record = get_record(recid)

    ## Retrieving DOI
    # NOTE: the loop variable must not be called `doi`: under Python 2 a
    # list comprehension leaks its variable and would overwrite the default.
    doi = ""
    dois = [value
            for value in record_get_field_values(record, "024", "7", code="a")
            if value.startswith("10.")]
    if len(dois) > 1:
        sys.stderr.write(
            "WARNING: record %s have more than one DOI: %s" % (recid, dois)
            + "\n")
    if dois:
        doi = dois[0]

    ## Retrieving arXiv eprint
    eprint = ""
    eprints = [value[len("oai:arXiv.org:"):]
               for value in record_get_field_values(record, "035", code="a")
               if value.lower().startswith("oai:arxiv.org:")]
    if len(eprints) > 1:
        sys.stderr.write(
            "WARNING: record %s have more than one arXiv eprint: %s"
            % (recid, eprints) + "\n")
    if eprints:
        eprint = eprints[0]

    ## Retrieving Other service ID
    other_id = ""
    for field in record_get_field_instances(record, "035"):
        subfields = dict(field_get_subfield_instances(field))
        if (subfields.get("9", "").upper() == CFG_OTHER_SITE.upper()
                and subfields.get("a")):
            other_id = subfields["a"]

    reportnumbers = record_get_field_values(record, "037", code="a")
    return [str(recid), doi, eprint, other_id] + reportnumbers
def create_xmlrefs(recid):
    """
    Build a correction record renaming a journal in the references.

    Every 999C5 field of the record is copied. In the $s subfields that
    contain "<old_journal>,<volume>," for a volume in
    range(vol_change, vol_curr), old_journal is replaced by repl_journal.
    old_journal, repl_journal, vol_change and vol_curr are module-level
    settings.

    @param recid: id of the record to correct
    @return: output of print_rec() for the correction record
    """
    subrefs = ['%s,%i,' % (old_journal, volume)
               for volume in range(vol_change, vol_curr)]
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))
    for field_instance in record_get_field_instances(record, '999', 'C', '5'):
        correct_subfields = []
        for code, value in field_instance[0]:
            if code == 's' and any(subref in value for subref in subrefs):
                # Literal replacement: the journal name is matched
                # literally above, and names such as "Phys.Rev." contain
                # characters that are special in a regular expression.
                newval = value.replace(old_journal, repl_journal)
                if VERBOSE:
                    print("%s: Replacing %s with %s" % (recid, value, newval))
                correct_subfields.append(('s', newval))
            else:
                correct_subfields.append((code, value))
        record_add_field(correct_record, '999', 'C', '5',
                         subfields=correct_subfields)
    return print_rec(correct_record)
def check_records(records, field):
    """
    Remove the space following a dot in the values of the given field.

    For any field other than '999C5s' the values are amended in place
    through record.amend_field(). For '999C5s' the $s subfield of each
    999C5 reference is replaced (the new value is appended at the end of
    the field); when the reference has no $0 and the new pubnote matches
    exactly one record in a journal search, that record id is added as $0.
    """
    for record in records:
        if field != '999C5s':
            for position, value in record.iterfields([field]):
                compacted = value.replace('. ', '.')
                if compacted != value:
                    record.amend_field(position, compacted)
            continue

        for reference in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(reference)
            by_code = dict(subfields)
            if 's' not in by_code:
                continue
            old_pubnote = by_code['s']
            new_pubnote = old_pubnote.replace('. ', '.')
            if old_pubnote == new_pubnote:
                continue

            subfields.remove(('s', old_pubnote))
            subfields.append(('s', new_pubnote))

            if '0' not in by_code:
                # try to link the reference now that the pubnote is fixed
                recids = perform_request_search(p=new_pubnote, f='journal')
                if len(recids) == 1:
                    recid = recids.pop()
                    subfields.append(('0', str(recid)))
                    record.set_amended("Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!" % (old_pubnote, new_pubnote, recid))
                    continue
            record.set_amended("Pubnote changed from %s to %s" %
                               (old_pubnote, new_pubnote))
def tokenize(self, recID):
    """
    Collect the subfield values of a record selected by self.rules.

    Each rule is a (tag_to_index, necessary_tag, necessary_value) triple.
    tag_to_index is a 6-character spec: tag, two indicators and the code
    of the subfield to collect. A field contributes its value when it has
    that subfield and:
      * necessary_tag is '' or the field has a subfield with that code;
      * necessary_value is '' or the field's necessary_tag subfield
        equals it.

    @param recID: id of the record to read
    @return: list of the collected values; an empty list when a KeyError
        is raised while processing
    """
    phrases = []
    try:
        rec = get_record(recID)
        for tag_to_index, necessary_tag, necessary_value in self.rules:
            core_tag = tag_to_index[0:3]
            ind = tag_to_index[3:5]
            sub_tag = tag_to_index[5]

            for instance in record_get_field_instances(rec, core_tag,
                                                       ind[0], ind[1]):
                field = dict(instance[0])
                tag_condition = necessary_tag == '' or (
                    necessary_tag and necessary_tag in field)
                value_condition = necessary_value == '' or (
                    necessary_value and
                    field.get(necessary_tag, '') == necessary_value)
                if tag_condition and value_condition and sub_tag in field:
                    phrases.append(field[sub_tag])
    except KeyError:
        return []
    return phrases
def replace_references(recid):
    """Return the MARCXML of a record with freshly extracted references.

    The stored record is left untouched: its 999 fields are swapped, on a
    copy obtained from get_record(), for those found in the output of
    extract_references_from_record_xml().

    Parameters:
    * recid: the id of the record

    Returns the MARCXML of the updated record, or None when the extracted
    references could not be turned into a record.
    """
    # Parse references
    extracted_xml = extract_references_from_record_xml(recid)
    parsed = create_record(extracted_xml.encode("utf-8"))
    # Record marc xml
    record = get_record(recid)

    references_record = parsed[0]
    if not references_record:
        return None

    new_fields = record_get_field_instances(references_record,
                                            tag="999",
                                            ind1="%",
                                            ind2="%")
    # Replace 999 fields
    record_delete_fields(record, "999")
    record_add_fields(record, "999", new_fields)
    return record_xml_output(record)
def generate_ticket(ticket, record):
    """
    Fill in the subject, body and queue of a curation ticket for a record.

    The subject is made of the first report number of each 037 field
    followed by "(#<recid>)"; the body is a link to the record editor in
    which every "%" is doubled; the queue is "HEP_curation".

    @param ticket: a ticket object as created by BibCatalogTicket()
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the same ticket object, modified
    @rtype: BibCatalogTicket
    """
    recid = record_id_from_record(record)

    subject_parts = []
    for report_field in record_get_field_instances(record, "037"):
        # only the first $a of each field goes in the subject
        subject_parts.extend(field_get_subfield_values(report_field, 'a')[:1])
    subject_parts.append("(#%s)" % (recid,))

    editor_link = 'Curate record here: %s/record/edit/#state=edit&recid=%s' % \
        (CFG_SITE_SECURE_URL, recid)

    ticket.subject = " ".join(subject_parts)
    ticket.body = editor_link.replace('%', '%%')
    ticket.queue = "HEP_curation"
    return ticket
def check_records(records):
    """
    Repair the broken pubnotes of references that are not linked yet.

    For each 999C5 field without $0 whose $s matches RE_BROKEN_PUBNOTES,
    the $s is rebuilt as "<journal>,<volume>,P<id>" (and moved to the end
    of the field). When exactly one record matches the new pubnote, its
    id is added as $0.
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query

    config = ConfigParser.ConfigParser()
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))

    for record in records:
        for reference in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(reference)
            by_code = dict(subfields)
            if '0' in by_code or 's' not in by_code:
                continue

            old_pubnote = by_code['s']
            broken = RE_BROKEN_PUBNOTES.match(old_pubnote)
            if not broken:
                continue

            new_pubnote = '%(journal)s,%(volume)s,P%(id)s' % broken.groupdict()
            subfields.remove(('s', old_pubnote))
            subfields.append(('s', new_pubnote))

            recids = get_recids_matching_query(p=new_pubnote,
                                               f='journal',
                                               config=config)
            if len(recids) != 1:
                record.set_amended("Pubnote changed from %s to %s" %
                                   (old_pubnote, new_pubnote))
                continue

            recid = recids.pop()
            subfields.append(('0', str(recid)))
            record.set_amended(
                "Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!"
                % (old_pubnote, new_pubnote, recid))
def check_records(records, field):
    """
    Strip the space that follows a dot in the values of `field`.

    Fields other than '999C5s' are amended value by value. For '999C5s'
    the $s of every 999C5 reference is rewritten (the new value ends up
    last in the field) and, when the reference has no $0 yet, a journal
    search on the new pubnote is used to add the id of the single
    matching record, if any.
    """
    def _squeeze(text):
        return text.replace('. ', '.')

    for record in records:
        if field != '999C5s':
            for position, value in record.iterfields([field]):
                if _squeeze(value) != value:
                    record.amend_field(position, _squeeze(value))
            continue

        for afield in record_get_field_instances(record, '999', 'C', '5'):
            subfields = field_get_subfield_instances(afield)
            lookup = dict(subfields)
            if 's' not in lookup:
                continue

            old_pubnote = lookup['s']
            new_pubnote = _squeeze(old_pubnote)
            if new_pubnote == old_pubnote:
                continue

            subfields.remove(('s', old_pubnote))
            subfields.append(('s', new_pubnote))

            matched = False
            if '0' not in lookup:
                recids = perform_request_search(p=new_pubnote, f='journal')
                if len(recids) == 1:
                    recid = recids.pop()
                    subfields.append(('0', str(recid)))
                    matched = True

            if matched:
                record.set_amended(
                    "Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!"
                    % (old_pubnote, new_pubnote, recid))
            else:
                record.set_amended("Pubnote changed from %s to %s" %
                                   (old_pubnote, new_pubnote))
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Translate, in place, the subfield values of the fields with the given
    tag from LaTeX to the chosen encoding. Only the subfields with the
    given code are translated, or all of them when no code is given.

    @param record: record to modify, in BibRec style structure
    @type record: dict

    @param tag: tag of fields to modify
    @type tag: string

    @param code: restrict the translation to a given subfield code
    @type code: string

    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    for field in record_get_field_instances(record, tag):
        for position, (subfield_code, subfield_value) in enumerate(field[0]):
            if code != '' and subfield_code != code:
                continue
            translated = translate_latex2unicode(subfield_value)
            record_modify_subfield(record, tag, subfield_code,
                                   translated.encode(encoding), position,
                                   field_position_global=field[4])
def translate_fieldvalues_from_latex(record, tag, code='', encoding='utf-8'):
    """
    Modify a record by converting subfield values from LaTeX to the given
    encoding, for every field carrying the given tag.

    @param record: record to modify, in BibRec style structure
    @type record: dict

    @param tag: tag of fields to modify
    @type tag: string

    @param code: when not empty, only subfields with this code are
        translated
    @type code: string

    @param encoding: character encoding for the new value. Defaults to UTF-8.
    @type encoding: string
    """
    translate_all = code == ''
    for field in record_get_field_instances(record, tag):
        global_position = field[4]
        index = -1
        for subfield_code, subfield_value in field[0]:
            # index tracks the position of the current subfield in the field
            index += 1
            if not (translate_all or subfield_code == code):
                continue
            newvalue = translate_latex2unicode(subfield_value).encode(
                encoding)
            record_modify_subfield(record, tag, subfield_code, newvalue,
                                   index,
                                   field_position_global=global_position)
def record_get_value_with_provenence(record, tag, ind1=" ", ind2=" ",
                                     value_code="", provenence_code="9",
                                     provenence_value="arXiv"):
    """
    Collect the values of a field coming from a given provenence.

    A field is taken into account when one of its subfields has the code
    provenence_code and the value provenence_value; all of its subfields
    with code value_code are then returned.

    @return: list of subfield values
    """
    final_values = []
    for subfields, _ind1, _ind2, _value, _position in \
            record_get_field_instances(record, tag, ind1, ind2):
        has_provenence = any(
            code == provenence_code and value == provenence_value
            for code, value in subfields)
        if not has_provenence:
            # No hits.. continue to next field
            continue
        final_values.extend(value for code, value in subfields
                            if code == value_code)
    return final_values
def generate_ticket(ticket, record):
    """
    Prepare a curation ticket for a record by setting the subject, body
    and queue of the passed BibCatalogTicket object.

    @param ticket: a ticket object as created by BibCatalogTicket()
    @type ticket: record object of BibCatalogTicket.

    @param record: a recstruct object as created by bibrecord.create_record()
    @type record: record object of BibRecord.

    @return: the ticket that was passed in, with subject (first report
        number of each 037 field, then the record id), body (link to the
        record editor, "%" doubled) and queue ("HEP_curation") filled in
    @rtype: BibCatalogTicket
    """
    recid = record_id_from_record(record)

    def _first_report_numbers():
        """Yield the first $a value of each 037 field that has one."""
        for report_field in record_get_field_instances(record, "037"):
            for number in field_get_subfield_values(report_field, 'a'):
                yield number
                break

    words = list(_first_report_numbers())
    words.append("(#%s)" % (recid, ))
    ticket.subject = " ".join(words)

    body = 'Curate record here: %s/record/edit/#state=edit&recid=%s' % \
        (CFG_SITE_SECURE_URL, recid)
    ticket.body = body.replace('%', '%%')

    ticket.queue = "HEP_curation"
    return ticket
def record_get_value_with_provenence(record, provenence_value,
                                     provenence_code, tag, ind1=" ",
                                     ind2=" ", code=""):
    """
    Return the subfield values of the given field(s) that carry a given
    provenence code/value combination.

    Example: to extract the subject categories (65017 $a) whose
    provenence, stored in $9, is "arXiv" from

        65017 $ahep-ph$9arXiv
        65017 $ahep-th$9arXiv
        65017 $aMath$9INSPIRE

    the function returns ["hep-ph", "hep-th"].

    Returns a list of subfield values.
    """
    final_values = []
    for subfields, _ind1, _ind2, _value, _position in \
            record_get_field_instances(record, tag, ind1, ind2):
        # Look for the provenence marker among the subfields
        found = False
        for subfield_code, value in subfields:
            if subfield_code == provenence_code and value == provenence_value:
                found = True
                break
        if not found:
            continue
        # Correct provenence: keep every value stored under the wanted code
        final_values += [value for subfield_code, value in subfields
                         if subfield_code == code]
    return final_values
def create_xml(recid, tags, experiment):
    """
    Build a correction record adding author identifiers to a record.

    Every field matching one of the tags is copied. In front of each $a
    subfield, the result of convert_search_to_inspire_id() for the search
    "find a <name> and exp <experiment>" is inserted: its first item as
    $i and its second item, prefixed with "ORCID:", as $j (empty items
    are skipped).

    @return: output of print_rec() for the correction record when at
        least one identifier was added, otherwise None
    """
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))

    identifier_added = False
    for tag in tags:
        for field_instance in record_get_field_instances(record, tag[0:3],
                                                         tag[3], tag[4]):
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'a':
                    search = 'find a ' + value + ' and exp ' + experiment
                    identifiers = convert_search_to_inspire_id(search)
                    if identifiers[0]:
                        identifier_added = True
                        correct_subfields.append(('i', identifiers[0]))
                    if identifiers[1]:
                        identifier_added = True
                        correct_subfields.append(
                            ('j', 'ORCID:' + identifiers[1]))
                correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=correct_subfields)

    if not identifier_added:
        return None
    return print_rec(correct_record)
def format_element(bfo, limit, separator=' ; ', extension='[...]',
                   print_links="yes"):
    """
    Prints the editors of a record, i.e. the names ($a) of the 100 fields
    whose first $e subfield is "ed.".

    @param limit: the maximum number of editors to display
    @param separator: the separator between editors.
    @param extension: a text printed if more editors than 'limit' exist
    @param print_links: if yes, print the editors as HTML link to their publications
    """
    from urllib import quote
    from invenio.config import CFG_BASE_URL
    from invenio import bibrecord

    def _is_editor(author_field):
        roles = bibrecord.field_get_subfield_values(author_field, "e")
        return len(roles) > 0 and roles[0] == "ed."

    def _as_link(editor):
        return ('<a href="' + CFG_BASE_URL + '/search?f=author&p=' +
                quote(editor) +
                '&ln=' + bfo.lang +
                '">' + editor + '</a>')

    editors = [
        bibrecord.field_get_subfield_values(author_field, 'a')[0]
        for author_field in bibrecord.record_get_field_instances(
            bfo.get_record(), '100')
        if _is_editor(author_field)
    ]

    if print_links.lower() == "yes":
        editors = [_as_link(editor) for editor in editors]

    if limit.isdigit():
        shown = int(limit)
        if len(editors) > shown:
            return separator.join(editors[:shown]) + extension
    if editors:
        return separator.join(editors)
    return None
def check_records(records):
    """
    Rewrite the broken pubnotes found in unlinked references.

    Applies to the 999C5 fields that have a $s matching
    RE_BROKEN_PUBNOTES and no $0: the $s becomes
    "<journal>,<volume>,P<id>" (appended at the end of the field) and,
    when the citation indexer finds exactly one record for it, the id of
    that record is added as $0.
    """
    from invenio.bibrank import ConfigParser, CFG_ETCDIR
    from invenio.bibrank_citation_indexer import get_recids_matching_query

    config = ConfigParser.ConfigParser()
    config.read("%s/bibrank/%s.cfg" % (CFG_ETCDIR, "citation"))

    def _fix_reference(record, subfields):
        """Repair one reference, given its (mutable) list of subfields."""
        lookup = dict(subfields)
        if '0' in lookup or 's' not in lookup:
            return
        old_pubnote = lookup['s']
        g = RE_BROKEN_PUBNOTES.match(old_pubnote)
        if not g:
            return
        new_pubnote = '%(journal)s,%(volume)s,P%(id)s' % g.groupdict()
        subfields.remove(('s', old_pubnote))
        subfields.append(('s', new_pubnote))
        recids = get_recids_matching_query(p=new_pubnote, f='journal',
                                           config=config)
        if len(recids) == 1:
            recid = recids.pop()
            subfields.append(('0', str(recid)))
            record.set_amended("Pubnote changed from %s to %s and matched a new record %s: Sam is the best, HURRAY!!!" % (old_pubnote, new_pubnote, recid))
        else:
            record.set_amended("Pubnote changed from %s to %s" %
                               (old_pubnote, new_pubnote))

    for record in records:
        for field in record_get_field_instances(record, '999', 'C', '5'):
            _fix_reference(record, field_get_subfield_instances(field))
def create_xml(recid, correction_dict):
    """
    Fix the citations of Fermilab reports.

    Every REF field of the record is copied into a correction record; an
    $r subfield whose upper-cased value is a key of correction_dict gets
    the corresponding value instead.

    @return: output of print_rec() for the correction record when at
        least one value was replaced, otherwise None
    """
    tags = [REF]
    record = get_record(recid)
    correct_record = {}
    record_add_field(correct_record, '001', controlfield_value=str(recid))

    changed = False
    for tag in tags:
        for field_instance in record_get_field_instances(record, tag[0:3],
                                                         tag[3], tag[4]):
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'r' and value.upper() in correction_dict:
                    print('Was: %s' % (value,))
                    value = correction_dict[value.upper()]
                    print('Now: %s' % (value,))
                    changed = True
                correct_subfields.append((code, value))
            record_add_field(correct_record, tag[0:3], tag[3], tag[4],
                             subfields=correct_subfields)

    return print_rec(correct_record) if changed else None
def replace_references(recid):
    """Build the MARCXML of a record with freshly extracted references.

    The stored record is not touched; only the returned XML carries the
    new 999 fields.

    Parameters:
    * recid: the id of the record

    Returns the MARCXML string, or None when the extracted references did
    not yield a record.
    """
    # Extract and parse the references
    extracted_xml = extract_references_from_record_xml(recid)
    parsed = create_record(extracted_xml.encode('utf-8'))
    # Current content of the record
    record = get_record(recid)

    if not parsed[0]:
        return None

    # Swap every existing 999 field for the extracted ones
    new_fields = record_get_field_instances(parsed[0],
                                            tag='999',
                                            ind1='%',
                                            ind2='%')
    record_delete_fields(record, '999')
    record_add_fields(record, '999', new_fields)
    return record_xml_output(record)
def create_xml(recid):
    """
    Build a correction record that renames the journal of a record.

    Every field selected by the module-level `tag` is copied with its $p
    subfield set to `repl_journal`. When `volume_letter` is set, each $v
    subfield is replaced by the record's 773__v values, prefixed with
    `volume_letter` unless the value already starts with a letter.

    @param recid: id of the record to correct
    @return: print_rec() output of the correction record
    """
    source_record = get_record(recid)
    corrected = {}
    record_add_field(corrected, '001', controlfield_value=str(recid))
    for instance in record_get_field_instances(source_record,
                                               tag[0:3], tag[3], tag[4]):
        new_subfields = []
        for code, value in instance[0]:
            if code == 'p':
                new_subfields.append(('p', repl_journal))
            elif code == 'v' and volume_letter:
                # NOTE(review): the subfield's own value is ignored here; all
                # 773__v values of the record are emitted for each $v found.
                for existing in get_fieldvalues(recid, '773__v'):
                    if existing[0].isalpha():
                        new_subfields.append(('v', existing))
                    else:
                        new_subfields.append(('v', volume_letter + existing))
            else:
                new_subfields.append((code, value))
        record_add_field(corrected, tag[0:3], tag[3], tag[4],
                         subfields=new_subfields)
    return print_rec(corrected)
def _create_ticket(recid, bibcatalog_system, queue):
    """
    Submit a "Refs for #<recid>" ticket for a record.

    On an INSPIRE site no ticket is created when the record is in the
    'arXiv' collection, is not in the 'CORE' collection, was created more
    than 30*4 days ago, or has a 037 $c category starting with 'astro-ph'.
    Otherwise the first $a of each 037 field is appended to the subject.

    @param recid: id of the record the ticket is about
    @param bibcatalog_system: object providing ticket_submit()
    @param queue: queue the ticket is submitted to
    """
    subject = "Refs for #%s" % recid

    if CFG_INSPIRE_SITE:
        record = get_bibrecord(recid)

        collections = [name
                       for field in record_get_field_instances(record, "980")
                       for name in field_get_subfield_values(field, 'a')]
        if 'arXiv' in collections:
            # Tickets for arXiv papers are created elsewhere (bibcatalog)
            write_message("arXiv paper", verbose=1)
            return
        if 'CORE' not in collections:
            # Tickets are only wanted for curated (CORE) records
            write_message("not in hep", verbose=1)
            return

        # Skip records that are too old
        creation_date = run_sql("""SELECT creation_date FROM bibrec WHERE id = %s""", [recid])[0][0]
        oldest_allowed = datetime.now() - timedelta(days=30*4)
        if creation_date < oldest_allowed:
            return

        for report_field in record_get_field_instances(record, "037"):
            categories = field_get_subfield_values(report_field, 'c')
            if any(category.startswith('astro-ph') for category in categories):
                # astro-ph is not curated
                write_message("astro-ph", verbose=1)
                return
            # Put the field's first report number in the subject
            report_numbers = field_get_subfield_values(report_field, 'a')
            if report_numbers:
                subject += " " + report_numbers[0]

    text = '%s/record/edit/#state=edit&recid=%s' % (CFG_SITE_SECURE_URL, recid)
    bibcatalog_system.ticket_submit(subject=subject,
                                    queue=queue,
                                    text=text,
                                    recordid=recid)
def check_existing_pdg_fields(recids, pdg_data, current_records):
    """
    Work out which records need their PDG 084 fields corrected.

    For each recid the $a values of the current PDG 084 fields are compared
    with pdg_data[recid]. Stale fields are deleted and missing ones added
    (with $2 and $9 set to 'PDG') on a copy limited to the 001 and 084
    fields of the current record.

    @param recids: ids of the records to check
    @param pdg_data: mapping recid -> iterable of wanted PDG values
    @param current_records: mapping recid -> record structure
    @return: dict recid -> corrected partial record, for changed records only
    """
    _print_out("Comparing new and old PDG data for " + str(len(recids)) + " records...")
    corrected = {}
    for recid in recids:
        record_mod = {}
        record_mod['001'] = deepcopy(current_records[recid]['001'])
        record_mod['084'] = deepcopy(current_records[recid]['084'])

        pdg_fields = [field
                      for field in record_get_field_instances(record_mod, '084')
                      if is_pdg_field(field)]
        existing_values = set(field_get_subfield_values(field, 'a')[0]
                              for field in pdg_fields)
        wanted_values = set(pdg_data[recid])
        deletions = list(existing_values - wanted_values)
        additions = list(wanted_values - existing_values)

        if not deletions and not additions:
            _print_verbose("Nothing to change for record #" + str(recid))
            continue

        if deletions:
            for field in pdg_fields:
                if field_get_subfield_values(field, 'a')[0] in deletions:
                    record_delete_field(record_mod, '084', ind1=' ', ind2=' ',
                                        field_position_global=field[4])

        for pdg_field in additions:
            position = record_add_field(record_mod, '084', ' ', ' ')
            for code, content in (('2', 'PDG'), ('9', 'PDG'), ('a', pdg_field)):
                record_add_subfield_into(record_mod, '084', code, content,
                                         field_position_global=position)

        corrected[recid] = record_mod
        _print_verbose("Record #" + str(recid) + ": " + str(len(deletions)) +
                       " deletions and " + str(len(additions)) + " additons.")

    _print_out(str(len(corrected)) + " records to be corrected.")
    return corrected
def check_arxiv(recid):
    """
    Tell whether a record has a 037 $a value starting with 'arXiv'.

    @param recid: id of the record to inspect
    @return: True if such a value exists, False otherwise
    """
    record = get_record(recid)
    return any(value.startswith('arXiv')
               for field in record_get_field_instances(record, "037")
               for value in field_get_subfield_values(field, 'a'))
def create_xml(recid, arxiv_ids):
    """
    Build a record holding the 856 4_ fields accepted by check_arxiv_url().

    @param recid: id of the record to read
    @param arxiv_ids: passed through to check_arxiv_url() for each field
    @return: print_rec() output of a record with 001 and the kept 856 fields
    """
    source = get_record(recid)
    kept = []
    for attachment in record_get_field_instances(source, tag='856', ind1='4'):
        if check_arxiv_url(attachment, arxiv_ids):
            kept.append(attachment)

    result = {}
    record_add_field(result, '001', controlfield_value=str(recid))
    record_add_fields(result, '856', kept)
    return print_rec(result)
def create_xml(recid, fname=None, oaff=None):
    """
    Attach affiliations to the first 100 field of a record.

    The 700 fields of the record are deleted and each affiliation is added
    as a $u subfield of the first 100 field.

    @param recid: id of the record to modify
    @param fname: accepted for interface compatibility; not used here
    @param oaff: iterable of affiliation strings; None (the default) is
        treated as "no affiliations" instead of raising TypeError
    @return: print_rec() output of the modified record
    @raise IndexError: if the record has no 100 field
    """
    # Materialise once so a generator argument is consumed safely and the
    # default None does not blow up when iterated.
    affs = list(oaff) if oaff is not None else []
    record = get_record(recid)
    auth_location = record_get_field_instances(record, '100', '', '')[0][4]
    record_delete_field(record, '700', '', '')
    for aff in affs:
        record_add_subfield_into(record, '100', 'u', aff,
                                 field_position_global=auth_location)
    return print_rec(record)
def record_in_collection(record, collection):
    """
    Tell whether a record belongs to a collection.

    The comparison is made case-insensitively against every 980 $a value.

    @return: True if one of the values matches, False otherwise
    """
    return any(name.lower() == collection.lower()
               for field in record_get_field_instances(record, "980")
               for name in field_get_subfield_values(field, 'a'))
def check_records(records):
    """
    Add the INSPIRE ID listed in CHANGES to matching authors.

    For every 100 and 700 field whose $a value is a key of CHANGES: if the
    field has no $i subfield the mapped ID is appended and the record is
    flagged as amended; if it has a different $i the record is flagged as
    invalid; a matching $i is left alone.

    @param records: iterable of records offering set_amended()/set_invalid()
    """
    for record in records:
        author_fields = (record_get_field_instances(record, '100') +
                         record_get_field_instances(record, '700'))
        for author_field in author_fields:
            subfields = field_get_subfield_instances(author_field)
            by_code = dict(subfields)
            if 'a' not in by_code or by_code['a'] not in CHANGES:
                continue
            if 'i' not in by_code:
                subfields.append(('i', CHANGES[by_code['a']]))
                record.set_amended(
                    "Added INSPIRE ID %s to author %s" %
                    (CHANGES[by_code['a']], by_code['a']))
            elif by_code['i'] != CHANGES[by_code['a']]:
                record.set_invalid(
                    "Author %s should have INSPIRE ID %s but has already INSPIRE ID %s" %
                    (by_code['a'], CHANGES[by_code['a']], by_code['i']))
def rollback_record(recid):
    """
    Restore the 520 fields of a record from one of its revisions.

    Revisions are visited in the order given by get_record_revision_ids();
    the first one that has 520 fields is handed to create_our_record().

    @param recid: id of the record to roll back
    @return: result of create_our_record(), or None if no revision has a
        520 field
    """
    print('id %s' % (recid,))
    for revision in get_record_revision_ids(recid):
        revision_record = create_record(get_marcxml_of_revision_id(revision))
        abstracts = record_get_field_instances(revision_record[0], tag='520')
        if not abstracts:
            continue
        print('reverting to %s' % (revision,))
        return create_our_record(recid, abstracts)
    print('FAILED %s' % (recid,))
def main():
    """
    List the records whose `tag` fields mention `journal` but lack $r/$0.

    Runs the module-level `search` in the HEP collection. For each hit,
    every field selected by `tag` that has a subfield value containing
    `journal` is inspected; the record is reported when such a field has
    neither a $r nor a $0 subfield. Edit links for the reported records are
    written to an HTML file named after this script.

    Behaviour is driven by the module-level TEST and VERBOSE flags.
    """
    # Only rewrite the real ".py" extension: the previous pattern '.py'
    # matched any character followed by "py", anywhere in the name.
    html_name = re.sub(r'\.py$', '.html', __file__)
    # NOTE(review): this captures a single leading whitespace character of
    # `journal`; confirm that is the intended filename component.
    matchObj = re.match(r'^(\s)', journal)
    if matchObj:
        filename = 'tmp_' + matchObj.group(1) + '_' + html_name
    else:
        filename = 'tmp_' + html_name

    if TEST:
        print("Testing mode....")
    else:
        print("Checking records in this search: %s" % search)

    x = perform_request_search(p=search, cc='HEP')
    if len(x) == 0:
        if VERBOSE:
            print("No results in search")
        return

    if VERBOSE:
        print("%i records in search" % len(x))

    check_these_records = set()
    # The context manager closes the file even if a lookup below raises.
    with open(filename, 'w') as output:
        for r in x:
            if VERBOSE:
                print("Working on record %i" % r)
            record = get_record(r)
            suspect_subfield_lists = []
            for field_instance in record_get_field_instances(
                    record, tag[0:3], tag[3], tag[4]):
                for (code, value) in field_instance[0]:
                    if journal in value:
                        if TEST:
                            print("suspect field_instance[0]:  %s" %
                                  (field_instance[0],))
                        suspect_subfield_lists.append(field_instance[0])
            for item in suspect_subfield_lists:
                # Look at subfield codes only; testing membership in the
                # (code, value) pair also matched a value equal to 'r'/'0'.
                if any(code in ('r', '0') for code, dummy_value in item):
                    if TEST:
                        print("'r' or '0' in item: %s" % (item,))
                else:
                    if VERBOSE:
                        print("Found a record that needs checking: %i" % r)
                    check_these_records.add(r)

        if check_these_records:
            ordered = sorted(check_these_records)
            if VERBOSE:
                print("%i records of %i total in search should be checked" % (
                    len(ordered), len(x)))
            output.writelines([
                '<a href="https://inspirehep.net/record/edit/?ln=en#state=edit&recid=%i">%i</a><br />'
                % (r, r) for r in ordered
            ])
def create_our_record(recid):
    """
    Build a record with the de-duplicated 980 fields of a record.

    Fields whose subfields are exactly [('a', 'unknown')] are dropped; the
    rest are de-duplicated by wrapping them in OurInstance and collecting
    the wrappers in a set.

    @param recid: id of the record to read
    @return: print_rec() output of a record with 001 and the kept 980 fields
    """
    source = get_record(recid)
    unique_wrappers = set()
    for instance in record_get_field_instances(source, '980'):
        if field_get_subfield_instances(instance) != [('a', 'unknown')]:
            unique_wrappers.add(OurInstance(instance))
    new_instances = [wrapper.field for wrapper in unique_wrappers]

    result = {}
    record_add_field(result, '001', controlfield_value=str(recid))
    record_add_fields(result, '980', new_instances)
    return print_rec(result)
def get_rn(revision):
    """
    Collect the report numbers cited by a record revision.

    @param revision: revision id passed to get_marcxml_of_revision_id()
    @return: set of tag_arxiv_more() results, one per 999C5 $r value
    """
    record = create_record(get_marcxml_of_revision_id(revision))[0]
    report_numbers = set()
    for reference in record_get_field_instances(record, tag='999',
                                                ind1='C', ind2='5'):
        report_numbers.update(
            tag_arxiv_more(value)
            for code, value in field_get_subfield_instances(reference)
            if code == 'r')
    return report_numbers
def get_photolab_image_caption(record, imageID):
    """
    Return the caption stored for the given image.

    Looks through the CFG_MA_CAPTION_TAG fields for the first one whose
    CFG_MA_CAPTION_SUBFIELD_ID subfield equals imageID.

    @return: the CFG_MA_CAPTION_SUBFIELD_CONTENT value of that field, or ''
        when the subfield or the field is missing
    """
    for caption_field in record_get_field_instances(record,
                                                    tag=CFG_MA_CAPTION_TAG):
        subfields = dict(caption_field[0])
        if subfields.get(CFG_MA_CAPTION_SUBFIELD_ID, -1) != imageID:
            continue
        return subfields.get(CFG_MA_CAPTION_SUBFIELD_CONTENT, '')
    return ''
def replace_references(recid, uid=None, txt=None, url=None):
    """Build the MARCXML of a cached record with new references.

    The record itself is not updated; the MARCXML of the cached record
    with its 999 fields replaced is returned.

    Parameters:
    * recid: the id of the record
    * uid: user id, passed to get_cache_file_contents()
    * txt: references in text mode; takes precedence over url
    * url: location to extract the references from

    When neither txt nor url is given the references are extracted from
    the record itself. Returns None if no 999C5 field was extracted.
    """
    # Pick the extraction source
    if txt is not None:
        extracted_xml = extract_references_from_string_xml(
            txt, is_only_references=True)
    elif url is not None:
        extracted_xml = extract_references_from_url_xml(url)
    else:
        extracted_xml = extract_references_from_record_xml(recid)
    parsed = create_record(extracted_xml.encode('utf-8'))

    # Only the third item of the cache contents (the record) is needed
    (unused_a, unused_b, record,
     unused_c, unused_d, unused_e, unused_f) = get_cache_file_contents(recid, uid)

    new_references = record_get_field_instances(parsed[0],
                                                tag='999',
                                                ind1='C',
                                                ind2='5')
    extraction_status = record_get_field_instances(parsed[0],
                                                   tag='999',
                                                   ind1='C',
                                                   ind2='6')
    if not new_references:
        return None

    # Swap the 999 fields for the extracted ones
    record_delete_fields(record, '999')
    record_add_fields(record, '999', new_references)
    record_add_fields(record, '999', extraction_status)
    return record_xml_output(record)
def record_find_matching_fields(key, rec, tag="", ind1=" ", ind2=" ", exact_match=False):
    """
    This utility function will look for any fieldvalues containing or equal
    to, if exact match is wanted, given keyword string. The found fields will be
    returned as a list of field instances per tag. The fields to search can be
    narrowed down to tag/indicator level.

    With exact_match=True only fields having a value equal to the key are
    returned; a field that merely contains the key is no longer reported.

    @param key: keyword to search for
    @type key: string

    @param rec: a record structure as returned by bibrecord.create_record()
    @type rec: dict

    @param tag: a 3 characters long string; when empty, all tags of the
                record are searched and ind1/ind2 are not applied
    @type tag: string

    @param ind1: a 1 character long string
    @type ind1: string

    @param ind2: a 1 character long string
    @type ind2: string

    @param exact_match: if True, a value must equal the key to match
    @type exact_match: bool

    @return: a list of found fields in a tuple per tag: (tag, field_instances) where
        field_instances is a list of (Subfields, ind1, ind2, value, field_position_global)
        and subfields is list of (code, value)
    @rtype: list
    """
    if not tag:
        all_field_instances = rec.items()
    else:
        all_field_instances = [
            (tag, record_get_field_instances(rec, tag, ind1, ind2))
        ]
    matching_field_instances = []
    for current_tag, field_instances in all_field_instances:
        found_fields = []
        for field_instance in field_instances:
            # Get values to match: controlfield_value + subfield values
            values_to_match = [field_instance[3]] + \
                              [val for dummy_code, val in field_instance[0]]
            if exact_match:
                # Do not fall back to substring matching when the exact
                # comparison fails.
                matched = key in values_to_match
            else:
                matched = any(key in value for value in values_to_match)
            if matched:
                found_fields.append(field_instance)
        if found_fields:
            matching_field_instances.append((current_tag, found_fields))
    return matching_field_instances
def handle_tags(recid, tags, d):
    """
    Print candidate records built from the given tags of a record.

    Each subfield of each selected field is turned into a one-field record:
    $m and $u subfields go to tag 371__ ($u is renamed $a and a
    $z 'current' is added), 700__ is mapped to 100__, and a first-seen
    100__ $a also gets a $q with the name in "First Last" order. The
    candidate is printed when an unseen email or an author unknown to the
    HepNames collection was found.

    Relies on the module-level list_of_emails, list_of_authors and verbose.

    @param recid: id of the record to read
    @param tags: iterable of 5-character tag strings (tag + ind1 + ind2)
    @param d: extra fields merged into each candidate record, if truthy
    @return: the last candidate record built (an empty dict if none)
    """
    # NOTE(review): block nesting below follows the most plausible reading
    # of the statement order; confirm against the upstream script.
    record = get_record(recid)
    correct_record = {}
    need_email = False
    need_author = False
    for tag in tags:
        # `tag` is reassigned per subfield below; keep the requested one.
        original_tag = tag
        field_instances = \
            record_get_field_instances(record, tag[0:3], tag[3], tag[4])
        correct_subfields = []
        #correct_subfields_aff = []
        for field_instance in field_instances:
            # A fresh candidate record is started for every field instance.
            correct_record = {}
            correct_subfields = []
            for code, value in field_instance[0]:
                if code == 'm' or code == 'u':
                    # Emails and affiliations are stored under 371__.
                    tag = '371__'
                    if code == 'u':
                        code = 'a'
                    if code == 'm' and not value in list_of_emails:
                        list_of_emails.append(value)
                        inHepnames_email = get_hepnames_recid_from_email(value)
                        if verbose:
                            print 'inHepnames_email=', inHepnames_email
                        #if not inHepnames_email:
                        # Holds the email itself (truthy), not a boolean.
                        need_email = value
                else:
                    tag = original_tag
                if tag == '700__' :
                    tag = '100__'
                if code != 'v':
                    # Replaces the list, so each added field carries only
                    # this subfield plus the extras appended below.
                    correct_subfields = [(code, value)]
                if tag == '371__':
                    correct_subfields.append(('z', 'current'))
                if code == 'a' and tag == '100__' and not value in list_of_authors:
                    list_of_authors.append(value)
                    # "Last, First" -> "First Last"
                    nicename = re.sub(r'(.*)\, (.*)',r'\2 \1',value)
                    correct_subfields.append(('q', nicename))
                    search = "find a " + value
                    search = search + " or ea " + value
                    inHepnames_author = \
                        perform_request_search(p=search, cc='HepNames')
                    if verbose:
                        print 'inHepnames_author=', inHepnames_author
                    if not inHepnames_author:
                        need_author = True
                    # Names containing an apostrophe are never flagged.
                    if re.search(r"'",value):
                        need_author = False
                if code == 'i' :
                    need_author = False
                record_add_field(correct_record, tag[0:3], tag[3], tag[4], \
                    subfields=correct_subfields)
            if d:
                correct_record.update(d)
            if need_author or need_email:
                # NOTE(review): inHepnames_author is only bound after an
                # author lookup; if only need_email is set this may raise
                # NameError when verbose is on -- confirm.
                if verbose and inHepnames_author:
                    print "Margaret: This author is already in", \
                        inHepnames_author, need_email
                print print_rec(correct_record)
                need_email = False
                need_author = False
    return correct_record
def process_record(self, record):
    """@see: BaseFieldCommand.process_record"""
    # Without a tag there is nothing to select, so the record is left alone
    if self._tag in ("", None):
        return

    for matched_field in bibrecord.record_get_field_instances(record,
                                                              self._tag,
                                                              self._ind1,
                                                              self._ind2):
        # Index 4 is passed on as the field identifier
        self._apply_subfield_commands_to_field(record, matched_field[4])