def build_record(counts, fields, recid=None, status_code=0):
    """Assemble a BibRecord holding extracted references plus job statistics.

    The record is created for `recid`, `fields` is stored under tag '999',
    and one extra field (tag CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS) is
    added carrying three subfields:

      * CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS: the string
        "status-reportnum-title-author-url-doi-misc"
      * CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME: the current local time,
        formatted as "YYYY-MM-DD HH:MM:SS"
      * CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION: CFG_REFEXTRACT_VERSION

    @param counts: mapping that must provide the keys 'reportnum', 'title',
        'auth_group', 'url', 'doi' and 'misc'; the values are interpolated
        with %s into the statistics string.
    @param fields: value assigned to record['999'] (presumably a list of
        reference fields - confirm against the caller).
    @param recid: record id handed to the BibRecord constructor.
    @param status_code: first component of the statistics string.
    @return: the BibRecord that was built (nothing is printed here).
    """
    record = BibRecord(recid=recid)
    record['999'] = fields

    stats_field = record.add_field(CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS)

    # Positional formatting; the component order is part of the output.
    stats_values = (
        status_code,
        counts['reportnum'],
        counts['title'],
        counts['auth_group'],
        counts['url'],
        counts['doi'],
        counts['misc'],
    )
    stats_str = "%s-%s-%s-%s-%s-%s-%s" % stats_values
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
                             stats_str)

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME,
                             timestamp)

    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
                             CFG_REFEXTRACT_VERSION)
    return record
def create_our_record(recid):
    """Return MARCXML with the 100/700 fields of `recid`, affiliation fixed.

    Every 100__u and 700__u subfield whose value equals 'lisbon, lifep'
    (compared case-insensitively) is rewritten to 'LIP, Lisbon'.  The
    100 and 700 field instances of the stored record (or an empty list when
    the tag is absent) are then copied into a fresh BibRecord for `recid`,
    which is serialised with to_xml().
    """
    source = get_record(recid)

    # Same normalisation for first-author and co-author affiliations.
    for subfield_path in ('100__u', '700__u'):
        for subfield in source.find_subfields(subfield_path):
            if subfield.value.lower() == 'lisbon, lifep':
                subfield.value = 'LIP, Lisbon'

    # A missing tag raises KeyError; treat it as "no instances".
    instances = {}
    for tag in ('100', '700'):
        try:
            instances[tag] = source[tag]
        except KeyError:
            instances[tag] = []

    patched = BibRecord(recid=recid)
    patched['100'] = instances['100']
    patched['700'] = instances['700']
    return patched.to_xml()
def test_add_subfield(self):
    """add_subfield('100__a', ...) on an empty record matches parsed XML."""
    # Adjacent literals concatenate to exactly the original one-line XML.
    marcxml = (
        '<record> <datafield tag="100" ind1=" " ind2=" "> '
        '<subfield code="a">our title</subfield> '
        '</datafield> </record>'
    )
    expected = create_record(marcxml)

    built = BibRecord()
    built.add_subfield('100__a', 'our title')

    self.assertEqual(built, expected)
def test_del_field(self):
    """Deleting '100__' leaves only the subfield added under 101__b."""
    # Reference record: contains nothing but the 101__b subfield.
    expected = BibRecord()
    expected.add_subfield('101__b', 'not title')

    actual = create_record(self.xml)
    actual.add_subfield('101__b', 'not title')
    del actual['100__']

    self.assertEqual(actual, expected)
def append_to_record(rec_id, doi, published_date):
    """
    Build the MARCXML that should be appended to record `rec_id`.

    Depending on what the stored record already contains, the following
    fields are collected in a fresh BibRecord:

      * 0247_ with $2 'DOI' and $a `doi`, when record_has_doi() is false
      * 980__ $a 'Published', when is_marked_published() is false
      * 773__ built from create_pubnote(doi, published_date), when the
        record has no 773__ field or is_pubnote_identical() reports a
        difference
      * 260__ $c `published_date`, when the record has no 260__c subfield

    @param rec_id: id of the record to update
    @param doi: the DOI as a byte string (it is decoded as UTF-8)
    @param published_date: value used for the pubnote and for 260__c
    @return: MARCXML of the record to append, or None when the new
        record's `record` mapping is empty
    """
    record = get_record(recid=rec_id)
    # Holds only the fields to append to the existing record through
    # bibupload; it is created exactly once.
    new_record = BibRecord(rec_id)

    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)

    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    field_773 = record.find_fields('773__')
    new_field_773 = create_pubnote(doi, published_date)
    if not field_773:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, "
               "differs from DOI extract", 3)
    else:
        append_773 = False
        _print("Field 773 already exists, does not "
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    if not record.find_subfields("260__c"):
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    if new_record.record:
        return new_record.to_xml()
    return None
def test_add_subfield2(self):
    """A subfield added to an already attached field shows in the record."""
    expected = create_record(self.xml)

    built = BibRecord()
    attached_field = BibRecordField()
    built['100'] = [attached_field]
    # The field is modified only after it was stored in the record; this
    # ordering is what the test exercises, so it must not be swapped.
    attached_field.add_subfield('a', 'our title')

    self.assertEqual(built, expected)
class APSRecord(object):
    """
    A single record to harvest, wrapping a BibRecord plus harvest details
    (record id, DOI, date and last-modified marker).
    """

    def __init__(self, recid, doi=None, date=None, last_modified=None):
        """
        Store the harvest details and start from an empty BibRecord.

        When `doi` is empty it is looked up via get_doi_from_record(recid).
        A falsy `recid` is passed to BibRecord as None.
        """
        self.recid = recid
        self.doi = doi if doi else get_doi_from_record(self.recid)
        self.date = date
        self.record = BibRecord(recid if recid else None)
        self.last_modified = last_modified

    def _stamp_recid(self):
        """Overwrite controlfield 001 with our record id, if we have one."""
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_metadata(self, marcxml_file):
        """
        Replace the wrapped record with the result of
        create_records_from_file(marcxml_file); a falsy argument is a no-op.

        NOTE(review): older documentation said DTD definitions are removed
        and BibConvert is used; that is not visible here and would have to
        happen inside create_records_from_file - confirm.
        """
        if not marcxml_file:
            return
        self.record = create_records_from_file(marcxml_file)
        self._stamp_recid()

    def add_metadata_by_string(self, marcxml_text):
        """
        Replace the wrapped record with the result of
        create_records_from_string(marcxml_text); a falsy argument is a
        no-op.
        """
        if not marcxml_text:
            return
        self.record = create_records_from_string(marcxml_text)
        self._stamp_recid()

    def add_fft(self, fulltext_file, hidden=True):
        """
        Append an FFT__ field for the given fulltext.

        $a is `fulltext_file`.  Hidden files get $t
        CFG_APSHARVEST_FFT_DOCTYPE and $o 'HIDDEN'; otherwise $t is
        'INSPIRE-PUBLIC' and no $o is written.
        """
        fft_field = self.record.add_field("FFT__")
        fft_field.add_subfield('a', fulltext_file)
        if not hidden:
            fft_field.add_subfield('t', "INSPIRE-PUBLIC")
            return
        fft_field.add_subfield('t', CFG_APSHARVEST_FFT_DOCTYPE)
        fft_field.add_subfield('o', "HIDDEN")

    def to_xml(self):
        """Return the wrapped record serialised through its to_xml()."""
        return self.record.to_xml()
class APSRecord(object):
    """
    A single record to harvest, wrapping a BibRecord plus harvest details
    (record id, DOI, date and last-modified marker).
    """

    def __init__(self, recid=None, doi=None, date=None, last_modified=None):
        """
        Store the harvest details and start from an empty BibRecord.

        When `doi` is empty it is looked up via get_doi_from_record(recid),
        which is also called when `recid` itself is None.  A falsy `recid`
        is passed to BibRecord as None.
        """
        self.recid = recid
        self.doi = doi if doi else get_doi_from_record(self.recid)
        self.date = date
        self.record = BibRecord(recid if recid else None)
        self.last_modified = last_modified

    def _stamp_recid(self):
        """Overwrite controlfield 001 with our record id, if we have one."""
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_metadata(self, marcxml_file):
        """
        Replace the wrapped record with the result of
        create_records_from_file(marcxml_file); a falsy argument is a no-op.

        NOTE(review): older documentation said DTD definitions are removed
        and BibConvert is used; that is not visible here and would have to
        happen inside create_records_from_file - confirm.
        """
        if not marcxml_file:
            return
        self.record = create_records_from_file(marcxml_file)
        self._stamp_recid()

    def add_metadata_by_string(self, marcxml_text):
        """
        Replace the wrapped record with the result of
        create_records_from_string(marcxml_text); a falsy argument is a
        no-op.
        """
        if not marcxml_text:
            return
        self.record = create_records_from_string(marcxml_text)
        self._stamp_recid()

    def add_fft(self, fulltext_file, hidden=True):
        """
        Append an FFT__ field for the given fulltext.

        $a is `fulltext_file`.  Hidden files get $t
        CFG_APSHARVEST_FFT_DOCTYPE and $o 'HIDDEN'; otherwise $t is
        'INSPIRE-PUBLIC' and no $o is written.
        """
        fft_field = self.record.add_field("FFT__")
        fft_field.add_subfield('a', fulltext_file)
        if not hidden:
            fft_field.add_subfield('t', "INSPIRE-PUBLIC")
            return
        fft_field.add_subfield('t', CFG_APSHARVEST_FFT_DOCTYPE)
        fft_field.add_subfield('o', "HIDDEN")

    def to_xml(self):
        """Return the wrapped record serialised through its to_xml()."""
        return self.record.to_xml()
def test_hash(self):
    """Fields survive a round trip through set(), so hashing is usable."""
    for _unused_key, reference in self.records_cache.iteritems():
        # Rebuild the record tag by tag from de-duplicated field lists.
        rebuilt = BibRecord()
        for tag, fields in reference.record.iteritems():
            unique_fields = set(fields)
            rebuilt[tag] = list(unique_fields)
            self.assertEqual(set(rebuilt[tag]), set(reference[tag]))
        self.assertEqual(rebuilt, reference)
def create_our_record(recid, bibupload, bibupload2):
    """Queue uploads that move PACS values of `recid` from 650 to 084.

    For every 650 field whose $2 values include 'PACS', one new 084 field
    ($2 'PACS', $a value) is created per $a value and the 650 field is
    remembered for removal.  If nothing was created the function returns
    None without touching the upload objects.

    Otherwise, 084 fields with indicators '1'/'7' and $2 'PACS' get blank
    indicators, and two records are serialised:

      * a record holding the set of 084 fields, passed to bibupload.add()
      * a record holding the 650 fields to remove, passed to
        bibupload2.add()

    NOTE(review): a record without any 650 field raises KeyError here,
    unlike the guarded 084 lookup - presumably callers only pass records
    that have 650 fields; confirm.
    """
    source = get_record(recid)

    try:
        pacs_084 = source['084']
    except KeyError:
        pacs_084 = []

    stale_650 = []
    modified = False
    for field in source['650']:
        if 'PACS' not in field.get_subfield_values('2'):
            continue
        # Expect exactly one $2 and otherwise only $a subfields.
        assert len(field.subfields) >= 2
        assert len(field.subfields) - 1 == len(field.get_subfield_values('a'))
        stale_650.append(field)
        for value in field.get_subfield_values('a'):
            replacement = BibRecordField(subfields=[
                BibRecordSubField(code='2', value='PACS'),
                BibRecordSubField(code='a', value=value),
            ])
            pacs_084.append(replacement)
            modified = True

    if not modified:
        return None

    # Remove wrong indicator
    for field in list(pacs_084):
        has_wrong_indicators = field.ind1 == '1' and field.ind2 == '7'
        if has_wrong_indicators and 'PACS' in field.get_subfield_values('2'):
            field.ind1 = ' '
            field.ind2 = ' '

    upload_084 = BibRecord(recid=recid)
    # A set, as in the original: duplicates collapse via field hashing.
    upload_084['084'] = set(pacs_084)
    bibupload.add(upload_084.to_xml())

    if stale_650:
        upload_650 = BibRecord(recid=recid)
        upload_650['650'] = stale_650
        bibupload2.add(upload_650.to_xml())
def append_to_record(rec_id, doi, published_date):
    """
    Build the MARCXML that should be appended to record `rec_id`.

    Depending on what the stored record already contains, the following
    fields are collected in a fresh BibRecord:

      * 0247_ with $2 'DOI' and $a `doi`, when record_has_doi() is false
      * 980__ $a 'Published', when is_marked_published() is false
      * 773__ built from create_pubnote(doi, published_date), when the
        record has no 773__ field or is_pubnote_identical() reports a
        difference
      * 260__ $c `published_date`, when the record has no 260__c subfield

    @param rec_id: id of the record to update
    @param doi: the DOI as a byte string (it is decoded as UTF-8)
    @param published_date: value used for the pubnote and for 260__c
    @return: MARCXML of the record to append, or None when the new
        record's `record` mapping is empty
    """
    record = get_record(recid=rec_id)
    # Holds only the fields to append to the existing record through
    # bibupload; it is created exactly once.
    new_record = BibRecord(rec_id)

    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)

    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    field_773 = record.find_fields('773__')
    new_field_773 = create_pubnote(doi, published_date)
    if not field_773:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, "
               "differs from DOI extract", 3)
    else:
        append_773 = False
        _print("Field 773 already exists, does not "
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    if not record.find_subfields("260__c"):
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    if new_record.record:
        return new_record.to_xml()
    return None
def append_doi(recID, doi):
    """
    Build the MARCXML that adds `doi` to record `recID`.

    When the record has no 0247_a subfield, a record containing a single
    0247_ field ($a `doi` decoded as UTF-8, $2 'DOI') is returned as
    MARCXML and a success note is appended to the module-level `messages`.

    When a 0247_a subfield already exists, nothing is built: a note goes to
    `messages`, and if the first existing value differs from `doi` an entry
    is also appended to the module-level `errors`.

    Any exception raised while doing so is printed with its traceback and
    recorded in `errors`.

    @param recID: id of the record to update
    @param doi: the DOI as a byte string
    @return: MARCXML string when a DOI field was created, otherwise None
    """
    record = get_record(recid=recID)
    try:
        # make sure that there is no DOI for this record
        existing_dois = record.find_subfields('0247_a')
        if existing_dois:
            messages.append('Record %s already has a doi' % recID)
            if existing_dois[0].value != doi:
                errors.append('DOI of %s record is different than the new doi (%s)!' % (recID, doi))
        else:
            # create new record with only 0247 field, that we will append
            # to the existing record with bibupload function
            new_record = BibRecord(recID)
            new_field = new_record.add_field('0247_')
            new_field.add_subfield('a', doi.decode('utf-8'))
            new_field.add_subfield('2', 'DOI')

            messages.append('Successfully inserted the doi: ' + doi +
                            ' to the record ' + str(recID))
            return new_record.to_xml()
    except Exception as e:
        # Best-effort boundary: log the failure and keep going.
        traceback.print_exc()
        errors.append('Unknown error: ' + repr(e))
    return None
def test_set_record(self):
    """Assigning one field list to a tag gives a record of length 1."""
    rec = BibRecord()
    rec['100'] = [BibRecordField()]
    self.assertEqual(len(rec), 1)
def __init__(self, recid=None, doi=None, date=None, last_modified=None):
    """
    Store the harvest details and start from an empty BibRecord.

    When `doi` is empty it is looked up via get_doi_from_record(recid),
    which is also called when `recid` itself is None.  A falsy `recid` is
    passed to BibRecord as None.
    """
    self.recid = recid
    if doi:
        self.doi = doi
    else:
        self.doi = get_doi_from_record(self.recid)
    self.date = date
    self.record = BibRecord(recid if recid else None)
    self.last_modified = last_modified
def test_simple(self):
    """convert_journals rewrites 773__p and 999C5s but leaves 100__a."""
    source = BibRecord()
    for path, value in (('100__a', 'Test Journal Name'),
                        ('773__p', 'Test Journal Name'),
                        ('999C5s', 'Test Journal Name,100,10')):
        source.add_subfield(path, value)

    converted = convert_journals(self.kb, source)

    expected = BibRecord()
    for path, value in (('100__a', 'Test Journal Name'),
                        ('773__p', 'Converted'),
                        ('999C5s', 'Converted,100,10')):
        expected.add_subfield(path, value)

    self.assertEqual(expected, converted)
def test_add_field(self):
    """A field made by add_field is reachable by tag and can be filled."""
    expected = create_record(self.xml)

    built = BibRecord()
    built.add_field('100__')
    # Fetch the field back through item access rather than using the
    # return value of add_field; the lookup is part of what is tested.
    first_100 = built['100__'][0]
    first_100.add_subfield('a', 'our title')

    self.assertEqual(built, expected)
def __init__(self, recid, doi=None, date=None, last_modified=None):
    """
    Store the harvest details and start from an empty BibRecord.

    When `doi` is empty it is looked up via get_doi_from_record(recid).
    A falsy `recid` is passed to BibRecord as None.
    """
    self.recid = recid
    if doi:
        self.doi = doi
    else:
        self.doi = get_doi_from_record(self.recid)
    self.date = date
    self.record = BibRecord(recid if recid else None)
    self.last_modified = last_modified
def test_add_subfield(self):
    """add_subfield('100__a', ...) reproduces the fixture in self.xml."""
    built = BibRecord()
    built.add_subfield('100__a', 'our title')

    expected = create_record(self.xml)
    self.assertEqual(built, expected)