def test_tokenize_marc21_one(self):
    """Tokenize a record that carries an author (100) and a title (245).

    The expected token list holds the lower-cased author and title words
    plus the token 'book'.
    """
    marc_record = pymarc.Record()
    # Overwrite leader position 6 with 'a'; presumably this is what makes the
    # classifier emit the 'book' token -- confirm against the classifier.
    marc_record.leader = marc_record.leader[0:6] + 'a' + marc_record.leader[7:]
    marc_record.add_field(
        pymarc.Field(tag='100',
                     indicators=['1', ' '],
                     subfields=['a', 'Naslund, Sena Jeter.']))
    marc_record.add_field(
        pymarc.Field(tag='245',
                     indicators=['1', '0'],
                     subfields=['a', "Ahab's wife, or, The star-gazer :",
                                'b', "a novel /"]))
    # The tokens are sorted before comparing, so their order does not matter.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(
        sorted(self.classifier.__tokenize_marc21__(marc_record)),
        ['ahab', 'book', 'gazer', 'jeter', 'naslund', 'novel', 'sena',
         'star', 'wife'])
def test_writing_1_record(self):
    """Write one record through XMLWriter and compare the serialized bytes."""
    # Whitespace inside this literal is significant: each element sits on its
    # own line so that dedent + newline removal yields one unbroken string.
    # NOTE(review): the leader is spelled out as pymarc's default 24-character
    # leader (10 blanks, '22', 8 blanks, '4500') -- confirm.
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        <record>
        <leader>          22        4500</leader>
        <datafield ind1="0" ind2="0" tag="100">
        <subfield code="a">me</subfield>
        </datafield>
        <datafield ind1="0" ind2="0" tag="245">
        <subfield code="a">Foo /</subfield>
        <subfield code="c">by me.</subfield>
        </datafield>
        </record>
        </collection>
    """
    # Drop the leading newline, strip the common indent, then join the lines.
    expected = textwrap.dedent(expected[1:]).replace('\n', '')
    # Where str is text rather than bytes, encode to match BytesIO's content.
    if str != binary_type:
        expected = expected.encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field('100', ['0', '0'], ['a', u('me')]))
        record.add_field(
            pymarc.Field(
                '245', ['0', '0'],
                ['a', u('Foo /'), 'c', u('by me.')]))
        writer.write(record)
        # Keep the handle open so its value can still be read below.
        writer.close(close_fh=False)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def test_relator_terms_to_codes():
    """Check how ``ph.relator_terms_to_codes`` rewrites $e / $4 of a 100 field."""

    def author_field(*relator_subfields):
        # 100 field with $a preset; the $e / $4 pairs under test are appended.
        return pymarc.Field(
            tag="100",
            indicators=["1", " "],
            subfields=["a", "Mustermann, Martin"] + list(relator_subfields))

    # An unknown term in $e leaves the field untouched.
    unknown_term = author_field("e", "10445599", "4", "aut")
    ph.relator_terms_to_codes(unknown_term)
    assert unknown_term.subfields == [
        "a", "Mustermann, Martin", "e", "10445599", "4", "aut"
    ]

    # A known term in $e is replaced by its code in $4.
    known_term = author_field("e", "author")
    ph.relator_terms_to_codes(known_term)
    assert known_term.subfields == ["a", "Mustermann, Martin", "4", "aut"]

    # A known term in $e whose code is already in $4: $e goes away and the
    # code is not duplicated.
    term_and_code = author_field("e", "author", "4", "aut")
    ph.relator_terms_to_codes(term_and_code)
    assert term_and_code.subfields == ["a", "Mustermann, Martin", "4", "aut"]
def test_marc_clean_subfields():
    """Exercise ``marc_clean_subfields`` on a marcx record and a pymarc record.

    With ``inplace=False`` the cleaned subfield list is returned and the field
    is left alone; with ``inplace=True`` the field is modified and ``None`` is
    returned.
    """
    # --- marcx record ---
    marcx_record = marcx.Record()
    marcx_record.add("001", data="1234")
    marcx_record.add("245", a="", b="ok")
    assert marcx_record.strict == True

    # marcx itself refuses to add empty subfield values
    # (https://git.io/fjIWU), so the field is already clean.
    cleaned = marc_clean_subfields(marcx_record["245"], inplace=False)
    assert cleaned == ['b', 'ok']
    assert marcx_record["245"].subfields == ['b', 'ok']

    outcome = marc_clean_subfields(marcx_record["245"], inplace=True)
    assert outcome is None
    assert marcx_record["245"].subfields == ['b', 'ok']

    # --- plain pymarc record ---
    plain_record = pymarc.Record()
    plain_record.add_field(pymarc.Field(tag='001', data='1234'))
    title_field = pymarc.Field(tag='245',
                               indicators=['0', '1'],
                               subfields=['a', '', 'b', 'ok'])
    plain_record.add_field(title_field)
    assert len(plain_record.get_fields()) == 2

    # Not in place: cleaned copy returned, the empty $a is still in the field.
    cleaned = marc_clean_subfields(plain_record["245"], inplace=False)
    assert cleaned == ['b', 'ok']
    assert plain_record["245"].subfields == ['a', '', 'b', 'ok']

    # In place: nothing returned, the empty $a is gone from the field.
    outcome = marc_clean_subfields(plain_record["245"], inplace=True)
    assert outcome is None
    assert plain_record["245"].subfields == ['b', 'ok']
def test_writing_1_record(self):
    """Write one record through XMLWriter and compare the serialized bytes."""
    # Whitespace inside this literal is significant: each element sits on its
    # own line so that dedent + newline removal yields one unbroken string.
    # NOTE(review): the leader is spelled out as pymarc's default 24-character
    # leader (10 blanks, '22', 8 blanks, '4500') -- confirm.
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        <record>
        <leader>          22        4500</leader>
        <datafield ind1="0" ind2="0" tag="100">
        <subfield code="a">me</subfield>
        </datafield>
        <datafield ind1="0" ind2="0" tag="245">
        <subfield code="a">Foo /</subfield>
        <subfield code="c">by me.</subfield>
        </datafield>
        </record>
        </collection>
    """
    # Drop the leading newline, strip the common indent, join the lines and
    # encode to bytes so the value is comparable with BytesIO's content.
    expected = textwrap.dedent(expected[1:]).replace("\n", "").encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field("100", ["0", "0"], ["a", "me"]))
        record.add_field(
            pymarc.Field("245", ["0", "0"], ["a", "Foo /", "c", "by me."]))
        writer.write(record)
        # Keep the handle open so its value can still be read below.
        writer.close(close_fh=False)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def add_license(record, pandata):
    """Add funding (536) and terms-of-use (540) fields to *record*.

    Nothing is added when ``pandata.rights`` is falsy. The 540 $a is the label
    found for the rights value in ``licenses.CHOICES``, falling back to the
    raw value. A $u is appended when a rights URL is available, either from
    ``pandata.rights_url`` or from ``licenses.GRANTS``.
    """
    if not pandata.rights:
        return

    # 536: funding information
    funding_field = pymarc.Field(
        tag='536',
        indicators=[' ', ' '],
        subfields=['a', pandata.funding_info],
    )
    record.add_ordered_field(funding_field)

    # 540: terms governing use
    license_label = dict(licenses.CHOICES).get(pandata.rights, pandata.rights)
    terms_field = pymarc.Field(
        tag='540',
        indicators=[' ', ' '],
        subfields=['a', license_label],
    )

    # Prefer the explicit URL; otherwise look one up for this rights value.
    if pandata.rights_url:
        rights_url = pandata.rights_url
    else:
        rights_url = dict(licenses.GRANTS).get(pandata.rights, None)
    if rights_url:
        terms_field.add_subfield('u', rights_url)

    record.add_ordered_field(terms_field)
def add_stuff(record):
    """Add originator, timestamp and online-resource control fields.

    Fields are inserted with ``add_ordered_field``:
      003 - the literal 'GITenberg'
      005 - ``datetime.now()`` formatted as yyyymmddhhmmss plus '.0'
      006 - fixed-length data for an online document
      007 - 'cr'
    """
    # Field indicating the record originator.
    record.add_ordered_field(pymarc.Field(tag='003', data='GITenberg'))

    # Timestamp of this update.
    datestamp = datetime.now().strftime('%Y%m%d%H%M%S') + '.0'
    record.add_ordered_field(pymarc.Field(tag='005', data=datestamp))

    # 006/007 describe an online resource. 006 is a fixed-length field of 18
    # character positions: 'm' at 00, 'o' at 06, 'd' at 09, blanks elsewhere.
    # A shorter value shifts those codes out of their positions.
    record.add_ordered_field(
        pymarc.Field(tag='006', data='m     o  d        '))
    record.add_ordered_field(pymarc.Field(tag='007', data='cr'))
def test_from_record(self):
    """Convert a pymarc record with ``marcx.Record.from_record``.

    The plain pymarc record has no ``itervalues``; the converted object does,
    while sharing the same instance dictionary contents.
    """
    record = pymarc.Record()
    record.add_field(pymarc.Field('001', data='123'))
    record.add_field(pymarc.Field('020', [' ', ' '], subfields=['a', '123']))

    # itervalues is absent from the plain pymarc record.
    with self.assertRaises(AttributeError):
        record.itervalues('020')

    obj = marcx.Record.from_record(record)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(obj.__class__.__name__, 'Record')
    self.assertEqual(obj.__dict__, record.__dict__)
    self.assertEqual(list(obj.itervalues('020')), ['123'])
def test_marc_clean_record():
    """``marc_clean_record`` removes the empty $a from the record's 245."""
    dirty = pymarc.Record()
    dirty.add_field(pymarc.Field(tag='001', data='1234'))
    title_field = pymarc.Field(tag='245',
                               indicators=['0', '1'],
                               subfields=['a', '', 'b', 'ok'])
    dirty.add_field(title_field)

    # Precondition: both fields present, empty $a still there.
    assert len(dirty.get_fields()) == 2
    assert dirty["245"].subfields == ['a', '', 'b', 'ok']

    marc_clean_record(dirty)

    assert dirty["245"].subfields == ['b', 'ok']
def marcwriter(file):
    """Write one brief MARC record per row of *file* to ``holdings<date>.mrc``.

    Any regular file in the current directory whose name starts with
    'holdings' is deleted first. Each row contributes a 001 field built from
    the digits of ``row[1]`` (the OCLC number) and one 020 $a per
    '|'-separated value in ``row[2]`` (the ISBNs).

    Args:
        file: iterable of indexable rows, e.g. a ``csv.reader``.
    """
    # Remove output left over from earlier runs.
    for fname in os.listdir("."):
        if os.path.isfile(fname) and fname.startswith("holdings"):
            os.remove(fname)

    # 'wb' because as_marc() output is written as binary. The context manager
    # closes the file even when a row raises part-way through.
    with open('holdings{}.mrc'.format(date.today()), 'wb') as outputfile:
        for row in file:
            item_load = pymarc.Record(to_unicode=True, force_utf8=True)

            # Keep only the digits of the OCLC number.
            ocn = re.sub("[^0-9]", "", row[1])
            item_load.add_ordered_field(pymarc.Field(tag='001', data=ocn))

            for isbn in row[2].split('|'):
                # An empty value ends the list (also covers an empty column).
                if isbn == '':
                    break
                item_load.add_ordered_field(
                    pymarc.Field(tag='020',
                                 indicators=[' ', ' '],
                                 subfields=['a', isbn]))

            # To also emit 024 fields, split row[3] on '|' the same way and
            # add pymarc.Field(tag='024', indicators=['3', ' '],
            # subfields=['a', upc]) for each value.

            outputfile.write(item_load.as_marc())
def df_to_mrc(df, field_delimiter, path_out, txt_error_file):
    """Serialize each DataFrame row as a binary MARC record.

    Every row is one record: column 'LDR' holds the leader, and every other
    column name is a MARC tag. Whitespace-only cells are treated as missing.
    A cell may hold several occurrences of a field separated by
    *field_delimiter*. For tags below 10 the cell is control-field data. For
    other tags each occurrence is the indicator characters followed by
    ``$<code><value>`` groups.

    Rows that raise ``ValueError`` while being converted are not written to
    *path_out*; their remaining contents are dumped to *txt_error_file*.

    Args:
        df: pandas DataFrame laid out as described above.
        field_delimiter: separator between repeated occurrences in one cell.
        path_out: path of the binary MARC file to write.
        txt_error_file: path of the UTF-8 text file listing failed rows.
    """
    # Raw string: '\$' in a normal literal is an invalid escape sequence.
    # The capturing group keeps each subfield code in the split result.
    subfield_splitter = re.compile(r'\$(.)')

    mrc_errors = []
    df = df.replace(r'^\s*$', np.nan, regex=True)
    list_of_dicts = df.to_dict('records')

    # Both files are closed by the context manager even if a row raises
    # something other than ValueError.
    with open(path_out, 'wb') as outputfile, \
            io.open(txt_error_file, 'wt', encoding='UTF-8') as errorfile:
        for record in tqdm(list_of_dicts, total=len(list_of_dicts)):
            record = {k: v for k, v in record.items() if pd.notnull(v)}
            try:
                pymarc_record = pymarc.Record(to_unicode=True,
                                              force_utf8=True,
                                              leader=record['LDR'])
                del record['LDR']
                for tag, cell in record.items():
                    occurrences = cell.split(field_delimiter)
                    if int(tag) < 10:
                        # Control field: the whole cell is the data.
                        pymarc_record.add_ordered_field(
                            pymarc.Field(tag=tag, data=''.join(occurrences)))
                        continue
                    # Data field: one pymarc field per occurrence. A single
                    # occurrence needs no special case.
                    for occurrence in occurrences:
                        parts = subfield_splitter.split(occurrence)
                        pymarc_record.add_ordered_field(
                            pymarc.Field(tag=tag,
                                         indicators=list(parts[0]),
                                         subfields=parts[1:]))
                outputfile.write(pymarc_record.as_marc())
            except ValueError:
                mrc_errors.append(record)

        for element in mrc_errors:
            errorfile.write(str(element) + '\n\n')
def setUp(self):
    """Open the MARC test file and build a one-field in-memory record."""
    data_file = open('test/test.dat', 'rb')
    # Close the data file when the test ends, pass or fail. Closing twice is
    # harmless should a tearDown also close it.
    # NOTE(review): assumes the enclosing class is a unittest.TestCase.
    self.addCleanup(data_file.close)
    self.reader = pymarc.MARCReader(data_file)

    self._record = pymarc.Record()
    field = pymarc.Field(tag='245',
                         indicators=['1', '0'],
                         subfields=['a', 'Python', 'c', 'Guido'])
    self._record.add_field(field)
def check001(record):
    """Return *record*, first giving it a 001 field if it has none.

    The generated control number is six distinct random digits joined into a
    string.
    """
    if record.get_fields('001'):
        return record

    digits = random.sample(range(10), 6)
    control_number = ''.join(str(digit) for digit in digits)
    record.add_field(pymarc.Field(tag='001', data=control_number))
    return record
def setUp(self):
    """Open the MARC test file and build a one-field in-memory record."""
    data_file = open("test/test.dat", "rb")
    # Close the data file when the test ends, pass or fail. Closing twice is
    # harmless should a tearDown also close it.
    # NOTE(review): assumes the enclosing class is a unittest.TestCase.
    self.addCleanup(data_file.close)
    self.reader = pymarc.MARCReader(data_file)

    self._record = pymarc.Record()
    field = pymarc.Field(tag="245",
                         indicators=["1", "0"],
                         subfields=["a", "Python", "c", "Guido"])
    self._record.add_field(field)
def add_stuff(record):
    """Add originator, timestamp and online-resource control fields.

    Fields are inserted with ``add_ordered_field``:
      003 - the literal 'UnglueIt'
      005 - ``datetime.now()`` formatted as yyyymmddhhmmss plus '.0'
      006 - fixed-length data for an online document
      007 - 'cr'
    """
    # Field indicating the record originator.
    field003 = pymarc.Field(tag='003', data='UnglueIt')
    record.add_ordered_field(field003)

    # Timestamp of this update.
    datestamp = datetime.now().strftime('%Y%m%d%H%M%S') + '.0'
    field005 = pymarc.Field(tag='005', data=datestamp)
    record.add_ordered_field(field005)

    # 006/007 describe an online resource (008 is not touched here). 006 is a
    # fixed-length field of 18 character positions: 'm' at 00, 'o' at 06,
    # 'd' at 09, blanks elsewhere. A shorter value shifts those codes out of
    # their positions.
    field006 = pymarc.Field(tag='006', data='m     o  d        ')
    record.add_ordered_field(field006)
    field007 = pymarc.Field(tag='007', data='cr')
    record.add_ordered_field(field007)
def country_044_from_008(record):
    """Add a field 044##$$c with the ISO 3166-Codes derived from 008/15-17.

    All codes for USA, Canada and Great Britain are normalized to XD-US, XD-CA
    and XA-GB.

    If the record has no 044, one is created. If an existing 044 holds the
    code without its continental prefix, that value is rewritten to the
    prefixed code. If the prefixed code is already present, nothing changes,
    so calling this repeatedly does not pile up duplicate $$c subfields.
    Codes missing from ``country_codes_marc2iso`` are ignored.
    """
    country008 = record["008"].data[15:18].rstrip()
    if country008 not in country_codes_marc2iso:
        return
    country044 = country_codes_marc2iso[country008]
    if country044 is None:
        return

    field044 = record["044"]
    if not field044:
        record.add_ordered_field(
            pymarc.Field(tag="044",
                         indicators=[" ", " "],
                         subfields=["c", country044]))
        return

    if country044 in field044.subfields:
        # Already recorded with its prefix. The previous code fell through to
        # add_subfield here whenever the bare code was absent, which
        # duplicated the value.
        return

    # Code without the continental prefix, e.g. 'US' for 'XD-US'.
    bare_code = country044[3:]
    if bare_code in field044.subfields:
        # Upgrade the existing bare code to the prefixed form.
        field044.subfields = [
            subfield.replace(bare_code, country044)
            for subfield in field044.subfields
        ]
    else:
        field044.add_subfield("c", country044)
def csvmarcwriter(file):
    """Convert the CSV at path *file* into brief MARC records in ``writer.mrc``.

    The first CSV row is skipped as a header. For every other row:
      column 0 - OCLC number, reduced to its digits, written to 001
      column 1 - barcode, written to 949 $a
      column 2 - title, written to 974 $a together with $9 'LOCAL'

    Args:
        file: path of the CSV file to read.
    """
    # Read the whole CSV up front so the input is closed before writing.
    with open(file) as fh:
        itemlist = list(csv.reader(fh))

    # 'wb' because as_marc() output is written as binary. The context manager
    # closes the file even when a malformed row raises.
    with open('writer.mrc', 'wb') as outputfile:
        for row in itemlist[1:]:
            item_load = pymarc.Record(to_unicode=True, force_utf8=True)

            ocn = row[0]
            barcode = row[1]
            ltitle = row[2]

            # Keep only the digits of the OCLC number.
            ocn = re.sub("[^0-9]", "", ocn)

            field_001 = pymarc.Field(tag='001', data=ocn)
            field_974 = pymarc.Field(
                tag='974',
                indicators=[' ', ' '],
                subfields=['a', ltitle, '9', "LOCAL"],
            )
            field_949 = pymarc.Field(
                tag='949',
                indicators=[' ', ' '],
                subfields=['a', barcode],
            )

            item_load.add_ordered_field(field_001)
            item_load.add_ordered_field(field_974)
            item_load.add_ordered_field(field_949)

            outputfile.write(item_load.as_marc())
def test_encoding(self):
    """Round-trip a record through ``record_to_xml(encoding='utf-8')``.

    The record has one field without diacritics and one with; after
    serializing and parsing back, leader and fields must be unchanged.
    """
    original = pymarc.Record()
    # Plain ASCII title.
    ascii_title = pymarc.Field(
        tag='245',
        indicators=[' ', ' '],
        subfields=[
            'a',
            'Report of the Committee on the Peaceful Uses of Outer Space'
        ])
    original.add_field(ascii_title)
    # Title containing diacritics.
    accented_title = pymarc.Field(
        tag='246',
        indicators=[' ', ' '],
        subfields=[
            'a',
            "Rapport du Comité des utilisations pacifiques de l'espace extra-atmosphérique"
        ])
    original.add_field(accented_title)

    # Serialize with an explicit encoding, then parse the result back.
    xml_bytes = pymarc.marcxml.record_to_xml(original, encoding='utf-8')
    reparsed = pymarc.parse_xml_to_array(six.BytesIO(xml_bytes))[0]

    # If the records match, the encoding parameter of record_to_xml has not
    # broken the basic round trip.
    self.assertEqual(original.leader, reparsed.leader)

    fields_before = original.get_fields()
    fields_after = reparsed.get_fields()
    self.assertEqual(len(fields_before), len(fields_after))

    for before, after in zip(fields_before, fields_after):
        self.assertEqual(before.tag, after.tag)
        if before.is_control_field():
            self.assertEqual(before.data, after.data)
        else:
            self.assertEqual(before.get_subfields(), after.get_subfields())
            self.assertEqual(before.indicators, after.indicators)
def GetMarcField(self):
    """Return the pymarc field(s) for this tag as a list.

    Repeatable and holding tags yield one field per entry of
    ``self.ListOfSubFields``; every other tag yields a single field built
    from ``self.SubFields``.

    Raises:
        BuilderException: a subfield list has an odd number of items, i.e.
            it is not made of code/value pairs.
    """
    if isRepeatableField(self.Tag) or isHoldingField(self.Tag):
        subfield_groups = self.ListOfSubFields
    else:
        subfield_groups = [self.SubFields]

    built_fields = []
    for group in subfield_groups:
        if len(group) % 2 != 0:
            raise BuilderException(
                "item_%s : tag_%s : Subfields list must contain pairs of subfield and it's value: %s"
                % (self.ItemID, self.Tag, str(group)))
        built_fields.append(pymarc.Field(self.Tag, self.Indicators, group))
    return built_fields
def test_writing_1_record(self):
    """Write one record through JSONWriter and compare the parsed JSON."""
    # Both sides are parsed with json.loads, so only the string values in
    # this literal are whitespace-sensitive.
    # NOTE(review): the leader is spelled out as pymarc's default 24-character
    # leader (10 blanks, '22', 8 blanks, '4500') -- confirm.
    expected = json.loads(r"""
        [
            {
                "leader" : "          22        4500",
                "fields" : [
                    {
                        "100": {
                            "ind1": "0",
                            "ind2": "0",
                            "subfields": [
                                { "a": "me" }
                            ]
                        }
                    },
                    {
                        "245": {
                            "ind1": "0",
                            "ind2": "0",
                            "subfields": [
                                { "a": "Foo /" },
                                { "c": "by me." }
                            ]
                        }
                    }
                ]
            }
        ]
    """)
    file_handle = StringIO()
    try:
        writer = pymarc.JSONWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field('100', ['0', '0'], ['a', u('me')]))
        record.add_field(
            pymarc.Field(
                '245', ['0', '0'],
                ['a', u('Foo /'), 'c', u('by me.')]))
        writer.write(record)
        # Keep the handle open so its value can still be read below.
        writer.close(close_fh=False)
        actual = json.loads(file_handle.getvalue())
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(actual, expected)
    finally:
        file_handle.close()
def test_writing_1_record(self):
    """Write one record through TextWriter and compare the text output."""
    # Whitespace inside this literal is significant: one line per field, and
    # dedent strips only the common indentation.
    # NOTE(review): the spacing is pymarc's text layout as I understand it:
    # '=TAG', two blanks, then the content, where the =LDR content is the
    # default 24-character leader -- confirm.
    expected = r"""
        =LDR            22        4500
        =100  00$ame
        =245  00$aFoo /$cby me.
    """
    # Drop the leading newline and strip the common indentation.
    expected = textwrap.dedent(expected[1:])
    file_handle = StringIO()
    try:
        writer = pymarc.TextWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field("100", ["0", "0"], ["a", "me"]))
        record.add_field(
            pymarc.Field("245", ["0", "0"], ["a", "Foo /", "c", "by me."]))
        writer.write(record)
        # Keep the handle open so its value can still be read below.
        writer.close(close_fh=False)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def xref_rpt_parse(xreffile):
    """Turn a tab-separated cross-reference report into brief MARC records.

    Each usable line supplies an MMS ID (column 0) and a new OCLC number
    (column 1). For every such line a record with 001 = MMS ID,
    035 $a = '(OCoLC)' + number and a placeholder 245 $a 'Title' is written to
    ``upd_xref_<today>.mrc``, and the MMS ID is appended to
    ``upd_mmsids_<today>.txt`` below an 'MMS ID' header line. Lines with
    fewer than two columns are skipped.

    Args:
        xreffile: path of the report to read.

    Returns:
        Number of records written.
    """
    today = str(date.today())

    # Read and split the report, closing the input before any output is made.
    with open(xreffile, 'r') as report:
        xref_parsed = [line.split('\t') for line in report.read().split('\n')]

    rec_count = 0
    with open('upd_xref_' + today + '.mrc', 'wb') as processed, \
            open('upd_mmsids_' + today + '.txt', 'w') as mmsid_file:
        mmsid_file.write('MMS ID\n')
        for row in xref_parsed:
            # Blank or malformed line: nothing to build a record from.
            if len(row) < 2:
                continue
            mmsid, newocn = row[0], row[1]

            brief_record = pymarc.Record(to_unicode=True, force_utf8=True)
            brief_record.add_ordered_field(
                pymarc.Field(tag='001', data=mmsid))
            brief_record.add_ordered_field(
                pymarc.Field(tag='035',
                             indicators=[' ', ' '],
                             subfields=['a', '(OCoLC)' + newocn]))
            brief_record.add_ordered_field(
                pymarc.Field(tag='245',
                             indicators=[' ', ' '],
                             subfields=['a', 'Title']))

            processed.write(brief_record.as_marc())
            mmsid_file.write(str(mmsid) + '\n')
            rec_count += 1
    return rec_count
def via_record(self):
    """Return ``self._record()`` with an added 856 field.

    The 856 (indicators 4, 0) carries the edition's download-via URL in $u.
    """
    marc = self._record()
    via_url = self.edition.download_via_url()
    marc.add_ordered_field(
        pymarc.Field(tag='856', indicators=['4', '0'], subfields=['u', via_url]))
    return marc
def test_tokenize_marc21_title_only(self):
    """Tokenize a record that only has a title (245) and a default leader."""
    marc_record = pymarc.Record()
    marc_record.add_field(
        pymarc.Field(tag='245',
                     indicators=['1', '0'],
                     subfields=['a', "Ahab's wife, or, The star-gazer :",
                                'b', "a novel /"]))
    # The raw list is compared, so this test also pins the order in which the
    # tokenizer returns its tokens.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual(
        self.classifier.__tokenize_marc21__(marc_record),
        ['novel', 'star', 'ahab', 'wife', 'work', 'gazer'])
def update_marc_file(infile, outfile, cdlpath):
    """add cdlpath info to all the MARC records in a file

    Every record read from *infile* gets a 941 field (indicators 0, 1) whose
    $a is *cdlpath*. The records are collected in memory and then written to
    *outfile*. A record that raises UnicodeDecodeError while being written is
    reported on stdout and left out of the output.

    The output now contains exactly the bytes produced by the writer.
    Earlier it was emitted via ``print`` through a redirected ``sys.stdout``,
    which appended a newline after the last record and left ``sys.stdout``
    pointing at the output file.
    """
    # keep the new file in memory
    marc_buffer = StringIO.StringIO()
    writer = pymarc.MARCWriter(marc_buffer)

    # MARC is binary data: read it in binary mode and close the input when
    # the loop is done.
    with open(infile, 'rb') as marc_in:
        reader = pymarc.MARCReader(
            marc_in,
            to_unicode=True,
            force_utf8=True,
            utf8_handling='ignore'
        )
        # main loop through all the records
        for count, record in enumerate(reader, 1):
            # create new MARC field and add it to the record
            record.add_field(
                pymarc.Field(
                    tag='941',
                    indicators=['0', '1'],
                    subfields=['a', cdlpath]
                )
            )
            try:
                writer.write(record)
            except UnicodeDecodeError as inst:
                # Report the record; it is not written.
                title = record['245'] if record['245'] is not None else ''
                record_id = record['001'] if record['001'] is not None else ''
                # Single-argument print(...) behaves the same under the
                # Python 2 print statement.
                print("--- error with record %s %s" % (count, record_id))
                print("leader9 = %s" % record.leader[9])
                print(title)
                print(inst)
                # Per the earlier in-code note, setting leader/09 to 'a' and
                # retrying the write did not work.

    # Write the collected records directly instead of redirecting sys.stdout.
    with open(outfile, mode="wb") as out:
        out.write(marc_buffer.getvalue())
    marc_buffer.close()
def store_record(db_object, pymarc_record, generated_fields):
    # type: (DBObject, PymarcRecord, Dict[AnyStr, Any]) -> PymarcRecord
    """Copy attribute values of *db_object* into new fields of *pymarc_record*.

    *generated_fields* maps a tag to a ``{subfield code: reference}`` dict.
    The part of each reference after its first '.' names the attribute to
    read from *db_object*; attributes the object lacks are skipped. One field
    per tag is added to the record, which is then returned.
    """
    for tag, code_dict in generated_fields.items():
        new_field = pymarc.Field(tag, indicators=["#", "#"])
        for code, ref in code_dict.items():
            attribute = ref.split(".")[1]
            if not hasattr(db_object, attribute):
                continue
            new_field.add_subfield(code, getattr(db_object, attribute))
        pymarc_record.add_field(new_field)
    return pymarc_record
def add_license(record, edition):
    """Add funding (536) and terms-of-use (540) fields for a licensed edition.

    Nothing is added when ``edition.license`` is falsy. The 540 takes its $a
    from ``cc.CHOICES`` and its $u from ``cc.GRANTS``, both looked up by the
    license value.
    """
    if not edition.license:
        return

    # 536: funding information
    funding_field = pymarc.Field(
        tag='536',
        indicators=[' ', ' '],
        subfields=['a', edition.funding_info],
    )
    record.add_ordered_field(funding_field)

    # 540: terms governing use
    license_label = dict(cc.CHOICES)[edition.license]
    license_grant = dict(cc.GRANTS)[edition.license]
    terms_field = pymarc.Field(
        tag='540',
        indicators=[' ', ' '],
        subfields=['a', license_label, 'u', license_grant],
    )
    record.add_ordered_field(terms_field)
def test_writing_1_record(self):
    """Write one record through TextWriter and compare the text output."""
    # Whitespace inside this literal is significant: one line per field, and
    # dedent strips only the common indentation.
    # NOTE(review): the spacing is pymarc's text layout as I understand it:
    # '=TAG', two blanks, then the content, where the =LDR content is the
    # default 24-character leader -- confirm.
    expected = r"""
        =LDR            22        4500
        =100  00$ame
        =245  00$aFoo /$cby me.
    """
    # Drop the leading newline and strip the common indentation.
    expected = textwrap.dedent(expected[1:])
    file_handle = StringIO()
    try:
        writer = pymarc.TextWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field('100', ['0', '0'], ['a', u('me')]))
        record.add_field(
            pymarc.Field(
                '245', ['0', '0'],
                ['a', u('Foo /'), 'c', u('by me.')]))
        writer.write(record)
        # Keep the handle open so its value can still be read below.
        writer.close(close_fh=False)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def marc_record(self):
    """
    Returns the item as a pymarc Record

    Only entries of ``self.fields_properties`` whose key converts to an
    integer no greater than 999 are exported. An entry with subfields becomes
    one field holding a code/value pair for every executed value of every
    subfield. An entry without subfields becomes one field per executed
    value, with that value passed as ``data``. Missing indicators default to
    "#".
    """
    record = pymarc.Record()
    for tag, fprop in self.fields_properties.iteritems():
        try:
            tag = int(tag)
        except (TypeError, ValueError):
            # Key is not numeric, so it is not a MARC tag.
            continue
        # only marc fields
        if tag > 999:
            continue

        first = fprop.first
        has_subfields = bool(first.subfields)
        indicators = [first.indicator1 or "#", first.indicator2 or "#"]

        if has_subfields:
            sfields = []
            for sf in first.subfields.values():
                for val in sf.exec_value:
                    sfields.append(sf.field_name)
                    sfields.append(val)
            record.add_field(pymarc.Field(tag, indicators, sfields))
        else:
            # Best effort: a property whose value cannot be computed
            # contributes no fields.
            try:
                exec_value = first.exec_value
            except Exception:
                exec_value = []
            for val in exec_value:
                record.add_field(
                    pymarc.Field(tag, indicators, data=str(val)))
    # The record was previously built but never handed back to the caller.
    return record
def language_041_from_008(record):
    """Add a field 041##$$a with the language code from 008/35-37.

    When a 041 already exists, the code is appended as another $$a unless the
    field's value already contains it.
    """
    lang = record["008"].data[35:38]
    field041 = record["041"]

    if field041:
        if lang not in field041.value():
            field041.add_subfield("a", lang)
        return

    new_field = pymarc.Field(tag="041",
                             indicators=[" ", " "],
                             subfields=["a", lang])
    record.add_ordered_field(new_field)