def generateMARCXML(MARC21input, MARCXMLoutput):
    """Write a MARCXML file for the given MARC21 binary file.

    Every record read from ``MARC21input`` is written into a single
    collection in ``MARCXMLoutput``.  The output is opened once in ``'wb'``
    mode, so an existing file at that path is overwritten.

    Args:
        MARC21input: path of the binary MARC21 file to read.
        MARCXMLoutput: path of the MARCXML file to create.
    """
    with open(MARC21input, 'rb') as marc_file:
        reader = pymarc.MARCReader(marc_file)
        # Create the writer once, outside the loop: re-opening the output in
        # 'wb' mode for each record would truncate it and keep only the last
        # record.  Note that an input with no records now produces an empty
        # collection file instead of no file at all.
        writer = pymarc.XMLWriter(open(MARCXMLoutput, 'wb'))
        try:
            for record in reader:
                writer.write(record)
        finally:
            # Closing the writer also closes the output file handle.
            writer.close()
def test_writing_1_record(self):
    """Check the exact bytes XMLWriter produces for one two-field record."""
    # NOTE(review): the leader is written with the full 24-character padding
    # (10 spaces, "22", 8 spaces, "4500") -- confirm it matches the default
    # leader of a new pymarc.Record.
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        <record>
        <leader>          22        4500</leader>
        <datafield ind1="0" ind2="0" tag="100">
        <subfield code="a">me</subfield>
        </datafield>
        <datafield ind1="0" ind2="0" tag="245">
        <subfield code="a">Foo /</subfield>
        <subfield code="c">by me.</subfield>
        </datafield>
        </record>
        </collection>
    """
    # Drop the leading newline, strip the common indent and join the lines so
    # that `expected` is one unbroken byte string.
    expected = textwrap.dedent(expected[1:]).replace("\n", "").encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field("100", ["0", "0"], ["a", "me"]))
        record.add_field(
            pymarc.Field("245", ["0", "0"], ["a", "Foo /", "c", "by me."]))
        writer.write(record)
        # Keep the buffer open so its contents can still be read back.
        writer.close(close_fh=False)
        # assertEqual, not the assertEquals alias removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def test_writing_1_record(self):
    """Check the exact bytes XMLWriter produces for one two-field record."""
    # NOTE(review): the leader is written with the full 24-character padding
    # (10 spaces, "22", 8 spaces, "4500") -- confirm it matches the default
    # leader of a new pymarc.Record.
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        <record>
        <leader>          22        4500</leader>
        <datafield ind1="0" ind2="0" tag="100">
        <subfield code="a">me</subfield>
        </datafield>
        <datafield ind1="0" ind2="0" tag="245">
        <subfield code="a">Foo /</subfield>
        <subfield code="c">by me.</subfield>
        </datafield>
        </record>
        </collection>
    """
    # Drop the leading newline, strip the common indent and join the lines so
    # that `expected` is one unbroken string.
    expected = textwrap.dedent(expected[1:]).replace('\n', '')
    # Only encode where str and the binary type differ; when they are the
    # same type the literal is already comparable with getvalue().
    if str != binary_type:
        expected = expected.encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        record = pymarc.Record()
        record.add_field(pymarc.Field('100', ['0', '0'], ['a', u('me')]))
        record.add_field(
            pymarc.Field(
                '245', ['0', '0'], ['a', u('Foo /'), 'c', u('by me.')]))
        writer.write(record)
        # Keep the buffer open so its contents can still be read back.
        writer.close(close_fh=False)
        # assertEqual, not the assertEquals alias removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def test_close_true(self):
    """Closing the writer with default arguments closes its file handle too."""
    stream = BytesIO()
    self.assertFalse(stream.closed, "The file handle should be open")

    xml_writer = pymarc.XMLWriter(stream)
    self.assertFalse(stream.closed, "The file handle should still be open")

    # No close_fh argument: the default is expected to close the stream.
    xml_writer.close()
    self.assertTrue(
        stream.closed,
        "The file handle should close when the writer closes",
    )
def write_collection(records, write_location):
    '''Write an iterable of records into a MARCXML collection file.

    Args:
        records: list, generator or other iterable of
            ``pymarc.record.Record`` objects (subclasses are accepted).
        write_location: path of the XML file to create; it is opened in
            ``'wb'`` mode, so an existing file is overwritten.

    Raises:
        TypeError: if an item of ``records`` is not a Record.  Records
            written before the offending item stay in the output file.
    '''
    writer = pymarc.XMLWriter(open(write_location, 'wb'))
    try:
        for record in records:
            # isinstance rather than an exact type comparison, so Record
            # subclasses are not rejected.
            if not isinstance(record, pymarc.record.Record):
                raise TypeError(
                    'attempted to pass non-record object into record writer')
            writer.write(record)
    finally:
        # Always close the writer (and with it the file handle), including
        # when the type check or a write raises.
        writer.close()
def test_close_false(self):
    """Closing the writer with close_fh=False leaves the file handle open."""
    stream = BytesIO()
    self.assertFalse(stream.closed, "The file handle should be open")

    xml_writer = pymarc.XMLWriter(stream)
    self.assertFalse(stream.closed, "The file handle should still be open")

    # Explicitly ask the writer not to close the underlying stream.
    xml_writer.close(close_fh=False)
    self.assertFalse(
        stream.closed,
        "The file handle should NOT close when the writer closes",
    )
def test_writing_0_records(self):
    """Check that a writer closed without any record emits an empty collection."""
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        </collection>
    """
    # Drop the leading newline, strip the common indent and join the lines so
    # that `expected` is one unbroken byte string.
    expected = textwrap.dedent(expected[1:]).replace("\n", "").encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        # Keep the buffer open so its contents can still be read back.
        writer.close(close_fh=False)
        # assertEqual, not the assertEquals alias removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def main():
    '''Parse CLI args, pick the output path and apply "pull_arabic".

    Command-line arguments:
        input_directory: path to the directory containing the record files.
        -f / --sub_directory_filter: optional subdirectory selector passed
            on to ``get_xml_paths``.
        -n / --name: optional source name; defaults to the last path
            component of ``input_directory``.

    Every record of every path returned by ``get_xml_paths`` is handed to
    ``pull_arabic`` together with one shared XML writer that targets
    ``data/arabic_records/<name>.xml``.
    '''
    # Module-level counters; presumably updated by pull_arabic -- verify.
    global counter008
    global counter880

    logger = logging.getLogger(__name__)
    logger.info(
        'collecting arabic records and extracting parallel Arabic/Romanized representations'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('input_directory',
                        help='path to directory containing records')
    parser.add_argument(
        '-f',
        '--sub_directory_filter',
        help=
        'select a particular subdirectory inside a complex directory structure'
    )
    parser.add_argument(
        '-n',
        '--name',
        help='optional source name, otherwise take directory name')
    args = parser.parse_args()

    if args.name:
        name = args.name
    else:
        # Strip trailing slashes first: "some/dir/" would otherwise give an
        # empty name and an output file called ".xml".
        name = args.input_directory.rstrip('/').split('/')[-1]
    logger.info(f'source: {name}')

    record_paths = get_xml_paths(args.input_directory,
                                 args.sub_directory_filter)

    writer = pymarc.XMLWriter(open(f'data/arabic_records/{name}.xml', 'wb'))
    try:
        for path in record_paths:
            pymarc.map_xml(lambda record: pull_arabic(record, writer=writer),
                           path)
    finally:
        # Close the writer (and its file handle) even if a record fails.
        writer.close()

    logger.info(
        f'# of Arabic records ("ara" in language field 008): {counter008}')
def write_to_file(reclist, filename="output", form="bin"):
    """Write records to a file in binary MARC, MARCXML or text form.

    Args:
        reclist: iterable of records.  The ``"bin"`` form only needs each
            record to provide ``as_marc()`` returning bytes; the other forms
            pass the records to pymarc writers.
        filename: output path without extension; ``.mrc``, ``.xml`` or
            ``.txt`` is appended depending on ``form``.
        form: ``"bin"``, ``"xml"`` or ``"text"``.  Any other value writes
            nothing and creates no file.
    """
    if form == "bin":
        filename = filename + ".mrc"
        with open(filename, "wb") as out:
            for record in reclist:
                out.write(record.as_marc())
    elif form == "xml":
        filename = filename + ".xml"
        writer = pymarc.XMLWriter(open(filename, "wb"))
        try:
            for record in reclist:
                writer.write(record)
        finally:
            # Close the writer (and its file handle) even if a write fails.
            writer.close()
    elif form == "text":
        filename = filename + ".txt"
        with open(filename, "wt", encoding="utf-8") as out:
            writer = pymarc.TextWriter(out)
            for record in reclist:
                writer.write(record)
def test_writing_empty_record(self):
    """Check the exact bytes XMLWriter produces for a record with no fields."""
    # NOTE(review): the leader is written with the full 24-character padding
    # (10 spaces, "22", 8 spaces, "4500") -- confirm it matches the default
    # leader of a new pymarc.Record.
    expected = r"""
        <?xml version="1.0" encoding="UTF-8"?>
        <collection xmlns="http://www.loc.gov/MARC21/slim">
        <record>
        <leader>          22        4500</leader>
        </record>
        </collection>
    """
    # Drop the leading newline, strip the common indent and join the lines so
    # that `expected` is one unbroken string.
    expected = textwrap.dedent(expected[1:]).replace('\n', '')
    # Only encode where str and the binary type differ; when they are the
    # same type the literal is already comparable with getvalue().
    if str != binary_type:
        expected = expected.encode()
    file_handle = BytesIO()
    try:
        writer = pymarc.XMLWriter(file_handle)
        record = pymarc.Record()
        writer.write(record)
        # Keep the buffer open so its contents can still be read back.
        writer.close(close_fh=False)
        # assertEqual, not the assertEquals alias removed in Python 3.12.
        self.assertEqual(file_handle.getvalue(), expected)
    finally:
        file_handle.close()
def write_marcxml(self, record, filename):
    """Serialize a single record to the file ``filename`` as MARCXML.

    The file is opened in ``'wb'`` mode, so an existing file is overwritten.
    """
    with open(filename, 'wb') as xml_file:
        xml_writer = pymarc.XMLWriter(xml_file)
        xml_writer.write(record)
        # Close the writer while the file is still open.
        xml_writer.close()
for root, dirs, files in os.walk(rootdir): for name in files: if waiting: print(name,restart_file) if name == restart_file: waiting = False if not waiting: if name[0] != '.': start_time = datetime.datetime.now().time() print(read_format) # Convert JSON files into MARCXML if read_format == 'json': print("Starting output writer") output_writer = pymarc.XMLWriter(open(results_folder_name + SLASH + name + '.xml','wb')) print("Opening file") readFile(rootdir + SLASH + name,output_writer) output_writer.close() print("Opening output file") # Convert MARCXML into BIBFRAME if read_format == 'json': bibf_output_file = bibf_results_folder_name + SLASH + 'BIBF_' + name + '.xml' xml_input_file = results_folder_name + SLASH + name + '.xml' else: bibf_output_file = bibf_results_folder_name + SLASH + 'BIBF_' + name xml_input_file = root + SLASH + name print(xml_input_file) bibf_output = open(bibf_output_file,'w')