def test_wrapped_lines():
    """Check the '520' fields extracted from the 'wrapped_lines' sample.

    The sample record is expected to produce exactly two '520' entries
    once the tag lines have been passed through handle_wrapped_lines,
    with payload lengths of 2295 and 248.
    """
    # Read inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(test_data + 'wrapped_lines') as sample:
        data = sample.read()

    ret = list(handle_wrapped_lines(get_tag_lines(data, ['520'])))
    assert len(ret) == 2

    a, b = ret
    assert a[0] == '520' and b[0] == '520'
    assert len(a[1]) == 2295
    assert len(b[1]) == 248
def all_fields(self):
    """Yield a (tag, value) pair for each field found in self.data.

    Tags beginning with '00' yield the raw line with its trailing
    '\\x1e' byte removed; every other tag yields a
    BinaryDataField built from this record and the line.
    """
    # True when leader byte 9 is anything other than 'a'.  The value is
    # not consulted below (presumably an encoding flag -- TODO confirm).
    marc8 = self.leader()[9] != 'a'

    tag_lines = get_all_tag_lines(self.data)
    for tag, line in handle_wrapped_lines(tag_lines):
        if not tag.startswith('00'):
            yield tag, BinaryDataField(self, line)
            continue

        # An empty 008 is skipped rather than rejected; seen at
        # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
        if tag == '008' and line == '':
            continue

        assert line[-1] == '\x1e'
        yield tag, line[:-1]
def all_fields(self):
    """Yield a (tag, value) pair for each field found in self.data.

    The tag lines come from the fast_parse helpers.  Tags beginning
    with '00' yield the raw line with its trailing '\\x1e' byte
    removed; every other tag yields a BinaryDataField built from this
    record and the line.
    """
    # True when leader byte 9 is anything other than 'a'.  The value is
    # not consulted below (presumably an encoding flag -- TODO confirm).
    marc8 = self.leader()[9] != 'a'

    tag_lines = fast_parse.get_all_tag_lines(self.data)
    for tag, line in fast_parse.handle_wrapped_lines(tag_lines):
        if not tag.startswith('00'):
            yield tag, BinaryDataField(self, line)
            continue

        # An empty 008 is skipped rather than rejected; seen at
        # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
        if tag == '008' and line == '':
            continue

        assert line[-1] == '\x1e'
        yield tag, line[:-1]
def read_edition(loc, data):
    """Build an edition dict from one MARC record.

    loc  -- identifier of the record; used only in warnings and in the
            diagnostic print emitted before an exception is re-raised.
    data -- the raw record handed to get_tag_lines.

    Returns the populated edition dict, or an empty dict (after calling
    warn) when the record does not carry exactly one non-blank '008'
    field.  Exceptions raised by the individual read_* helpers
    propagate to the caller.
    """
    fields = {}
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        fields.setdefault(tag, []).append(line)

    # A record with no '008' at all must take the same warn-and-skip
    # path as one with several, instead of failing with a KeyError.
    fixed_fields = fields.get('008', [])
    if len(fixed_fields) != 1:
        warn("There should be a single '008' field, %s has %d."
             % (loc, len(fixed_fields)))
        return {}

    f = fixed_fields[0]
    if not f:
        warn("'008' field must not be blank in %s" % (loc))
        return {}

    fixed = str(f)
    edition = {}

    publish_date = fixed[7:11]
    if publish_date.isdigit() and publish_date != '0000':
        edition["publish_date"] = publish_date

    try:
        if fixed[6] == 't':
            edition["copyright_date"] = fixed[11:15]
    except:
        # Identify the offending record, then let the error propagate.
        print(loc)
        raise

    # The slices below are three characters wide, so a blank code is
    # three spaces; strip() treats any all-blank (or missing) code as
    # "not provided" instead of storing it as a value.
    publish_country = fixed[15:18]
    if publish_country.strip() and publish_country != '|||':
        edition["publish_country"] = publish_country

    lang = fixed[35:38]
    if lang.strip() and lang != '|||':
        edition["languages"] = [{'key': '/l/' + lang}]

    edition.update(read_lccn(fields))
    try:
        edition.update(read_isbn(fields))
    except:
        # Identify the offending record, then let the error propagate.
        print(loc)
        raise

    # The remaining readers are applied in the original order; each one
    # returns a dict that is merged into the edition.
    remaining_readers = (
        read_oclc,
        read_lc_classification,
        read_dewey,
        read_authors,
        read_title,
        read_genres,
        read_subjects,
        read_pagination,
        read_series,
        read_work_titles,
        read_other_titles,
        read_edition_name,
        read_publisher,
        read_contributions,
        read_location,
        read_url,
        read_toc,
        read_notes,
        read_description,
    )
    for reader in remaining_readers:
        edition.update(reader(fields))

    return edition
def read_fields(self, want):
    """Yield (tag, value) pairs for the fields whose tag is in want.

    want -- iterable of tag strings; it is converted to a set before
            being passed to get_tag_lines and used for filtering.

    Tags beginning with '00' yield the raw line with its trailing
    '\\x1e' byte removed; every other tag yields
    BinaryDataField(line).
    """
    wanted = set(want)
    tag_lines = get_tag_lines(self.data, wanted)

    for tag, line in handle_wrapped_lines(tag_lines):
        if tag not in wanted:
            continue

        if not tag.startswith('00'):
            yield tag, BinaryDataField(line)
            continue

        # An empty 008 is skipped rather than rejected; seen at
        # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
        if tag == '008' and line == '':
            continue

        assert line[-1] == '\x1e'
        yield tag, line[:-1]
def read_fields(self, want):
    """Yield (tag, value) pairs for the fields whose tag is in want.

    want -- iterable of tag strings; it is converted to a set before
            being passed to get_tag_lines and used for filtering.

    Tags beginning with '00' yield the raw line with its trailing
    '\\x1e' byte removed; every other tag yields a BinaryDataField
    built from this record and the line.
    """
    wanted = set(want)

    # True when leader byte 9 is anything other than 'a'.  The value is
    # not consulted below (presumably an encoding flag -- TODO confirm).
    marc8 = self.leader()[9] != 'a'

    tag_lines = get_tag_lines(self.data, wanted)
    for tag, line in handle_wrapped_lines(tag_lines):
        if tag not in wanted:
            continue

        if not tag.startswith('00'):
            yield tag, BinaryDataField(self, line)
            continue

        # An empty 008 is skipped rather than rejected; seen at
        # marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588
        if tag == '008' and line == '':
            continue

        assert line[-1] == '\x1e'
        yield tag, line[:-1]
f = open(filename) for pos, loc, data in read_marc_file(full_part, f): rec_no +=1 yield rec_no, pos, loc, data # source_record,oclc,accompanying_material,translated_from,title re_oclc = re.compile ('^\(OCoLC\).*?0*(\d+)') out = open('/3/edward/updates', 'w') want = set(['001', '003', '035', '041', '245', '300']) for rec_no, pos, loc, data in iter_marc(): fields = {} rec = {} title_seen = False for tag, line in handle_wrapped_lines(get_tag_lines(data, want)): if tag == '245': if title_seen: continue title_seen = True if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue if tag == '300':
for pos, loc, data in read_marc_file(full_part, f): rec_no += 1 yield rec_no, pos, loc, data # source_record,oclc,accompanying_material,translated_from,title re_oclc = re.compile('^\(OCoLC\).*?0*(\d+)') out = open('/3/edward/updates', 'w') want = set(['001', '003', '035', '041', '245', '300']) for rec_no, pos, loc, data in iter_marc(): fields = {} rec = {} title_seen = False for tag, line in handle_wrapped_lines(get_tag_lines(data, want)): if tag == '245': if title_seen: continue title_seen = True if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join( x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue