def test_ia_charset():
    """Check that a MARC record with corrupted Unicode bytes is decoded
    to the corrected author name.

    Reads a known-bad record and verifies the first subfield of the
    first '100' field equals the expected repaired value.
    """
    # Context manager so the file handle is closed promptly
    # (the original open(...).read() leaked the handle until GC).
    with open(test_data + 'histoirereligieu05cr_meta.mrc') as f:
        data = f.read()
    line = list(get_tag_lines(data, set(['100'])))[0][1]
    a = list(get_all_subfields(line))[0][1]
    expect = u'Crétineau-Joly, J.'
    assert a == expect
def test_wrapped_lines():
    """Check that handle_wrapped_lines re-joins continuation lines.

    The fixture contains two '520' fields whose content was wrapped;
    after unwrapping there must be exactly two fields with the known
    unwrapped lengths.
    """
    # Context manager so the file handle is closed promptly
    # (the original open(...).read() leaked the handle until GC).
    with open(test_data + 'wrapped_lines') as f:
        data = f.read()
    ret = list(handle_wrapped_lines(get_tag_lines(data, ['520'])))
    assert len(ret) == 2
    a, b = ret
    assert a[0] == '520' and b[0] == '520'
    assert len(a[1]) == 2295
    assert len(b[1]) == 248
def read_edition(loc, data):
    """Parse a raw binary MARC record into an edition dict.

    loc  -- record location string, used only for warnings/debug output
    data -- raw MARC record

    Returns {} when the record lacks a single usable '008' control
    field; otherwise a dict of edition properties assembled from the
    control field and the various read_* field readers.
    """
    fields = {}
    for tag, line in handle_wrapped_lines(get_tag_lines(data, want)):
        fields.setdefault(tag, []).append(line)
    edition = {}
    # .get() so a record with no '008' field at all warns and returns {}
    # instead of raising KeyError (the stated intent of the check below).
    f008 = fields.get('008', [])
    if len(f008) != 1:
        warn("There should be a single '008' field, %s has %d." % (loc, len(f008)))
        return {}
    f = f008[0]
    if not f:
        warn("'008' field must not be blank in %s" % (loc))
        return {}
    publish_date = str(f)[7:11]
    if publish_date.isdigit() and publish_date != '0000':
        edition["publish_date"] = publish_date
    try:
        # '008' byte 6 == 't': positions 11-15 hold a copyright date.
        if str(f)[6] == 't':
            edition["copyright_date"] = str(f)[11:15]
    except:  # debugging aid: report which record blew up, then re-raise
        print(loc)
        raise
    publish_country = str(f)[15:18]
    # three spaces: the slice is 3 bytes wide, so "blank" is '   '
    if publish_country not in ('|||', '   '):
        edition["publish_country"] = publish_country
    lang = str(f)[35:38]
    if lang not in ('   ', '|||'):
        edition["languages"] = [{'key': '/l/' + lang}]
    edition.update(read_lccn(fields))
    try:
        edition.update(read_isbn(fields))
    except:  # debugging aid: report which record blew up, then re-raise
        print(loc)
        raise
    edition.update(read_oclc(fields))
    edition.update(read_lc_classification(fields))
    edition.update(read_dewey(fields))
    edition.update(read_authors(fields))
    edition.update(read_title(fields))
    edition.update(read_genres(fields))
    edition.update(read_subjects(fields))
    edition.update(read_pagination(fields))
    edition.update(read_series(fields))
    edition.update(read_work_titles(fields))
    edition.update(read_other_titles(fields))
    edition.update(read_edition_name(fields))
    edition.update(read_publisher(fields))
    edition.update(read_contributions(fields))
    edition.update(read_location(fields))
    edition.update(read_url(fields))
    edition.update(read_toc(fields))
    edition.update(read_notes(fields))
    edition.update(read_description(fields))
    return edition
def read_fields(self, want):
    """Yield (tag, field) pairs for the requested tags.

    Control fields (tags starting '00') are yielded as raw strings with
    the trailing field terminator stripped; all other fields are
    wrapped in BinaryDataField.
    """
    wanted = set(want)
    for tag, line in handle_wrapped_lines(get_tag_lines(self.data, wanted)):
        if tag not in wanted:
            continue
        if not tag.startswith('00'):
            yield tag, BinaryDataField(line)
            continue
        # skip an empty '008' control field
        # (seen in marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588)
        if tag == '008' and line == '':
            continue
        # control fields must end with the MARC field terminator
        assert line[-1] == '\x1e'
        yield tag, line[:-1]
def read_fields(self, want):
    """Yield (tag, field) pairs for the requested tags.

    Control fields (tags starting '00') are yielded as raw strings with
    the trailing field terminator stripped; other fields are wrapped in
    BinaryDataField, which receives this record object.
    """
    want = set(want)
    # NOTE(review): the original computed `marc8 = self.leader()[9] != 'a'`
    # here but never used it; the dead assignment was dropped.
    # BinaryDataField receives `self` and can inspect the leader itself.
    for tag, line in fast_parse.handle_wrapped_lines(fast_parse.get_tag_lines(self.data, want)):
        if tag not in want:
            continue
        if tag.startswith('00'):
            # skip an empty '008' control field
            # (seen in marc_upei/marc-for-openlibrary-bigset.mrc:78997353:588)
            if tag == '008' and line == '':
                continue
            # control fields must end with the MARC field terminator
            assert line[-1] == '\x1e'
            yield tag, line[:-1]
        else:
            yield tag, BinaryDataField(self, line)
def get_marc_subjects(w): for src in get_marc_source(w): data = None try: data = get_data(src) except ValueError: print 'bad record source:', src print 'http://openlibrary.org' + w['key'] continue except urllib2.HTTPError, error: print 'HTTP error:', error.code, error.msg print 'http://openlibrary.org' + w['key'] if not data: continue try: lines = list(get_tag_lines(data, subject_fields)) except BadDictionary: print 'bad dictionary:', src print 'http://openlibrary.org' + w['key'] continue if lines: yield lines
def get_marc_subjects(w): for src in get_marc_source(w): data = None from openlibrary.catalog.get_ia import get_data try: data = get_data(src) except ValueError: print 'bad record source:', src print 'http://openlibrary.org' + w['key'] continue except urllib2.HTTPError, error: print 'HTTP error:', error.code, error.msg print 'http://openlibrary.org' + w['key'] if not data: continue try: lines = list(get_tag_lines(data, subject_fields)) except BadDictionary: print 'bad dictionary:', src print 'http://openlibrary.org' + w['key'] continue if lines: yield lines
def get_marc_subjects(w):
    """Yield the subject-field tag lines for each readable MARC source of *w*.

    Each yielded value is a non-empty list of tag lines for the tags in
    subject_fields.  Sources that cannot be fetched or parsed are
    reported to stdout and skipped.
    """
    # Hoisted out of the loop: the import previously re-ran on every
    # source record.  Kept function-local (not module-level) in case it
    # was placed here to dodge a circular import -- TODO confirm.
    from openlibrary.catalog.get_ia import get_data
    for src in get_marc_source(w):
        data = None
        try:
            data = get_data(src)
        except ValueError:
            print('bad record source:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        except urllib2.HTTPError as error:
            # no continue: data stays None and the check below skips it
            print('HTTP error:', error.code, error.msg)
            print('http://openlibrary.org' + w['key'])
        if not data:
            continue
        try:
            lines = list(get_tag_lines(data, subject_fields))
        except BadDictionary:
            print('bad dictionary:', src)
            print('http://openlibrary.org' + w['key'])
            continue
        if lines:
            yield lines
def __init__(self, data):
    """Index the record's tag lines by tag.

    Builds self.fields: a dict mapping each tag found in the
    module-level `want` set to the list of its lines, in record order.
    """
    grouped = {}
    for tag, line in get_tag_lines(data, want):
        if tag in grouped:
            grouped[tag].append(line)
        else:
            grouped[tag] = [line]
    self.fields = grouped
def read_works():
    """Attach subject-people links to works based on their MARC '600' fields.

    For every (work, marc) pair, collects the '600' (personal-name
    subject) field lines, builds/reuses person pages under
    /subjects/people/, and saves the work with its subject_people list.
    Side effects: creates person records and updates works via `ol`.
    """
    i = 0
    pages = {}       # person page key -> person object already created this run
    page_marc = {}   # marc person tuple -> person page key
    for work, marc in work_and_marc():
        lines = []
        for loc in marc:
            data = get_data(loc)
            if not data:
                continue
            found = [v for k, v in get_tag_lines(data, set(['600']))]
            if found:
                lines.append((loc, found))
        if not lines:
            continue
        work['lines'] = lines
        i += 1
        print(i, work['key'], work['title'])
        try:
            people, marc_alt = read_people(j[1] for j in lines)
        except AssertionError:
            print(work['lines'])
            continue
        except KeyError:
            print(work['lines'])
            continue
        marc_alt_reverse = defaultdict(set)
        for k, v in marc_alt.items():
            marc_alt_reverse[v].add(k)
        w = ol.get(work['key'])
        w['subject_people'] = []
        # .items(): the original used .iteritems(), which does not exist
        # on Python 3 dicts (the rest of this function is already py3).
        for p, num in people.items():
            print(' %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p)))
            print(' ', p)
            if p in page_marc:
                # person page already exists from an earlier work
                w['subject_people'].append({'key': '/subjects/people/' + page_marc[p]})
                continue
            obj = build_person_object(p, marc_alt_reverse.get(p, []))
            key = obj['name'].replace(' ', '_')
            full_key = '/subjects/people/' + key
            w['subject_people'].append({'key': full_key})
            if key in pages:
                # same page key seen this run: merge the marc variant in
                print(key)
                pages[key]['marc'].append(p)
                continue
            for m in obj['marc']:
                page_marc[m] = key
            pages[key] = obj
            # the stored object must not carry the raw 'marc' variants
            obj_for_db = obj.copy()
            del obj_for_db['marc']
            obj_for_db['key'] = full_key
            obj_for_db['type'] = '/type/person'
            print(ol.save(full_key.encode('utf-8'), obj_for_db, 'create a new person page'))
        print(w)
        print(ol.save(w['key'], w, 'add links to people that this work is about'))
f = open(filename) for pos, loc, data in read_marc_file(full_part, f): rec_no +=1 yield rec_no, pos, loc, data # source_record,oclc,accompanying_material,translated_from,title re_oclc = re.compile ('^\(OCoLC\).*?0*(\d+)') out = open('/3/edward/updates', 'w') want = set(['001', '003', '035', '041', '245', '300']) for rec_no, pos, loc, data in iter_marc(): fields = {} rec = {} title_seen = False for tag, line in handle_wrapped_lines(get_tag_lines(data, want)): if tag == '245': if title_seen: continue title_seen = True if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join(x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue if tag == '300':
def load_part(archive_id, part, start_pos=0):
    # Load one MARC file part, trying to merge each record into an
    # existing edition; yields (loc, data) for records that could not
    # be matched and therefore still need importing.
    #
    # archive_id/part name the file under rc['marc_path'];
    # start_pos is a byte offset to resume from.
    print 'load_part:', archive_id, part
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            # periodic progress report every `chunk` records
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            # debugging aid: show the record and its title field, then re-raise
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        except fast_parse.NotBook:
            # non-book records are skipped entirely
            continue
        if not index_fields or 'title' not in index_fields:
            continue
        print loc
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # nothing to match against: hand the record back for import
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # follow redirects until we reach a real edition object
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    # merged into an existing edition: record the source and stop
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
def load_part(archive_id, part, start_pos=0):
    """Load one MARC file part, merging each record into existing editions.

    Yields (loc, data) for every record that could not be matched to an
    existing edition (i.e. the records that still need importing).
    archive_id/part name the file under rc['marc_path']; start_pos is a
    byte offset to resume from.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # context manager so the file is closed when the generator finishes
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                # periodic progress report every `chunk` records
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # debugging aid: show the record and its title field, then re-raise
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                # non-book records are skipped entirely
                continue
            if not index_fields or 'title' not in index_fields:
                continue
            print(loc)
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                # nothing to match against: hand the record back for import
                yield loc, data
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            # .items(): the original used .iteritems(), which does not exist
            # on Python 3 dicts (the rest of this function is already py3).
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    # follow redirects until we reach a real edition object
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        # merged into an existing edition: record the source, stop
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break
            if not match:
                yield loc, data
for pos, loc, data in read_marc_file(full_part, f): rec_no += 1 yield rec_no, pos, loc, data # source_record,oclc,accompanying_material,translated_from,title re_oclc = re.compile('^\(OCoLC\).*?0*(\d+)') out = open('/3/edward/updates', 'w') want = set(['001', '003', '035', '041', '245', '300']) for rec_no, pos, loc, data in iter_marc(): fields = {} rec = {} title_seen = False for tag, line in handle_wrapped_lines(get_tag_lines(data, want)): if tag == '245': if title_seen: continue title_seen = True if line[1] == '0': # no prefix continue contents = get_contents(line, ['a', 'b']) if 'a' in contents: rec['title'] = ' '.join( x.strip(' /,;:') for x in contents['a']) elif 'b' in contents: rec['title'] = contents['b'][0].strip(' /,;:') if 'title' in rec and has_dot(rec['title']): rec['title'] = rec['title'][:-1] continue
def read_works():
    # Attach subject-people links to works based on their MARC '600'
    # (personal-name subject) fields: builds/reuses person pages under
    # /subjects/people/ and saves each work with its subject_people list.
    # Side effects: creates person records and updates works via `ol`.
    i = 0
    pages = {}       # person page key -> person object already created this run
    page_marc = {}   # marc person tuple -> person page key
    for work, marc in work_and_marc():
        lines = []
        for loc in marc:
            data = get_data(loc)
            if not data:
                continue
            found = [v for k, v in get_tag_lines(data, set(['600']))]
            if found:
                lines.append((loc, found))
        if not lines:
            # no '600' fields anywhere: nothing to link for this work
            continue
        work['lines'] = lines
        i += 1
        print i, work['key'], work['title']
        try:
            people, marc_alt = read_people(j[1] for j in lines)
        except AssertionError:
            # unparsable people data: report and move on to the next work
            print work['lines']
            continue
        except KeyError:
            print work['lines']
            continue
        marc_alt_reverse = defaultdict(set)
        for k, v in marc_alt.items():
            marc_alt_reverse[v].add(k)
        w = ol.get(work['key'])
        w['subject_people'] = []
        for p, num in people.iteritems():
            print ' %2d %s' % (num, ' '.join("%s: %s" % (k, v) for k, v in p))
            print ' ', p
            if p in page_marc:
                # person page already exists from an earlier work
                w['subject_people'].append({'key': '/subjects/people/' + page_marc[p]})
                continue
            obj = build_person_object(p, marc_alt_reverse.get(p, []))
            key = obj['name'].replace(' ', '_')
            full_key = '/subjects/people/' + key
            w['subject_people'].append({'key': full_key})
            if key in pages:
                # same page key seen this run: merge the marc variant in
                print key
                pages[key]['marc'].append(p)
                continue
            for m in obj['marc']:
                page_marc[m] = key
            pages[key] = obj
            # the stored object must not carry the raw 'marc' variants
            obj_for_db = obj.copy()
            del obj_for_db['marc']
            obj_for_db['key'] = full_key
            obj_for_db['type'] = '/type/person'
            print ol.save(full_key.encode('utf-8'), obj_for_db, 'create a new person page')
        print w
        print ol.save(w['key'], w, 'add links to people that this work is about')