def test_lookup3():
    """Exercise name_lookup/look_for_match/pick_from_match on 'John of Paris'.

    Ad-hoc test (Python 2): prints intermediate results for manual inspection.
    """
    line = '00\x1faJohn,\x1fcof Paris,\x1fd1240?-1306.\x1e'
    print fmt_line(get_subfields(line, 'abcd'))
    # Strip the punctuation MARC conventionally appends to subfield values.
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print fields
    found = name_lookup(fields)
    # print [i for i in found if 'Paris' in i[0]]
    # found = [(u'John of Paris', [u'Christian philosophers', u'Dominicans', u'Roman Catholic theologians', u'13th-century Latin writers', u'1255 births', u'1306 deaths'], u'john of paris', None)]
    # Only the $d subfield carries the date string.
    dates = pick_first_date(v for k, v in fields if k == 'd')
    match = look_for_match(found, dates, False)
    match = pick_from_match(match)
    pprint(match)
def test_lookup(): line = '00\x1faEgeria,\x1fd4th/5th cent.\x1e' # count=3 wiki = 'Egeria (pilgrim)' print fmt_line(get_subfields(line, 'abcd')) fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')) print fields found = name_lookup(fields) print found dates = pick_first_date(v for k, v in fields if k == 'd') assert dates.items()[0] != ('date', '') print dates print print look_for_match(found, dates, True)
def test_lookup2():
    """Exercise lookup and match picking on 'Richard of St. Victor' (d. 1173).

    Ad-hoc test (Python 2): prints intermediate results for manual inspection.
    """
    line = '00\x1faRichard,\x1fcof St. Victor,\x1fdd. 1173.\x1e'
    print fmt_line(get_subfields(line, 'abcd'))
    # Strip the punctuation MARC conventionally appends to subfield values.
    fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
    print fields
    found = name_lookup(fields)
    dates = pick_first_date(v for k, v in fields if k == 'd')
    # A first item of ('date', '') would mean no usable date was parsed.
    assert dates.items()[0] != ('date', '')
    print dates
    print
    match = look_for_match(found, dates, False)
    pprint(match)
    print
    match = pick_from_match(match)
    pprint(match)
def read_isbn(fields):
    """Collect ISBNs from the 020 fields of a MARC record.

    Returns a dict with 'isbn_13' and/or 'isbn_10' lists of unique ISBNs,
    or {} when the record has no 020 field (or no recognisable ISBNs).
    """
    if '020' not in fields:
        return {}
    raw = []
    for line in fields['020']:
        if '\x1f' not in line:
            # No subfield delimiters: try the bare field body (sans
            # indicators and field terminator).
            m = re_isbn.match(line[3:-1])
            if m:
                raw.append(m.group(1))
            continue
        for k, v in get_subfields(line, ['a', 'z']):
            # Prefer the ISBN+price pattern; fall back to a plain ISBN.
            m = re_isbn_and_price.match(v) or re_isbn.match(v)
            if m:
                raw.append(m.group(1))
    result = {}
    seen = set()
    for isbn in tidy_isbn(raw):
        if isbn in seen:  # avoid dups
            continue
        seen.add(isbn)
        if len(isbn) == 13:
            result.setdefault('isbn_13', []).append(isbn)
        elif len(isbn) <= 16:
            result.setdefault('isbn_10', []).append(isbn)
    return result
def read_marc():
    """Print the a/b/c/d subfields of every author line in marc_authors.bz2."""
    for line in bz2.BZ2File('marc_authors.bz2'):
        # Each dump line appears to be a repr() of the raw field data.
        # NOTE(review): eval on file contents is only safe for a trusted,
        # locally produced dump; ast.literal_eval would be safer — confirm.
        line = eval(line)
        # Skip sound recordings entirely.
        if '[Sound recording]' in line:
            continue
        line = strip_brackets(line)
        #print expr_in_utf8(get_all_subfields(line))
        print(fmt_line(get_subfields(line, 'abcd')))
def read_line(line, name):
    """Match a MARC author line against *name*.

    Returns (marc_name, flipped_name, (birth_date, death_date, date)) when
    the heading built from the line — either as-is or flipped — equals
    *name*; otherwise returns None.
    """
    if not line:
        return
    if '\x1fd' not in line:
        # Without a $d subfield there are no dates to extract.
        return
    parts = tuple((code, value.strip(' /,;:'))
                  for code, value in get_subfields(line, 'abcd'))
    marc_name = ' '.join(value for code, value in parts if code in 'abc')
    flipped = flip_name(marc_name)
    if name not in (marc_name, flipped):
        return
    parsed = pick_first_date(value for code, value in parts if code in 'abcd')
    date_triple = tuple(parsed.get(key, None)
                        for key in ['birth_date', 'death_date', 'date'])
    return (marc_name, flipped, date_triple)
def read_oclc(fields):
    """Extract OCLC control numbers from the 001/003 and 035 fields.

    Returns {'oclc_number': [...]} or {} when none are found.
    """
    found = []
    has_control = '003' in fields and '001' in fields
    # 001 holds the OCLC number only when 003 says the source is OCoLC.
    if has_control and fields['003'][0] == 'OCoLC':
        oclc = fields['001'][0]
        assert oclc.isdigit()
        found.append(oclc)
    for line in fields.get('035', []):
        for k, v in get_subfields(line, ['a']):
            m = re_oclc.match(v)
            if not m:
                continue
            num = m.group(1)
            if num not in found:
                found.append(num)
    if not found:
        return {}
    return {'oclc_number': remove_duplicates(found)}
def read_series(fields):
    """Collect series statements from the MARC 440/490/830 fields.

    Joins each field's $a/$v subfields with ' -- '; returns
    {'series': [...]} or {} when no series data is present.
    """
    series = []
    for tag in ('440', '490', '830'):
        for line in fields.get(tag, []):
            pieces = []
            for code, value in get_subfields(line, ['a', 'v']):
                if code == 'v' and value:
                    # Volume designations ($v) are kept verbatim.
                    pieces.append(value)
                else:
                    trimmed = value.rstrip('.,; ')
                    if trimmed:
                        pieces.append(trimmed)
            if pieces:
                series.append(' -- '.join(pieces))
    if not series:
        return {}
    return {'series': series}
def read_lccn(fields):
    """Extract LCCNs from the 010 field.

    Returns {} when the record has no 010 field; otherwise
    {'lccn': [...]} — note the list may be empty when every candidate is
    rejected (differs from the other read_* helpers, preserved as-is).
    """
    if '010' not in fields:
        return {}
    found = []
    for line in fields['010']:
        for code, value in get_subfields(line, ['a']):
            candidate = value.strip()
            # Skip placeholder values flagged with a question mark.
            if re_question.match(candidate):
                continue
            m = re_lccn.search(candidate)
            if m is None:
                continue
            cleaned = re_letters.sub('', m.group(1)).strip()
            if cleaned:
                found.append(cleaned)
    return {'lccn': found}
def db_marc_lookup():
    """Scan marc_authors.bz2, match each author heading against the name
    database, and print progress, mismatches and ambiguous matches.

    Side effects only (console output); returns None.

    Fix: `dates.items()[0]` raised TypeError on Python 3 (dict views are
    not subscriptable) although the rest of the function already uses
    Python 3 `print(...)`; replaced with `next(iter(...))`.
    """
    c = get_cursor()
    articles = set()
    count = 0
    t0 = time()
    match_count = 0
    total = 3596802  # expected number of records in the dump
    for line in bz2.BZ2File('marc_authors.bz2'):
        count += 1
        if count % 1000 == 0:
            # Periodic progress report with a rough ETA.
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            print(count, match_count, "%.2f%% %.2f mins left" %
                  ((match_count * 100) / count, time_left / 60))
        # Each dump line is a repr() of the raw field data.
        # NOTE(review): eval is only safe because the dump is produced
        # locally; never use this on untrusted input.
        line = eval(line)
        line = strip_brackets(line)
        fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')]
        dates = pick_first_date(v for k, v in fields if k == 'd')
        # First (key, value) pair of ('date', '') means no usable date.
        if next(iter(dates.items())) == ('date', ''):
            continue
        found = name_lookup(c, fields)
        if not found:
            continue
        match = {}
        seen = set()
        # print fmt_line(get_subfields(line, 'abcd'))
        # print dates
        for name, cats, match_name, pd in found:
            if name in seen:
                continue
            seen.add(name)
            cats = eval(cats)  # category list stored as its repr()
            # Only candidates carrying birth/death categories can be
            # date-checked at all.
            if not any(
                    cat.endswith(' births') or cat.endswith(' deaths')
                    for cat in cats):
                continue
            dm = date_match(dates, cats)
            if dm:
                match[name] = (cats, match_name)
                continue
            # Date mismatch: dump the candidate for manual inspection.
            print((name, match_name))
            print("cats =", cats)
            print(('match' if dm else 'no match'))
            for field in ['birth', 'death']:
                print(field + 's:', [
                    i[:-(len(field) + 2)] for i in cats
                    if i.endswith(' %ss' % field)
                ], end=' ')
            print()
        # print '---'
        if not match:
            continue
        match_count += 1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            # Ambiguous: list every candidate with its Wikipedia URL.
            print(count, match_count)
            print(fmt_line(get_subfields(line, 'abcd')))
            for name, (cats, match_name) in match.items():
                print(name, cats, match_name)
                print("http://en.wikipedia.org/wiki/" + name.replace(' ', '_'))
            print()
            continue
        # print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd'))
        assert len(match) == 1
    print(match_count)
def marc_title(data):
    """Render the first 245 (title) field as HTML, or None when absent.

    Each wanted subfield becomes "<b>$<code></b><value>", HTML-escaped.
    """
    line = get_first_tag(data, set(['245']))
    if not line:
        return None
    rendered = ("<b>$%s</b>%s" % (esc(k), esc(v))
                for k, v in get_subfields(line, set(['a', 'b'])))
    return ''.join(rendered)
def db_marc_lookup():
    """Scan marc_authors.bz2, look up each author heading in the name
    database and count date-confirmed matches (Python 2 version).

    Side effects only (console output); returns None.
    """
    c = get_cursor()
    articles = set()
    count = 0
    t0 = time()
    match_count = 0
    total = 3596802  # expected number of records in the dump
    for line in bz2.BZ2File('marc_authors.bz2'):
        count+=1
        if count % 1000 == 0:
            # Periodic progress report with a rough ETA.
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            print count, match_count, "%.2f%% %.2f mins left" % ((match_count * 100) / count, time_left / 60)
        # NOTE(review): eval of dump lines — assumes a trusted, locally
        # produced repr() per line; never use on untrusted input.
        line = eval(line)
        line = strip_brackets(line)
        fields = [(k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd')]
        dates = pick_first_date(v for k, v in fields if k == 'd')
        # First (key, value) pair of ('date', '') means no usable date.
        if dates.items()[0] == ('date', ''):
            continue
        found = name_lookup(c, fields)
        if not found:
            continue
        match = {}
        seen = set()
        # print fmt_line(get_subfields(line, 'abcd'))
        # print dates
        for name, cats, match_name, pd in found:
            if name in seen:
                continue
            seen.add(name)
            cats = eval(cats)  # category list stored as its repr()
            # Only candidates with birth/death categories can be date-checked.
            if not any(cat.endswith(' births') or cat.endswith(' deaths') for cat in cats):
                continue
            dm = date_match(dates, cats)
            if dm:
                match[name] = (cats, match_name)
                continue
            # Date mismatch: dump the candidate for manual inspection.
            print (name, match_name)
            print "cats =", cats
            print ('match' if dm else 'no match')
            for field in ['birth', 'death']:
                print field + 's:', [i[:-(len(field)+2)] for i in cats if i.endswith(' %ss' % field)],
            print
        # print '---'
        if not match:
            continue
        match_count+=1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            # Ambiguous: list every candidate with its Wikipedia URL.
            print count, match_count
            print fmt_line(get_subfields(line, 'abcd'))
            for name, (cats, match_name) in match.items():
                print name, cats, match_name
                print "http://en.wikipedia.org/wiki/" + name.replace(' ', '_')
            print
            continue
        # print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd'))
        assert len(match) == 1
    print match_count
def marc_title(data):
    """Return the 245 title field as bold-tagged subfield HTML, or None.

    Produces "<b>$a</b><value>..." for each $a/$b subfield, escaped.
    """
    line = get_first_tag(data, set(["245"]))
    if line:
        parts = []
        for k, v in get_subfields(line, set(["a", "b"])):
            parts.append("<b>$%s</b>%s" % (esc(k), esc(v)))
        return "".join(parts)
    return None
def get_subfield_values(line, want):
    """Return just the values of the wanted subfields, in order of appearance."""
    values = []
    for _, value in get_subfields(line, want):
        values.append(value)
    return values
def marc_title(data):
    """Format the first 245 field as "<b>$code</b>value" HTML (None if missing)."""
    line = get_first_tag(data, set(['245']))
    return None if not line else ''.join(
        "<b>$%s</b>%s" % (esc(code), esc(text))
        for code, text in get_subfields(line, set(['a', 'b'])))
def db_marc_lookup():
    """Scan marc_authors.bz2, match each distinct author heading against the
    name database, and write unambiguous matches to the 'matches3' file.

    Side effects: console output plus one line per match in 'matches3'.

    Fixes:
    - `dates.items()[0]` and `match.keys()[0]` raised TypeError on
      Python 3 (dict views are not subscriptable) although the function
      already uses Python 3 `print(...)`; replaced with `next(iter(...))`.
    - Guarded the percentage print against division by zero when no dated
      record has been seen yet at a progress interval.
    """
    verbose = False
    c = get_cursor()
    articles = set()
    count = 0
    count_with_date = 0
    t0 = time()
    match_count = 0
    total = 3596802  # expected number of records in the dump
    fh = open('matches3', 'w')
    prev_fields = None
    for line in bz2.BZ2File('marc_authors.bz2'):
        count += 1
        # Each dump line is a repr() of the raw field data.
        # NOTE(review): eval is only safe for a trusted local dump.
        line = eval(line)
        line = strip_brackets(line)
        if count % 5000 == 0:
            # Periodic progress report with a rough ETA.
            t1 = time() - t0
            rec_per_sec = count / t1
            time_left = (total - count) / rec_per_sec
            print(fmt_line(get_subfields(line, 'abcd')))
            # max(..., 1) avoids ZeroDivisionError before any dated record.
            print(count, count_with_date, match_count,
                  "%.2f%% %.2f mins left" % (
                      float(match_count * 100.0) / float(max(count_with_date, 1)),
                      time_left / 60))
        fields = tuple((k, v.strip(' /,;:')) for k, v in get_subfields(line, 'abcd'))
        if prev_fields == fields:
            continue  # skip consecutive duplicate headings
        prev_fields = fields
        dates = pick_first_date(v for k, v in fields if k == 'd')
        # First (key, value) pair of ('date', '') means no usable date.
        if next(iter(dates.items())) == ('date', ''):
            continue
        count_with_date += 1
        if verbose:
            print(fmt_line(get_subfields(line, 'abcd')))
            print(dates)
        is_noble_or_clergy = any(re_noble_or_clergy.search(v)
                                 for v in get_subfield_values(line, 'c'))
        found = name_lookup(c, fields)
        if not found:
            continue
        if is_noble_or_clergy:
            print('noble or clergy not found:')
            print(fmt_line(get_subfields(line, 'abcd')))
            print()
            continue
        match = {}
        seen = set()
        for name, cats, match_name, pd in found:
            if name in seen:
                continue
            seen.add(name)
            cats = eval(cats)  # category list stored as its repr()
            # Only candidates with birth/death categories can be date-checked.
            if not any(cat.endswith(' births') or cat.endswith(' deaths')
                       for cat in cats):
                continue
            dm = date_match(dates, cats)
            if dm:
                match[name] = (cats, match_name)
            if not verbose:
                continue
            print((name, match_name))
            print("cats =", cats)
            print(('match' if dm else 'no match'))
            for field in ['birth', 'death']:
                print(field + 's:', [i[:-(len(field)+2)] for i in cats
                                     if i.endswith(' %ss' % field)], end=' ')
            print()
        if verbose:
            print('---')
        if not match:
            continue
        if is_noble_or_clergy:
            print('noble or clergy not found:')
            print(fmt_line(get_subfields(line, 'abcd')))
            print(found)
            print()
            continue
        match_count += 1
        # articles.add(match.keys()[0])
        if len(match) != 1:
            # Try to disambiguate; report if still more than one candidate.
            match = pick_from_match(match)
            if len(match) != 1:
                print(count, match_count)
                print(fmt_line(get_subfields(line, 'abcd')))
                more_than_one_match(match)
            else:
                #print (list(get_subfields(line, 'abcd')), match.keys()[0])
                # next(iter(match)) is the single remaining matched name.
                print((next(iter(match)), fields), file=fh)
            continue
        # print len(articles), match[0][0], fmt_line(get_subfields(line, 'abcd'))
        assert len(match) == 1
    print(match_count)
    fh.close()
def get_contents(line, want):
    """Group wanted subfield values by subfield code.

    Returns a plain dict mapping each seen code to the list of its values,
    preserving order of appearance.
    """
    grouped = {}
    for code, value in get_subfields(line, want):
        if code not in grouped:
            grouped[code] = []
        grouped[code].append(value)
    return grouped