def read_isbn(fields):
    """Collect the ISBNs recorded in the '020' entries of ``fields``.

    For an entry containing the '\\x1f' delimiter, each 'a' and 'z'
    subfield value is matched against ``re_isbn_and_price`` first and
    ``re_isbn`` second; the first pattern that matches supplies group 1.
    An entry without the delimiter is matched against ``re_isbn`` after
    dropping its first three characters and its last character.

    The matches are passed through ``tidy_isbn`` and de-duplicated in
    order.  Values of length 13 are filed under 'isbn_13'; any other value
    of length 16 or less goes under 'isbn_10'; longer values are dropped.

    Returns a dict with zero, one or both of those keys; ``{}`` when
    ``fields`` has no '020' entry.
    """
    if '020' not in fields:
        return {}

    candidates = []
    for line in fields['020']:
        if '\x1f' not in line:
            texts = [line[3:-1]]
            patterns = (re_isbn,)
        else:
            texts = [value for _code, value in get_subfields(line, ['a', 'z'])]
            patterns = (re_isbn_and_price, re_isbn)
        for text in texts:
            for pattern in patterns:
                hit = pattern.match(text)
                if hit:
                    candidates.append(hit.group(1))
                    break

    result = {}
    emitted = set()
    for isbn in tidy_isbn(candidates):
        if isbn in emitted:
            # already filed once; keep only the first occurrence
            continue
        emitted.add(isbn)
        size = len(isbn)
        if size == 13:
            key = 'isbn_13'
        elif size <= 16:
            key = 'isbn_10'
        else:
            continue
        result.setdefault(key, []).append(isbn)
    return result
def find_aspects(line):
    """Build an "<aspect> of <topic>" heading from the 'a'/'x' subfields.

    Only the first two subfields returned by ``get_subfields(line, 'ax')``
    are considered: the first must be an 'a' and the second an 'x'.  Both
    values have surrounding dots and spaces stripped.  The 'x' value must
    match ``re_aspects``; otherwise nothing is produced.

    Returns the 'x' value, ' of ', and ``flip_subject`` applied to the 'a'
    value (with 'Body, Human' replaced by 'the Human body' first), or
    ``None`` when the line does not qualify.
    """
    pairs = [(code, value) for code, value in get_subfields(line, 'ax')]
    if len(pairs) < 2:
        return None
    (first_code, topic), (second_code, aspect) = pairs[0], pairs[1]
    if first_code != 'a' or second_code != 'x':
        return None

    aspect = aspect.strip('. ')
    topic = topic.strip('. ')
    if not re_aspects.search(aspect):
        return None
    if topic == 'Body, Human':
        topic = 'the Human body'
    return aspect + ' of ' + flip_subject(topic)
def read_oclc(fields):
    """Collect OCLC numbers from the '001'/'003' and '035' entries.

    When both '003' and '001' are present and the first '003' value is
    'OCoLC', the first '001' value is taken as an OCLC number; it is
    asserted to consist of digits only.  In addition, every 'a' subfield
    of each '035' entry that matches ``re_oclc`` contributes group 1 of
    the match, unless that number was already collected.

    Returns ``{'oclc_number': [...]}`` (passed through
    ``remove_duplicates``) or ``{}`` when nothing was found.

    Raises AssertionError if the '001' value used is not all digits.
    """
    numbers = []

    has_control_fields = '003' in fields and '001' in fields
    if has_control_fields and fields['003'][0] == 'OCoLC':
        control_number = fields['001'][0]
        assert control_number.isdigit()
        numbers.append(control_number)

    for line in fields.get('035', []):
        for _code, value in get_subfields(line, ['a']):
            hit = re_oclc.match(value)
            if not hit:
                continue
            number = hit.group(1)
            if number not in numbers:
                numbers.append(number)

    if not numbers:
        return {}
    return {'oclc_number': remove_duplicates(numbers)}
def read_series(fields):
    """Collect series statements from the '440', '490' and '830' entries.

    For each entry the 'a' and 'v' subfield values are gathered in order.
    A non-empty 'v' value is kept verbatim; every other value has trailing
    '.', ',', ';' and spaces removed and is kept only if something is
    left.  The values kept for one entry are joined with ' -- '.

    Returns ``{'series': [...]}`` or ``{}`` when no entry produced text.
    """
    series = []
    for tag in ('440', '490', '830'):
        if tag in fields:
            for line in fields[tag]:
                parts = []
                for code, value in get_subfields(line, ['a', 'v']):
                    # a non-empty 'v' value is taken verbatim; anything
                    # else loses its trailing punctuation first
                    if code != 'v' or not value:
                        value = value.rstrip('.,; ')
                    if value:
                        parts.append(value)
                if parts:
                    series.append(' -- '.join(parts))

    if series:
        return {'series': series}
    return {}
def read_lccn(fields):
    """Collect LCCN values from the 'a' subfields of the '010' entries.

    Each value is stripped of surrounding whitespace.  Values matching
    ``re_question`` are skipped, as are values in which ``re_lccn`` finds
    nothing.  Group 1 of the ``re_lccn`` match has everything matching
    ``re_letters`` removed and is stripped again; non-empty results are
    kept.

    Returns ``{}`` when ``fields`` has no '010' entry; otherwise
    ``{'lccn': [...]}``, where the list may be empty.
    """
    if '010' not in fields:
        return {}

    numbers = []
    for line in fields['010']:
        for _code, value in get_subfields(line, ['a']):
            candidate = value.strip()
            if re_question.match(candidate):
                continue
            hit = re_lccn.search(candidate)
            if hit:
                cleaned = re_letters.sub('', hit.group(1)).strip()
                if cleaned:
                    numbers.append(cleaned)
    return {'lccn': numbers}
def test_lookup():
    """Print name-lookup diagnostics for every line of ``marc``.

    For each line, the 'a', 'b', 'c' and 'd' subfields are stripped of
    surrounding ' /,;:' characters and passed to ``name_lookup``.  The
    candidates, the date chosen by ``pick_first_date`` from the 'd'
    values, the number of matches from ``look_for_match`` and the matches
    themselves are printed.  If there is not exactly one match,
    ``pick_from_match`` narrows the list; if that still does not leave
    exactly one, the output of ``more_than_one_match`` is printed too.
    A blank line separates records.

    Every ``print`` call takes a single argument so the function parses,
    and prints the same text, on both Python 2 and Python 3.
    """
    for line in marc:
        fields = tuple(
            (code, value.strip(' /,;:'))
            for code, value in get_subfields(line, 'abcd')
        )
        found = name_lookup(fields)
        for candidate in found:
            print(candidate)

        dates = pick_first_date(value for code, value in fields if code == 'd')
        print(dates)

        match = look_for_match(found, dates, False)
        print(len(match))
        for item in match:
            print(item)

        if len(match) != 1:
            match = pick_from_match(match)
            # still ambiguous (or empty) after narrowing: show the details
            if len(match) != 1:
                for item in more_than_one_match(match):
                    print(item)
        print('')
def test_lookup():
    """Print name-lookup diagnostics for every line of ``marc``.

    For each line, the 'a', 'b', 'c' and 'd' subfields are stripped of
    surrounding ' /,;:' characters and passed to ``name_lookup``.  The
    candidates, the date chosen by ``pick_first_date`` from the 'd'
    values, the number of matches from ``look_for_match`` and the matches
    themselves are printed.  If there is not exactly one match,
    ``pick_from_match`` narrows the list; if that still does not leave
    exactly one, the output of ``more_than_one_match`` is printed too.
    A blank line separates records.

    Every ``print`` call takes a single argument so the function parses,
    and prints the same text, on both Python 2 and Python 3.
    """
    for line in marc:
        fields = tuple(
            (code, value.strip(' /,;:'))
            for code, value in get_subfields(line, 'abcd')
        )
        found = name_lookup(fields)
        for candidate in found:
            print(candidate)

        dates = pick_first_date(value for code, value in fields if code == 'd')
        print(dates)

        match = look_for_match(found, dates, False)
        print(len(match))
        for item in match:
            print(item)

        if len(match) != 1:
            match = pick_from_match(match)
            # still ambiguous (or empty) after narrowing: show the details
            if len(match) != 1:
                for item in more_than_one_match(match):
                    print(item)
        print('')
def find_subjects(marc_subjects):
    """Tally the subject headings found in groups of subject fields.

    ``marc_subjects`` is an iterable of groups, each an iterable of
    ``(tag, line)`` pairs.  Per field:

    * a heading from ``find_aspects`` is counted as a subject;
    * fields matching ``re_large_book`` are skipped after that step;
    * tag '600' yields a person, '610' organisations, '611' an event,
      '630' works, '650' subjects and '651' places; any other tag is
      reported on stdout;
    * a "<topic> in fiction" subject is added when the field ends in a
      'fiction' value preceded by a value without ')';
    * 'y' subfields are counted as times, 'v' and 'x' as subjects and
      'z' as places ('x' values matching ``re_aspects`` are skipped when
      an aspect heading was already produced).

    Returns a dict with any of the keys 'person', 'time', 'place',
    'subject', 'event', 'org' and 'work', each mapping a heading to its
    count.  Empty categories are omitted, so empty input gives ``{}``.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    period = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # counted once for the joined a/b/c/d value ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1
                # ... and once more for each 'a' value on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # diagnostic for tags without a dedicated branch; written
                # as a single-argument print so it parses on Python 2 and 3
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [v for k, v in get_all_subfields(line)
                   if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            # (same strip set as the filter that built ``cur``)
            if (len(cur) > 1
                    and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    period[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # already represented by the aspect heading counted above
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if person:
        ret['person'] = dict(person)
    if period:
        ret['time'] = dict(period)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def find_subjects(marc_subjects):
    """Tally the subject headings found in groups of subject fields.

    ``marc_subjects`` is an iterable of groups, each an iterable of
    ``(tag, line)`` pairs.  Per field:

    * a heading from ``find_aspects`` is counted as a subject;
    * fields matching ``re_large_book`` are skipped after that step;
    * tag '600' yields a person, '610' organisations, '611' an event,
      '630' works, '650' subjects and '651' places; any other tag is
      reported on stdout;
    * a "<topic> in fiction" subject is added when the field ends in a
      'fiction' value preceded by a value without ')';
    * 'y' subfields are counted as times, 'v' and 'x' as subjects and
      'z' as places ('x' values matching ``re_aspects`` are skipped when
      an aspect heading was already produced).

    Returns a dict with any of the keys 'person', 'time', 'place',
    'subject', 'event', 'org' and 'work', each mapping a heading to its
    count.  Empty categories are omitted, so empty input gives ``{}``.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    period = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1
            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # counted once for the joined a/b/c/d value ...
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1
                # ... and once more for each 'a' value on its own
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(j.strip() for i, j in get_all_subfields(line)
                             if i not in 'vxyz')
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # diagnostic for tags without a dedicated branch; written
                # as a single-argument print so it parses on Python 2 and 3
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [v for k, v in get_all_subfields(line)
                   if k == 'a' or v.strip('. ').lower() == 'fiction']

            # skip: 'Good, Sally (Fictitious character) in fiction'
            # (same strip set as the filter that built ``cur``)
            if (len(cur) > 1
                    and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    period[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                # already represented by the aspect heading counted above
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if person:
        ret['person'] = dict(person)
    if period:
        ret['time'] = dict(period)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def get_subfield_values(line, want):
    """Return the values, without their codes, of the wanted subfields.

    The values are those yielded by ``get_subfields(line, want)``, in the
    same order.
    """
    values = []
    for _code, value in get_subfields(line, want):
        values.append(value)
    return values
def get_contents(line, want):
    """Group the wanted subfield values of ``line`` by subfield code.

    Returns a dict mapping each code yielded by
    ``get_subfields(line, want)`` to the list of its values, in the order
    they were yielded.
    """
    grouped = {}
    for code, value in get_subfields(line, want):
        if code in grouped:
            grouped[code].append(value)
        else:
            grouped[code] = [value]
    return grouped
def find_subjects(w, marc_subjects=None):
    """Tally people, times, places and subjects from subject fields.

    ``marc_subjects`` is an iterable of groups, each an iterable of
    ``(tag, line)`` pairs.  When it is falsy (``None`` or empty) the
    groups come from ``get_marc_subjects(w)`` instead; ``w`` is not used
    otherwise.

    Fields matching ``re_large_book`` are skipped.  Tag '600' yields a
    person, '650' subjects and '651' places.  For every remaining field,
    'y' subfields are counted as times, 'v' and 'x' as subjects and 'z'
    as places.

    Returns a dict with any of the keys 'people', 'times', 'places' and
    'subjects', each mapping a heading to its count; empty categories are
    omitted.
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if k == 'd':
                        v = '(' + v.strip('.() ') + ')'
                    else:
                        v = v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    people[name] += 1
            elif tag == '650':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret