def read_author_person(f):
    """Build an author dict for a person from a decoded MARC name field.

    :param f: field object providing ``remove_brackets``, ``get_contents``
        and ``get_subfield_values``. ``remove_brackets`` is called on it, so
        the field is modified in place.
    :return: dict with ``name`` and ``entity_type`` ('person') plus whichever
        of ``personal_name``, ``numeration``, ``title``, ``role``,
        ``fuller_name`` and the date keys produced by ``pick_first_date``
        apply; ``None`` when the field has neither subfield ``a`` nor ``c``.
    """
    f.remove_brackets()
    contents = f.get_contents(['a', 'b', 'c', 'd', 'e'])
    if 'a' not in contents and 'c' not in contents:
        return None  # should at least be a name or title

    name = [v.strip(' /,;:') for v in f.get_subfield_values(['a', 'b', 'c'])]

    author = {}
    if 'd' in contents:
        author = pick_first_date(strip_foc(d).strip(',') for d in contents['d'])
        death_date = author.get('death_date')
        # Drop the final character when the death date matches re_number_dot.
        if death_date and re_number_dot.search(death_date):
            author['death_date'] = death_date[:-1]

    author['name'] = ' '.join(name)
    author['entity_type'] = 'person'

    subfields = [
        ('a', 'personal_name'),
        ('b', 'numeration'),
        ('c', 'title'),
        ('e', 'role'),
    ]
    for subfield, field_name in subfields:
        if subfield in contents:
            joined = ' '.join(x.strip(' /,;:') for x in contents[subfield])
            author[field_name] = remove_trailing_dot(joined)

    # NOTE(review): 'q' is not among the subfields requested from
    # get_contents() above; if get_contents() only returns what was asked
    # for, this branch never runs -- confirm against its implementation.
    if 'q' in contents:
        author['fuller_name'] = ' '.join(contents['q'])

    # 'personal_name' only exists when subfield 'a' was present; a field with
    # only 'c' passes the guard above, so check before cleaning each key.
    for key in ('name', 'personal_name'):
        if key in author:
            author[key] = remove_trailing_dot(strip_foc(author[key]))
    return author
def read_work_titles(rec):
    """Collect work titles from the 240 and 130 fields of a record.

    For each 240 field the values of subfields a, m, n, p and r are joined
    with spaces. For each 130 field every subfield whose code is lowercase
    and not 'n' is joined. Surrounding commas and a trailing dot are removed.

    :param rec: record object providing ``get_fields``.
    :return: whatever ``remove_duplicates`` returns for the collected titles.
    """
    titles = []

    for field in rec.get_fields('240') or ():
        parts = field.get_subfield_values(['a', 'm', 'n', 'p', 'r'])
        joined = ' '.join(parts).strip(',')
        titles.append(remove_trailing_dot(joined))

    for field in rec.get_fields('130') or ():
        joined = ' '.join(
            value
            for code, value in field.get_all_subfields()
            if code.islower() and code != 'n'
        )
        titles.append(remove_trailing_dot(joined.strip(',')))

    return remove_duplicates(titles)
def read_title(rec):
    """Extract title information from the first 245 (or, failing that, 740) field.

    :param rec: record object providing ``get_fields``.
    :return: dict with ``title`` and, when available, ``subtitle``,
        ``by_statement`` and ``physical_format``.
    :raises NoTitle: when there is no 245/740 field, or no title text at all.
    :raises SeeAlsoAsTitle: when the title is exactly 'See.' or 'See also.'.
    """
    fields = rec.get_fields('245') or rec.get_fields('740')
    if not fields:
        raise NoTitle

    # example MARC record with multiple titles:
    # http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:299505697:862
    title_field = fields[0]
    contents = title_field.get_contents(['a', 'b', 'c', 'h', 'p'])
    b_and_p = [
        value for value in title_field.get_subfield_values(['b', 'p']) if value
    ]

    # MARC record with 245a missing:
    # http://openlibrary.org/show-marc/marc_western_washington_univ/wwu_bibs.mrc_revrev.mrc:516779055:1304
    if 'a' in contents:
        title = ' '.join(part.strip(' /,;:') for part in contents['a'])
    elif b_and_p:
        # No $a: promote the first $b/$p value to title; the rest stay
        # available for the subtitle.
        title = b_and_p.pop(0).strip(' /,;:')
    else:
        title = None

    # talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:183427199:255
    if title in ('See.', 'See also.'):
        raise SeeAlsoAsTitle

    # talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:5654086:483
    # scrapbooksofmoun03tupp
    if title is None:
        # Last resort: join every subfield value of the field.
        title = ' '.join(
            pair[1] for pair in list(title_field.get_all_subfields())
        )
        if not title:  # ia:scrapbooksofmoun03tupp
            raise NoTitle

    ret = {'title': remove_trailing_dot(title)}

    if b_and_p:
        cleaned = (remove_trailing_dot(part.strip(' /,;:')) for part in b_and_p)
        ret["subtitle"] = ' : '.join(cleaned)

    if 'c' in contents:
        ret["by_statement"] = remove_trailing_dot(' '.join(contents['c']))

    if 'h' in contents:
        physical = ' '.join(contents['h']).strip(' ')
        bracketed = re_bracket_field.match(physical)
        if bracketed:
            physical = bracketed.group(1)
        # An empty $h is treated as a hard error here.
        assert physical
        ret["physical_format"] = physical

    return ret
def read_contributions(rec):
    """Read contributors from the 700/710/711/720 fields of a record.

    When the record has no 100/110/111 field, the 7xx fields are used to set
    ``authors``; the remaining 7xx fields are returned as ``contributions``
    (plain name strings).

    :param rec: record object providing ``get_fields``, ``read_fields`` and
        ``decode_field``.
    :return: dict that may contain ``authors`` (list of dicts) and
        ``contributions`` (list of str).
    """
    # Subfield codes of interest for each contributor tag.
    want = {
        '700': 'abcdeq',
        '710': 'ab',
        '711': 'acdn',
        '720': 'a',
    }
    # entity_type assigned to an author taken from a 710 / 711 field.
    entity_types = {'710': 'org', '711': 'event'}

    ret = {}
    skip_authors = set()
    for tag in ('100', '110', '111'):
        for f in rec.get_fields(tag):
            skip_authors.add(tuple(f.get_all_subfields()))

    if not skip_authors:
        for tag, f in rec.read_fields(['700', '710', '711', '720']):
            f = rec.decode_field(f)
            if tag in ('700', '720'):
                if 'authors' not in ret or last_name_in_245c(rec, f):
                    author = read_author_person(f)
                    # read_author_person() returns None for a field with
                    # neither $a nor $c; never store that as an author. The
                    # field is then not skipped below and is listed as a
                    # contribution instead.
                    if author is not None:
                        ret.setdefault('authors', []).append(author)
                        skip_authors.add(tuple(f.get_subfields(want[tag])))
                continue
            if 'authors' in ret:
                # A person is already the author: stop at the first 710/711.
                break
            if tag in entity_types:
                name = [
                    v.strip(' /,;:') for v in f.get_subfield_values(want[tag])
                ]
                ret['authors'] = [{
                    'entity_type': entity_types[tag],
                    'name': remove_trailing_dot(' '.join(name)),
                }]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break

    for tag, f in rec.read_fields(['700', '710', '711', '720']):
        cur = tuple(rec.decode_field(f).get_subfields(want[tag]))
        if cur in skip_authors:
            continue
        name = remove_trailing_dot(
            ' '.join(strip_foc(pair[1]) for pair in cur).strip(',')
        )
        ret.setdefault('contributions', []).append(name)  # need to add flip_name
    return ret
def flip_place(s):
    """Reorder a comma-inverted place name after removing its trailing dot.

    A value matching ``re_paren`` is returned as is (after trailing-dot
    removal), e.g.:
        Whitechapel (London, England)
        East End (London, England)
        Whitechapel (Londres, Inglaterra)
    Otherwise, when ``re_place_comma`` matches, its second group is placed
    before its first, separated by a space.

    :param s: place name string.
    :return: the rewritten place name.
    """
    place = remove_trailing_dot(s)
    if re_paren.search(place):
        return place
    match = re_place_comma.match(place)
    if not match:
        return place
    return match.group(2) + ' ' + match.group(1)
def test_remove_trailing_dot():
    """Check remove_trailing_dot against a table of (input, expected) pairs."""
    cases = (
        ('Test', 'Test'),
        ('Test.', 'Test'),
        ('Test J.', 'Test J.'),
        ('Test...', 'Test...'),
    )
    for raw, expected in cases:
        result = remove_trailing_dot(raw)
        assert result == expected
def tidy_subject(s):
    """Normalise a subject heading string.

    Strips surrounding whitespace and upper-cases the first character. When
    ``re_etc`` matches, its first group is returned directly. Otherwise the
    trailing dot is removed and, when ``re_fictitious_character`` matches,
    its first two groups are swapped.

    :param s: subject string.
    :return: the tidied subject string.
    """
    s = s.strip()
    if len(s) < 2:
        # Written so it is valid on Python 3 and prints the same text as the
        # old Python 2 statement (label followed by repr of the value).
        print('short subject: %r' % (s,))
    else:
        s = s[0].upper() + s[1:]

    m = re_etc.search(s)
    if m:
        return m.group(1)

    s = remove_trailing_dot(s)
    m = re_fictitious_character.match(s)
    if m:
        return m.group(2) + ' ' + m.group(1) + m.group(3)
    return s
def read_authors(rec):
    """Read the authors from the 100, 110 and 111 fields of a record.

    :param rec: record object providing ``get_fields``.
    :return: list of author dicts (``entity_type`` is 'person', 'org' or
        'event'), or ``None`` when no author could be read.
    """
    fields_100 = rec.get_fields('100')
    fields_110 = rec.get_fields('110')
    fields_111 = rec.get_fields('111')
    if len(fields_100) + len(fields_110) + len(fields_111) == 0:
        return None

    # talis_openlibrary_contribution/talis-openlibrary-contribution.mrc:11601515:773 has two authors:
    #   100 1  $aDowling, James Walter Frederick.
    #   111 2  $aConference on Civil Engineering Problems Overseas.

    # read_author_person() returns None for a field with neither $a nor $c;
    # keep such entries out of the result.
    found = []
    for f in fields_100:
        person = read_author_person(f)
        if person is not None:
            found.append(person)

    # (fields, subfield codes making up the name, entity type)
    corporate = (
        (fields_110, ['a', 'b'], 'org'),
        (fields_111, ['a', 'c', 'd', 'n'], 'event'),
    )
    for fields, codes, entity_type in corporate:
        for f in fields:
            f.remove_brackets()
            name = [v.strip(' /,;:') for v in f.get_subfield_values(codes)]
            found.append({
                'entity_type': entity_type,
                'name': remove_trailing_dot(' '.join(name)),
            })

    return found or None
def read_subjects(rec):
    """Count the subject headings found in a record's subject fields.

    Fields are read with ``rec.read_fields(subject_fields)`` and tallied by
    category: 'person' (600), 'org' (610), 'event' (611), 'work' (630),
    'subject' (650 and subfields v/x), 'place' (651 and subfield z) and
    'time' (subfield y).

    :param rec: record object providing ``read_fields`` and ``decode_field``.
    :return: dict mapping category name to a dict of {heading: count}.
    """
    subjects = defaultdict(lambda: defaultdict(int))

    def clean(text):
        # strip -> remove trailing dot -> tidy; each later step runs only
        # while the text is still non-empty.
        text = text.strip()
        if text:
            text = remove_trailing_dot(text).strip()
        if text:
            text = tidy_subject(text)
        return text

    for tag, raw_field in rec.read_fields(subject_fields):
        f = rec.decode_field(raw_field)
        aspects = find_aspects(f)

        if tag == '600':  # people
            parts = []
            for code, value in f.get_subfields(['a', 'b', 'c', 'd']):
                if code == 'd':
                    # dates are wrapped in parentheses
                    value = '(' + value.strip('.() ') + ')'
                else:
                    value = value.strip(' /,;:')
                    if code == 'a' and re_flip_name.match(value):
                        value = flip_name(value)
                parts.append(value)
            person = remove_trailing_dot(' '.join(parts)).strip()
            if person != '':
                subjects['person'][person] += 1

        elif tag == '610':  # org
            # Counted once as the joined a/b/c/d value, then again for each
            # individual $a value.
            org = clean(' '.join(f.get_subfield_values('abcd')))
            if org:
                subjects['org'][org] += 1
            for value in f.get_subfield_values('a'):
                org = clean(value)
                if org:
                    subjects['org'][org] += 1

        elif tag == '611':  # event
            joined = ' '.join(
                value.strip()
                for code, value in f.get_all_subfields()
                if code not in 'vxyz'
            )
            event = tidy_subject(joined.strip() if joined else joined)
            if event:
                subjects['event'][event] += 1

        elif tag == '630':  # work
            for value in f.get_subfield_values(['a']):
                work = clean(value)
                if work:
                    subjects['work'][work] += 1

        elif tag == '650':  # topical
            for value in f.get_subfield_values(['a']):
                topic = tidy_subject(value.strip() if value else value)
                if topic:
                    subjects['subject'][topic] += 1

        elif tag == '651':  # geo
            for value in f.get_subfield_values(['a']):
                if value:
                    subjects['place'][flip_place(value).strip()] += 1

        # Subdivisions below are handled for every tag.
        for value in f.get_subfield_values(['y']):
            value = value.strip()
            if value:
                subjects['time'][remove_trailing_dot(value).strip()] += 1

        for value in f.get_subfield_values(['v']):
            value = value.strip()
            if value:
                value = remove_trailing_dot(value).strip()
            value = tidy_subject(value)
            if value:
                subjects['subject'][value] += 1

        for value in f.get_subfield_values(['z']):
            value = value.strip()
            if value:
                subjects['place'][flip_place(value).strip()] += 1

        for value in f.get_subfield_values(['x']):
            value = value.strip()
            if not value or (aspects and re_aspects.search(value)):
                continue
            value = tidy_subject(value)
            if value:
                subjects['subject'][value] += 1

    return {kind: dict(counts) for kind, counts in subjects.items()}
def tidy_subfield(v):
    """Strip surrounding ' /,;:' characters from *v*, then its trailing dot."""
    trimmed = v.strip(' /,;:')
    return remove_trailing_dot(trimmed)
def find_subjects(marc_subjects):
    """Tally subject headings from raw MARC subject lines.

    :param marc_subjects: iterable whose items are iterables of
        ``(tag, line)`` pairs; ``line`` is passed to ``get_subfields``,
        ``get_subfield_values`` and ``get_all_subfields``.
    :return: dict with any of the keys 'person', 'time', 'place', 'subject',
        'event', 'org', 'work', each mapping to {heading: count}. Categories
        with no headings are omitted.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    time = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1

            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # Counted once as the joined a/b/c/d value, then again for
                # each individual $a value.
                v = ' '.join(get_subfield_values(line, 'abcd'))
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    org[v] += 1
                for v in get_subfield_values(line, 'a'):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        org[v] += 1
            elif tag == '611':  # event
                v = ' '.join(
                    j.strip() for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                if v:
                    v = v.strip()
                v = tidy_subject(v)
                if v:
                    event[v] += 1
            elif tag == '630':  # work
                for v in get_subfield_values(line, ['a']):
                    v = v.strip()
                    if v:
                        v = remove_trailing_dot(v).strip()
                    if v:
                        v = tidy_subject(v)
                    if v:
                        work[v] += 1
            elif tag == '650':  # topical
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':  # geo
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1
            else:
                # Valid on Python 3; prints the same text as the former
                # Python 2 print statement.
                print('other %s %s' % (tag, list(get_all_subfields(line))))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]
            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    time[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    v = remove_trailing_dot(v).strip()
                v = tidy_subject(v)
                if v:
                    subject[v] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if not v:
                    continue
                if aspects and re_aspects.search(v):
                    continue
                v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if person:
        ret['person'] = dict(person)
    if time:
        ret['time'] = dict(time)
    if place:
        ret['place'] = dict(place)
    if subject:
        ret['subject'] = dict(subject)
    if event:
        ret['event'] = dict(event)
    if org:
        ret['org'] = dict(org)
    if work:
        ret['work'] = dict(work)
    return ret
def find_subjects(marc_subjects):
    """Tally subject headings from raw MARC subject lines.

    :param marc_subjects: iterable whose items are iterables of
        ``(tag, line)`` pairs; ``line`` is passed to ``get_subfields``,
        ``get_subfield_values`` and ``get_all_subfields``.
    :return: dict with any of the keys 'person', 'time', 'place', 'subject',
        'event', 'org', 'work', each mapping to {heading: count}. Categories
        with no headings are omitted.
    """
    person = defaultdict(int)
    event = defaultdict(int)
    work = defaultdict(int)
    org = defaultdict(int)
    times = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    def clean(text):
        # strip -> remove trailing dot -> tidy; each later step runs only
        # while the text is still non-empty.
        text = text.strip()
        if text:
            text = remove_trailing_dot(text).strip()
        if text:
            text = tidy_subject(text)
        return text

    for lines in marc_subjects:
        for tag, line in lines:
            aspects = find_aspects(line)
            if aspects:
                subject[aspects] += 1

            if re_large_book.search(line):
                continue

            if tag == '600':  # people
                parts = []
                for code, value in get_subfields(line, ['a', 'b', 'c', 'd']):
                    if code == 'd':
                        # dates are wrapped in parentheses
                        value = '(' + value.strip('.() ') + ')'
                    else:
                        value = value.strip(' /,;:')
                        if code == 'a':
                            if value == 'Mao, Zedong':
                                value = 'Mao Zedong'
                            elif re_flip_name.match(value):
                                value = flip_name(value)
                    parts.append(value)
                name = remove_trailing_dot(' '.join(parts)).strip()
                if name != '':
                    person[name] += 1
            elif tag == '610':  # org
                # Counted once as the joined a/b/c/d value, then again for
                # each individual $a value.
                value = clean(' '.join(get_subfield_values(line, 'abcd')))
                if value:
                    org[value] += 1
                for value in get_subfield_values(line, 'a'):
                    value = clean(value)
                    if value:
                        org[value] += 1
            elif tag == '611':  # event
                value = ' '.join(
                    j.strip() for i, j in get_all_subfields(line)
                    if i not in 'vxyz'
                )
                if value:
                    value = value.strip()
                value = tidy_subject(value)
                if value:
                    event[value] += 1
            elif tag == '630':  # work
                for value in get_subfield_values(line, ['a']):
                    value = clean(value)
                    if value:
                        work[value] += 1
            elif tag == '650':  # topical
                for value in get_subfield_values(line, ['a']):
                    if value:
                        value = value.strip()
                    value = tidy_subject(value)
                    if value:
                        subject[value] += 1
            elif tag == '651':  # geo
                for value in get_subfield_values(line, ['a']):
                    if value:
                        place[flip_place(value).strip()] += 1
            else:
                print('other', tag, list(get_all_subfields(line)))

            cur = [
                v for k, v in get_all_subfields(line)
                if k == 'a' or v.strip('. ').lower() == 'fiction'
            ]
            # skip: 'Good, Sally (Fictitious character) in fiction'
            if (len(cur) > 1 and cur[-1].strip('. ').lower() == 'fiction'
                    and ')' not in cur[-2]):
                subject[flip_subject(cur[-2]) + ' in fiction'] += 1

            for value in get_subfield_values(line, ['y']):
                value = value.strip()
                if value:
                    times[remove_trailing_dot(value).strip()] += 1
            for value in get_subfield_values(line, ['v']):
                value = value.strip()
                if value:
                    value = remove_trailing_dot(value).strip()
                value = tidy_subject(value)
                if value:
                    subject[value] += 1
            for value in get_subfield_values(line, ['z']):
                value = value.strip()
                if value:
                    place[flip_place(value).strip()] += 1
            for value in get_subfield_values(line, ['x']):
                value = value.strip()
                if not value or (aspects and re_aspects.search(value)):
                    continue
                value = tidy_subject(value)
                if value:
                    subject[value] += 1

    # Result keys are inserted in this fixed order; empty tallies are left out.
    ret = {}
    for key, counts in (
        ('person', person),
        ('time', times),
        ('place', place),
        ('subject', subject),
        ('event', event),
        ('org', org),
        ('work', work),
    ):
        if counts:
            ret[key] = dict(counts)
    return ret
def read_contributions(rec):
    """
    Reads contributors from a MARC record
    and use values in 7xx fields to set 'authors'
    if the 1xx fields do not exist. Otherwise set
    additional 'contributions'

    A 700/720 field for which read_author_person() returns None (no $a and
    no $c) is not added to 'authors'; it is listed under 'contributions'.

    :param (MarcBinary | MarcXml) rec:
    :rtype: dict
    :return: may contain 'authors' (list of dicts) and 'contributions'
        (list of name strings)
    """
    # Subfield codes of interest for each contributor tag.
    want = {
        '700': 'abcdeq',
        '710': 'ab',
        '711': 'acdn',
        '720': 'a',
    }
    # entity_type assigned to an author taken from a 710 / 711 field.
    entity_types = {'710': 'org', '711': 'event'}

    ret = {}
    skip_authors = set()
    for tag in ('100', '110', '111'):
        for f in rec.get_fields(tag):
            skip_authors.add(tuple(f.get_all_subfields()))

    if not skip_authors:
        for tag, f in rec.read_fields(['700', '710', '711', '720']):
            f = rec.decode_field(f)
            if tag in ('700', '720'):
                if 'authors' not in ret or last_name_in_245c(rec, f):
                    author = read_author_person(f)
                    # Guard against the None result so that 'authors' only
                    # ever holds author dicts.
                    if author is not None:
                        ret.setdefault('authors', []).append(author)
                        skip_authors.add(tuple(f.get_subfields(want[tag])))
                continue
            if 'authors' in ret:
                # A person is already the author: stop at the first 710/711.
                break
            if tag in entity_types:
                name = [
                    v.strip(' /,;:') for v in f.get_subfield_values(want[tag])
                ]
                ret['authors'] = [{
                    'entity_type': entity_types[tag],
                    'name': remove_trailing_dot(' '.join(name)),
                }]
                skip_authors.add(tuple(f.get_subfields(want[tag])))
                break

    for tag, f in rec.read_fields(['700', '710', '711', '720']):
        cur = tuple(rec.decode_field(f).get_subfields(want[tag]))
        if cur in skip_authors:
            continue
        name = remove_trailing_dot(
            ' '.join(strip_foc(pair[1]) for pair in cur).strip(',')
        )
        ret.setdefault('contributions', []).append(name)  # need to add flip_name
    return ret
def find_subjects(w, marc_subjects=None):
    """Tally people, times, places and subjects from MARC subject lines.

    :param w: passed to ``get_marc_subjects`` when *marc_subjects* is falsy.
    :param marc_subjects: optional iterable whose items are iterables of
        ``(tag, line)`` pairs.
    :return: dict with any of the keys 'people', 'times', 'places',
        'subjects', each mapping to {heading: count}. Categories with no
        headings are omitted.
    """
    people = defaultdict(int)
    when = defaultdict(int)
    place = defaultdict(int)
    subject = defaultdict(int)

    for lines in marc_subjects or get_marc_subjects(w):
        for tag, line in lines:
            if re_large_book.match(line):
                continue

            if tag == '600':  # people
                name_and_date = []
                for k, v in get_subfields(line, ['a', 'b', 'c', 'd']):
                    v = '(' + v.strip('.() ') + ')' if k == 'd' else v.strip(' /,;:')
                    if k == 'a':
                        if v == 'Mao, Zedong':
                            v = 'Mao Zedong'
                        elif re_flip_name.match(v):
                            v = flip_name(v)
                    name_and_date.append(v)
                name = remove_trailing_dot(' '.join(name_and_date)).strip()
                if name != '':
                    people[name] += 1
            elif tag == '650':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        v = v.strip()
                    v = tidy_subject(v)
                    if v:
                        subject[v] += 1
            elif tag == '651':
                for v in get_subfield_values(line, ['a']):
                    if v:
                        place[flip_place(v).strip()] += 1

            # Subdivisions below are handled for every tag.
            for v in get_subfield_values(line, ['y']):
                v = v.strip()
                if v:
                    when[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['v']):
                v = v.strip()
                if v:
                    subject[remove_trailing_dot(v).strip()] += 1
            for v in get_subfield_values(line, ['z']):
                v = v.strip()
                if v:
                    place[flip_place(v).strip()] += 1
            for v in get_subfield_values(line, ['x']):
                v = v.strip()
                if v:
                    v = tidy_subject(v)
                if v:
                    subject[v] += 1

    ret = {}
    if people:
        ret['people'] = dict(people)
    if when:
        ret['times'] = dict(when)
    if place:
        ret['places'] = dict(place)
    if subject:
        ret['subjects'] = dict(subject)
    return ret