def testFindSubjects(self):
    """find_subjects() should turn MARC 650/651 fields into facet-count dicts."""
    cases = [
        (
            [[
                ('650', ' 0\x1faRhodes, Dan (Fictitious character)\x1fvFiction.\x1e'),
                ('650', ' 0\x1faSheriffs\x1fvFiction.\x1e'),
                ('651', ' 0\x1faTexas\x1fvFiction.\x1e'),
            ]],
            {
                'place': {u'Texas': 1},
                'subject': {
                    u'Dan Rhodes (Fictitious character)': 1,
                    u'Sheriffs': 1,
                    u'Sheriffs in fiction': 1,
                    u'Texas in fiction': 1,
                    u'Fiction': 3,
                },
            },
        ),
        (
            [[
                ('650', ' 0\x1faSpies\x1fzFrance\x1fzParis\x1fvFiction.\x1e'),
                ('651', ' 0\x1faFrance\x1fxHistory\x1fyDirectory, 1795-1799\x1fvFiction.\x1e'),
            ]],
            {
                'subject': {
                    u'History': 1,
                    u'France in fiction': 1,
                    u'Spies': 1,
                    u'Spies in fiction': 1,
                    u'Fiction': 2,
                },
                'place': {u'Paris': 1, u'France': 2},
                'time': {u'Directory, 1795-1799': 1},
            },
        ),
    ]
    # Same two assertions as before, just driven by a table of cases.
    for marc, expect in cases:
        self.assertEqual(find_subjects(marc), expect)
def testFindSubjects(self):
    """Check find_subjects() on two canned MARC field lists.

    Each `marc` value is a list of records, each record a list of
    (tag, raw-field) tuples using MARC subfield separators (\\x1f) and the
    field terminator (\\x1e).
    """
    # Case 1: two 650 (topical) fields and one 651 (geographic) field.
    marc = [[
        ('650', ' 0\x1faRhodes, Dan (Fictitious character)\x1fvFiction.\x1e'),
        ('650', ' 0\x1faSheriffs\x1fvFiction.\x1e'),
        ('651', ' 0\x1faTexas\x1fvFiction.\x1e')
    ]]
    # Expected facet counts: note the "X in fiction" expansions and that
    # 'Fiction' is counted once per source field (3 fields here).
    expect = {
        'place': {u'Texas': 1},
        'subject': {u'Dan Rhodes (Fictitious character)': 1, u'Sheriffs': 1,
                    u'Sheriffs in fiction': 1, u'Texas in fiction': 1,
                    u'Fiction': 3}
    }
    self.assertEqual(find_subjects(marc), expect)
    # Case 2: chained $z place subdivisions and a $y chronological
    # subdivision, which should land in the 'time' facet.
    marc = [[
        ('650', ' 0\x1faSpies\x1fzFrance\x1fzParis\x1fvFiction.\x1e'),
        ('651', ' 0\x1faFrance\x1fxHistory\x1fyDirectory, 1795-1799\x1fvFiction.\x1e')
    ]]
    expect = {
        'subject': {u'History': 1, u'France in fiction': 1, u'Spies': 1,
                    u'Spies in fiction': 1, u'Fiction': 2},
        'place': {u'Paris': 1, u'France': 2},
        'time': {u'Directory, 1795-1799': 1}
    }
    self.assertEqual(find_subjects(marc), expect)
def build_doc(w): editions = w['editions'] if len(editions) > 300: print `w['title'], len(editions)` authors = [] if 'authors' not in w: print 'no authors' for a in w['authors']: if a is None: continue cur = {'key': a['key'], 'name': a.get('name', '')} if a.get('alternate_names', None): cur['alternate_names'] = a['alternate_names'] authors.append(cur) subjects = find_subjects(w, marc_subjects=w['subjects']) if 'subjects' in w else {} doc = Element("doc") m = re_work_key.match(w['key']) add_field(doc, 'key', m.group(1)) add_field(doc, 'title', w['title']) add_field(doc, 'title_suggest', w['title']) has_fulltext = any(e.get('ocaid', None) for e in editions) add_field(doc, 'has_fulltext', has_fulltext) if w.get('subtitle', None): add_field(doc, 'subtitle', w['subtitle']) alt_titles = set() for e in editions: if 'title' in e and e['title'] != w['title']: alt_titles.add(e['title']) for f in 'work_titles', 'other_titles': for t in e.get(f, []): if t != w['title']: alt_titles.add(t) add_field_list(doc, 'alternative_title', alt_titles) alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None)) add_field(doc, 'alternative_subtitle', alt_subtitles) add_field(doc, 'edition_count', len(editions)) for e in editions: m = re_edition_key.match(e['key']) if not m: print 'bad edition key:', e['key'] continue add_field(doc, 'edition_key', m.group(1)) k = 'by_statement' add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None))) k = 'publish_date' pub_dates = set(e[k] for e in editions if e.get(k, None)) add_field_list(doc, k, pub_dates) pub_years = set(m.group(1) for m in (re_year.match(i) for i in pub_dates) if m) if pub_years: add_field_list(doc, 'publish_year', pub_years) add_field(doc, 'first_publish_year', min(int(i) for i in pub_years)) k = 'first_sentence' fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None)) add_field_list(doc, k, fs) publishers = set() for 
e in editions: publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', [])) add_field_list(doc, 'publisher', publishers) add_field_list(doc, 'publisher_facet', publishers) field_map = [ ('lccn', 'lccn'), ('publish_places', 'publish_place'), ('oclc_numbers', 'oclc'), ('contributions', 'contributor'), ] for db_key, search_key in field_map: v = set() for e in editions: if db_key not in e: continue v.update(e[db_key]) add_field_list(doc, search_key, v) isbn = set() for e in editions: for f in 'isbn_10', 'isbn_13': for v in e.get(f, []): isbn.add(v.replace('-', '')) add_field_list(doc, 'isbn', isbn) lang = set() for e in editions: for l in e.get('languages', []): assert l['key'].startswith('/l/') and len(l['key']) == 6 lang.add(l['key'][3:]) if lang: add_field_list(doc, 'language', lang) v = set( e['ocaid'].strip() for e in editions if 'ocaid' in e) add_field_list(doc, 'ia', v) author_keys = [a['key'] for a in authors] assert not any(ak.startswith('/a/') for ak in author_keys) author_names = [a.get('name', '') for a in authors] assert not any('\t' in n for n in author_names) add_field_list(doc, 'author_key', author_keys) add_field_list(doc, 'author_name', author_names) alt_names = set() for a in authors: if 'alternate_names' in a: alt_names.update(a['alternate_names']) add_field_list(doc, 'author_alternative_name', alt_names) add_field_list(doc, 'author_facet', (k + '\t' + n for k, n in zip(author_keys, author_names))) add_field(doc, 'fiction', subjects['fiction']) for k in 'person', 'place', 'subject', 'time': if k not in subjects: continue add_field_list(doc, k, subjects[k].keys()) for k in 'person', 'place', 'subject', 'time': if k not in subjects: continue add_field_list(doc, k + '_facet', subjects[k].keys()) for k in 'person', 'place', 'subject', 'time': if k not in subjects: continue add_field_list(doc, k + '_key', (str_to_key(s) for s in subjects[k].keys())) return doc
def build_doc(w):
    """Build a Solr <doc> element for work `w` (bulk-load variant).

    This version works from flattened dump rows: edition keys are numeric
    ('ekey'), IA ids live in e['ia'], publishers/languages may be
    tab-separated strings, and subjects come from a stringified
    'marc_subjects' field.  Returns None for works listed in
    `long_subjects`, otherwise an Element ready for Solr.
    """
    wkey = w['key']
    m = re_work_key.match(wkey)
    wkey_num = int(m.group(1))
    # Skip works whose subject lists are known to be pathologically long.
    if wkey_num in long_subjects:
        return
    def get_pub_year(e):
        # Return the first 4-digit-year match in publish_date, else None.
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)
    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        editions.append(e)
    # Oldest edition first; editions without a year sort before the rest
    # (None compares low in Python 2).
    editions.sort(key=lambda e: e.get('pub_year', None))
    doc = Element("doc")
    add_field(doc, 'key', 'OL%dW' % wkey_num)
    add_field(doc, 'title', w['title'])
    #add_field(doc, 'title_suggest', w['title'])
    # Fulltext is available if any edition has Internet Archive ids.
    has_fulltext = any(e.get('ia', None) for e in editions)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])
    # Titles on editions that differ from the work title.
    alt_titles = set()
    for e in editions:
        if e.get('title', None):
            t = e['title']
            if t != w['title']:
                alt_titles.add(t)
        for f in 'work_titles', 'other_titles':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for t in e[f]:
                if t != w['title']:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)
    alt_subtitles = set(
        e['subtitle'] for e in editions
        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    # NOTE(review): passes a set to add_field (singular) — every other
    # multi-valued field uses add_field_list; confirm intent.
    add_field(doc, 'alternative_subtitle', alt_subtitles)
    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        # 'ekey' is the numeric edition id from the dump.
        add_field(doc, 'edition_key', 'OL%dM' % e['ekey'])
    if wkey_num in covers:
        add_field(doc, 'cover_edition_key', 'OL%dM' % covers[wkey_num])
    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))
    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    # Years were already extracted above via get_pub_year().
    pub_years = set(e['pub_year'] for e in editions if 'pub_year' in e)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))
    k = 'first_sentence'
    fs = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)
    field_map = [
        ('lccn', 'lccn'),
        ('publishers', 'publisher'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            if db_key == 'publishers':
                # Publishers arrive tab-separated; split and normalize the
                # "s.n." spellings to 'Sine nomine'.  Mutates the edition
                # dict in place so later passes see a list.
                e[db_key] = ['Sine nomine' if is_sine_nomine(i) else i
                             for i in e[db_key].split('\t')]
            assert isinstance(e[db_key], list)
            v.update(e[db_key])
        add_field_list(doc, search_key, v)
        # if db_key == 'publishers':
        #     add_field_list(doc, search_key + '_facet', v)
    # ISBNs indexed without hyphens.
    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for v in e[f]:
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)
    # Languages are tab-separated 3-letter codes.
    lang = set()
    for e in editions:
        if 'languages' not in e:
            continue
        assert isinstance(e['languages'], list)
        for l in e['languages']:
            for l2 in l.split('\t'):
                if len(l2) != 3:
                    print e['languages']
                assert len(l2) == 3
                lang.add(l2)
    if lang:
        add_field_list(doc, 'language', lang)
    # List non-Google scans before Google ones in the 'ia' field.
    goog = set() # google
    non_goog = set()
    for e in editions:
        if 'ia' in e:
            assert isinstance(e['ia'], list)
            for i in e['ia']:
                i = i.strip()
                if i.endswith('goog'):
                    goog.add(i)
                else:
                    non_goog.add(i)
    add_field_list(doc, 'ia', list(non_goog) + list(goog))
    authors = w['authors']
    author_keys = ['OL%dA' % a['akey'] for a in authors]
    author_names = [a.get('name', '') or '' for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)
    alt_names = set()
    for a in authors:
        if 'alt_names' not in a:
            continue
        assert isinstance(a['alt_names'], list)
        alt_names.update(a['alt_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    # Facet value is "OLnnnA name" (space-joined here, tab in other versions).
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))
    # if 'subjects' in w:
    #     if isinstance(w['subjects'][0], list):
    #         try:
    #             subjects = find_subjects(w['subjects'])
    #         except ValueError:
    #             print w['subjects']
    #             raise
    #     else:
    #         subjects = work_subjects(wkey_num)
    #         if not subjects:
    #             subjects = {}
    # if 'marc_subjects' in w:
    # NOTE(review): eval() on stored data is dangerous if the dump is not
    # fully trusted, and the bare except swallows every error (including
    # KeyboardInterrupt) — consider ast.literal_eval and a narrower except.
    try:
        marc_subjects = eval(w['marc_subjects'])
    except:
        print 'error parsing marc subjects (%d)' % len(w['marc_subjects'])
        marc_subjects = []
    try:
        subjects = find_subjects(marc_subjects)
    except ValueError:
        print w['marc_subjects']
        raise
    subjects = four_types(subjects)
    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        #add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)
    return doc
def find_works(akey, book_iter, existing={}): equiv = defaultdict(int) # normalized title and work title pairs norm_titles = defaultdict(int) # frequency of titles books_by_key = {} books = [] # normalized work title to regular title rev_wt = defaultdict(lambda: defaultdict(int)) print 'find_works' for book in book_iter: if 'norm_wt' in book: pair = (book['norm_title'], book['norm_wt']) equiv[pair] += 1 rev_wt[book['norm_wt']][book['work_title']] +=1 norm_titles[book['norm_title']] += 1 books_by_key[book['key']] = book books.append(book) title_map = build_work_title_map(equiv, norm_titles) for a, b in existing.items(): norm_a = mk_norm(a) norm_b = mk_norm(b) rev_wt[norm_b][norm_a] +=1 title_map[norm_a] = norm_b works = defaultdict(lambda: defaultdict(list)) work_titles = defaultdict(list) for b in books: if 'eng' not in b.get('lang', []) and 'norm_wt' in b: work_titles[b['norm_wt']].append(b['key']) continue n = b['norm_title'] title = b['title'] if n in title_map: n = title_map[n] title = top_rev_wt(rev_wt[n]) works[n][title].append(b['key']) works = sorted([(sum(map(len, w.values() + [work_titles[n]])), n, w) for n, w in works.items()]) for work_count, norm, w in works: first = sorted(w.items(), reverse=True, key=lambda i:len(i[1]))[0][0] titles = defaultdict(int) for key_list in w.values(): for ekey in key_list: b = books_by_key[ekey] title = b['title'] titles[title] += 1 keys = work_titles[norm] for values in w.values(): keys += values assert work_count == len(keys) title = max(titles.keys(), key=lambda i:titles[i]) toc_iter = ((k, books_by_key[k].get('table_of_contents', None)) for k in keys) toc = dict((k, v) for k, v in toc_iter if v) editions = [books_by_key[k] for k in keys] subtitles = defaultdict(lambda: defaultdict(int)) edition_count = 0 with_subtitle_count = 0 for e in editions: edition_count += 1 subtitle = e['subtitle'] or '' if subtitle != '': with_subtitle_count += 1 norm_subtitle = mk_norm(subtitle) if norm_subtitle != norm: 
subtitles[norm_subtitle][subtitle] += 1 use_subtitle = None for k, v in subtitles.iteritems(): lc_k = k.strip(' .').lower() if lc_k in ('', 'roman') or 'edition' in lc_k: continue num = sum(v.values()) overall = float(num) / float(edition_count) ratio = float(num) / float(with_subtitle_count) if overall > 0.2 and ratio > 0.5: use_subtitle = freq_dict_top(v) w = {'title': first, 'editions': editions} if use_subtitle: w['subtitle'] = use_subtitle if toc: w['toc'] = toc subjects = four_types(find_subjects(get_marc_subjects(w))) if subjects: w['subjects'] = subjects yield w
def build_doc(w):
    """Build a Solr <doc> element for work `w` (bulk-load variant, copy).

    Appears to be a near-duplicate of the other dump-row build_doc in this
    file; kept byte-for-byte, documentation only.  Returns None for works
    in `long_subjects`, otherwise an Element for Solr.
    """
    wkey = w['key']
    m = re_work_key.match(wkey)
    wkey_num = int(m.group(1))
    # Skip works with known-pathological subject lists.
    if wkey_num in long_subjects:
        return
    def get_pub_year(e):
        # First 4-digit-year match in publish_date, else None.
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)
    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        editions.append(e)
    # Oldest first; missing years (None) sort first under Python 2.
    editions.sort(key=lambda e: e.get('pub_year', None))
    doc = Element("doc")
    add_field(doc, 'key', 'OL%dW' % wkey_num)
    add_field(doc, 'title', w['title'])
    #add_field(doc, 'title_suggest', w['title'])
    has_fulltext = any(e.get('ia', None) for e in editions)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])
    # Edition titles differing from the work title.
    alt_titles = set()
    for e in editions:
        if e.get('title', None):
            t = e['title']
            if t != w['title']:
                alt_titles.add(t)
        for f in 'work_titles', 'other_titles':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for t in e[f]:
                if t != w['title']:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)
    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    # NOTE(review): set passed to add_field (singular) — confirm whether
    # add_field_list was intended.
    add_field(doc, 'alternative_subtitle', alt_subtitles)
    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', 'OL%dM' % e['ekey'])
    if wkey_num in covers:
        add_field(doc, 'cover_edition_key', 'OL%dM' % covers[wkey_num])
    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))
    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(e['pub_year'] for e in editions if 'pub_year' in e)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))
    k = 'first_sentence'
    fs = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)
    field_map = [
        ('lccn', 'lccn'),
        ('publishers', 'publisher'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            if db_key == 'publishers':
                # Tab-separated string -> normalized list (mutates edition).
                e[db_key] = [
                    'Sine nomine' if is_sine_nomine(i) else i
                    for i in e[db_key].split('\t')
                ]
            assert isinstance(e[db_key], list)
            v.update(e[db_key])
        add_field_list(doc, search_key, v)
        # if db_key == 'publishers':
        #     add_field_list(doc, search_key + '_facet', v)
    # ISBNs indexed without hyphens.
    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            if f not in e:
                continue
            assert isinstance(e[f], list)
            for v in e[f]:
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)
    # Tab-separated 3-letter language codes.
    lang = set()
    for e in editions:
        if 'languages' not in e:
            continue
        assert isinstance(e['languages'], list)
        for l in e['languages']:
            for l2 in l.split('\t'):
                if len(l2) != 3:
                    print e['languages']
                assert len(l2) == 3
                lang.add(l2)
    if lang:
        add_field_list(doc, 'language', lang)
    # Non-Google IA scans listed before Google ones.
    goog = set() # google
    non_goog = set()
    for e in editions:
        if 'ia' in e:
            assert isinstance(e['ia'], list)
            for i in e['ia']:
                i = i.strip()
                if i.endswith('goog'):
                    goog.add(i)
                else:
                    non_goog.add(i)
    add_field_list(doc, 'ia', list(non_goog) + list(goog))
    authors = w['authors']
    author_keys = ['OL%dA' % a['akey'] for a in authors]
    author_names = [a.get('name', '') or '' for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)
    alt_names = set()
    for a in authors:
        if 'alt_names' not in a:
            continue
        assert isinstance(a['alt_names'], list)
        alt_names.update(a['alt_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))
    # if 'subjects' in w:
    #     if isinstance(w['subjects'][0], list):
    #         try:
    #             subjects = find_subjects(w['subjects'])
    #         except ValueError:
    #             print w['subjects']
    #             raise
    #     else:
    #         subjects = work_subjects(wkey_num)
    #         if not subjects:
    #             subjects = {}
    # if 'marc_subjects' in w:
    # NOTE(review): eval() of stored data + bare except — see the matching
    # caution on the sibling build_doc; ast.literal_eval would be safer.
    try:
        marc_subjects = eval(w['marc_subjects'])
    except:
        print 'error parsing marc subjects (%d)' % len(w['marc_subjects'])
        marc_subjects = []
    try:
        subjects = find_subjects(marc_subjects)
    except ValueError:
        print w['marc_subjects']
        raise
    subjects = four_types(subjects)
    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        #add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)
    return doc
def build_doc(w):
    """Build a Solr <doc> element for work `w` (live-record variant, copy).

    Near-duplicate of the first build_doc in this file; kept byte-for-byte,
    documentation only.
    """
    editions = w['editions']
    if len(editions) > 300:
        # Backtick is Python 2 repr(); logs unusually large works.
        print `w['title'], len(editions)`
    authors = []
    if 'authors' not in w:
        print 'no authors'
    # NOTE(review): the loop below still reads w['authors'] unconditionally,
    # so a work without 'authors' raises KeyError right after the warning —
    # w.get('authors', []) would be the fix.
    for a in w['authors']:
        if a is None:
            continue
        cur = {'key': a['key'], 'name': a.get('name', '')}
        if a.get('alternate_names', None):
            cur['alternate_names'] = a['alternate_names']
        authors.append(cur)
    subjects = find_subjects(w, marc_subjects=w['subjects']) if 'subjects' in w else {}
    doc = Element("doc")
    m = re_work_key.match(w['key'])
    add_field(doc, 'key', m.group(1))
    add_field(doc, 'title', w['title'])
    add_field(doc, 'title_suggest', w['title'])
    # Fulltext available if any edition has an IA scan id.
    has_fulltext = any(e.get('ocaid', None) for e in editions)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])
    # Edition/work titles that differ from the work title.
    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != w['title']:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != w['title']:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)
    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    # NOTE(review): set passed to add_field (singular) — confirm whether
    # add_field_list was intended.
    add_field(doc, 'alternative_subtitle', alt_subtitles)
    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        m = re_edition_key.match(e['key'])
        if not m:
            print 'bad edition key:', e['key']
            continue
        add_field(doc, 'edition_key', m.group(1))
    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))
    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    # 4-digit years extracted from the publish dates.
    pub_years = set(
        m.group(1) for m in (re_year.match(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))
    k = 'first_sentence'
    # first_sentence may be a plain string or a {'value': ...} dict.
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)
    # Normalize "s.n." publisher spellings to 'Sine nomine'.
    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
    add_field_list(doc, 'publisher_facet', publishers)
    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)
    # ISBNs indexed without hyphens.
    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)
    # Language keys are '/l/xxx'; index the 3-letter code.
    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            assert l['key'].startswith('/l/') and len(l['key']) == 6
            lang.add(l['key'][3:])
    if lang:
        add_field_list(doc, 'language', lang)
    v = set(e['ocaid'].strip() for e in editions if 'ocaid' in e)
    add_field_list(doc, 'ia', v)
    author_keys = [a['key'] for a in authors]
    assert not any(ak.startswith('/a/') for ak in author_keys)
    author_names = [a.get('name', '') for a in authors]
    # Tab separates key and name in author_facet, so names must be tab-free.
    assert not any('\t' in n for n in author_names)
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)
    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (k + '\t' + n for k, n in zip(author_keys, author_names)))
    # NOTE(review): subjects['fiction'] raises KeyError when the work has no
    # 'subjects' key (subjects == {} above).
    add_field(doc, 'fiction', subjects['fiction'])
    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k + '_facet', subjects[k].keys())
    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k + '_key', (str_to_key(s) for s in subjects[k].keys()))
    return doc