from collections import defaultdict
from lxml.etree import Element

# Helpers such as mk_norm, freq_dict_top, four_types, get_work_subjects,
# find_works2, find_works3, find_work_sort, withKey, query_iter, pick_cover,
# the add_field*/str_to_key functions and the compiled re_* patterns are
# defined elsewhere in this codebase.


def find_works(book_iter, existing={}, do_get_mc=True):
    var = find_works2(book_iter)
    find_works3(var, existing)
    works = find_work_sort(var)
    for work_count, norm, w in works:
        # title variant with the most edition keys attached
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = var['books_by_key'][ekey]
                titles[b['title']] += 1
        keys = var['work_titles'][norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        # most frequent raw title (currently unused; 'first' is used below)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, var['books_by_key'][k].get('table_of_contents', None)) for k in keys)
        toc = {k: v for k, v in toc_iter if v}
        # sometimes keys contains duplicates
        editions = [var['books_by_key'][k] for k in set(keys)]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            edition_count += 1
            subtitle = e.get('subtitle') or ''
            if subtitle != '':
                with_subtitle_count += 1
                norm_subtitle = mk_norm(subtitle)
                if norm_subtitle != norm:
                    subtitles[norm_subtitle][subtitle] += 1
        # pick a subtitle only if one variant is common enough across editions
        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        try:
            subjects = four_types(get_work_subjects(w, do_get_mc=do_get_mc))
        except Exception:
            print(w)
            raise
        if subjects:
            w['subjects'] = subjects
        yield w
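# Example usage -- a minimal sketch. The book dicts and keys below are
# hypothetical; they only carry the fields this pipeline is assumed to read
# ('key', 'title', 'norm_title', and optionally 'work_title'/'norm_wt').
#
#     books = [
#         {'key': '/b/OL1M', 'title': 'Ben-Hur', 'norm_title': mk_norm('Ben-Hur')},
#         {'key': '/b/OL2M', 'title': 'Ben Hur', 'norm_title': mk_norm('Ben Hur')},
#     ]
#     for work in find_works(iter(books)):
#         print(work['title'], len(work['editions']))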
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        print('editions:', [e['key'] for e in w['editions']])

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            e['overdrive'] = overdrive_id
        editions.append(e)
    # sort by publication year; editions without a year sort first
    editions.sort(key=lambda e: e.get('pub_year') or '')

    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
            continue
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print('invalid author key:', akey)
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print()
            for a in authors:
                print('author:', a)
            print()
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except Exception:
        print('bad work:', w['key'])
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    # fold the work's own subject fields into the counts from get_work_subjects
    for db_field, solr_field in field_map.items():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except Exception:
                print('v:', v)
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1

    doc = Element("doc")
    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
        # add_field(doc, 'title_suggest', title)
    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(
        e['subtitle'] for e in editions
        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)
    add_field(doc, 'edition_count', len(editions))

    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
    # add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    # bucket Internet Archive identifiers: public scans before restricted ones,
    # non-Google scans before Google scans
    pub_goog = set()
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], str)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        elif i.endswith('goog'):
            nonpub_goog.add(i)
        else:
            nonpub_nongoog.add(i)

    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(printdisabled))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    # if subjects:
    #     add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
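# Example: serializing the <doc> element that build_doc() returns, e.g. for a
# Solr <add> request. A minimal sketch -- 'w' is assumed to be a full
# /type/work record, and tostring comes from lxml.etree (an assumption; this
# module itself only needs Element from there).
#
#     from lxml.etree import tostring
#     doc = build_doc(w)
#     if doc is not None:
#         print(tostring(doc, pretty_print=True).decode('utf-8'))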
# Earlier variant of find_works(), retained here; it builds the title maps
# inline instead of delegating to find_works2/find_works3/find_work_sort.
def find_works(akey, book_iter, existing={}):
    equiv = defaultdict(int)        # normalized title and work title pairs
    norm_titles = defaultdict(int)  # frequency of titles
    books_by_key = {}
    books = []
    # normalized work title to regular title
    rev_wt = defaultdict(lambda: defaultdict(int))

    for book in book_iter:
        if 'norm_wt' in book:
            pair = (book['norm_title'], book['norm_wt'])
            equiv[pair] += 1
            rev_wt[book['norm_wt']][book['work_title']] += 1
        norm_titles[book['norm_title']] += 1
        books_by_key[book['key']] = book
        books.append(book)

    title_map = build_work_title_map(equiv, norm_titles)

    for a, b in existing.items():
        norm_a = mk_norm(a)
        norm_b = mk_norm(b)
        rev_wt[norm_b][norm_a] += 1
        title_map[norm_a] = norm_b

    works = defaultdict(lambda: defaultdict(list))
    work_titles = defaultdict(list)
    for b in books:
        if 'eng' not in b.get('lang', []) and 'norm_wt' in b:
            work_titles[b['norm_wt']].append(b['key'])
        n = b['norm_title']
        title = b['title']
        if n in title_map:
            n = title_map[n]
            title = top_rev_wt(rev_wt[n])
        works[n][title].append(b['key'])

    works = sorted((sum(map(len, list(w.values()) + [work_titles[n]])), n, w)
                   for n, w in works.items())

    for work_count, norm, w in works:
        first = sorted(w.items(), reverse=True, key=lambda i: len(i[1]))[0][0]
        titles = defaultdict(int)
        for key_list in w.values():
            for ekey in key_list:
                b = books_by_key[ekey]
                titles[b['title']] += 1
        keys = work_titles[norm]
        for values in w.values():
            keys += values
        assert work_count == len(keys)
        title = max(titles.keys(), key=lambda i: titles[i])
        toc_iter = ((k, books_by_key[k].get('table_of_contents', None)) for k in keys)
        toc = {k: v for k, v in toc_iter if v}
        editions = [books_by_key[k] for k in keys]
        subtitles = defaultdict(lambda: defaultdict(int))
        edition_count = 0
        with_subtitle_count = 0
        for e in editions:
            subtitle = e.get('subtitle') or ''
            edition_count += 1
            if subtitle != '':
                with_subtitle_count += 1
                norm_subtitle = mk_norm(subtitle)
                if norm_subtitle != norm:
                    subtitles[norm_subtitle][subtitle] += 1
        use_subtitle = None
        for k, v in subtitles.items():
            lc_k = k.strip(' .').lower()
            if lc_k in ('', 'roman') or 'edition' in lc_k:
                continue
            num = sum(v.values())
            overall = float(num) / float(edition_count)
            ratio = float(num) / float(with_subtitle_count)
            if overall > 0.2 and ratio > 0.5:
                use_subtitle = freq_dict_top(v)
        w = {'title': first, 'editions': editions}
        if use_subtitle:
            w['subtitle'] = use_subtitle
        if toc:
            w['toc'] = toc
        subjects = four_types(get_work_subjects(w))
        if subjects:
            w['subjects'] = subjects
        yield w
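# The subtitle heuristic in both find_works() variants keeps a candidate only
# when it appears on more than 20% of all editions and on more than half of
# the editions that carry any subtitle. freq_dict_top() is defined elsewhere;
# a minimal sketch of the behavior the loops assume (the most frequent
# spelling of the winning normalized subtitle is returned) would be:
#
#     def freq_dict_top(d):
#         return max(d, key=d.get)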