def work_and_marc():
    skip = True
    for w in query_iter({'type': '/type/work', 'title': None}):
        if skip:
            if w['key'] == '/w/OL56814W':
                skip = False
            else:
                continue
        marc = set()
        q = {
            'type': '/type/edition',
            'works': w['key'],
            'title': None,
            'source_records': None,
        }
        for e in query_iter(q):
            if e.get('source_records', []):
                marc.update(i[5:] for i in e['source_records'] if i.startswith('marc:'))
            mc = get_mc(e['key'])
            if mc and not mc.startswith('ia:') and not mc.startswith('amazon:'):
                marc.add(mc)
        if marc:
            yield w, marc
        else:
            print('no marc:', w)
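# Example consumer for work_and_marc() (a sketch; query_iter() and get_mc() must be
# configured as above, and save_marc() is hypothetical, named here only for illustration):
#
#     for w, marc in work_and_marc():
#         print(w['key'], 'has', len(marc), 'MARC source records')
#         for loc in marc:
#             save_marc(w, loc)  # hypothetical: process one MARC location string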
def by_authors():
    find_new_work_key()
    skipping = True
    q = {
        'type': '/type/author',
        'name': None,
        'works': None,
    }
    for a in query_iter(q, offset=215000):
        akey = a['key']
        if skipping:
            print('skipping:', akey, a['name'])
            if akey == '/a/OL218496A':
                skipping = False
            continue
        q = {
            'type': '/type/work',
            'authors': akey,
        }
        if query(q):
            print((akey, repr(a['name']), 'has works'))
            continue
        # print akey, a['name']
        found = find_works(akey)
        works = [i for i in found if len(i['editions']) > 2]
        if works:
            #open('found/' + akey[3:], 'w').write(repr(works))
            print((akey, repr(a['name'])))
            #pprint(works)
            #print_works(works)
            add_works(akey, works)
            print()
def update_work(w, obj_cache={}, debug=False, resolve_redirects=False):
    wkey = w['key']
    assert wkey.startswith('/works')
    assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:], redirects)
    requests = [delete_xml]
    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            doc = build_doc(w, obj_cache, resolve_redirects=resolve_redirects)
        except:
            print w
            raise
        if doc is not None:
            add = Element("add")
            add.append(doc)
            add_xml = tostring(add).encode('utf-8')
            requests.append(add_xml)
    return requests
def books_query(akey):  # live version
    q = {
        'type': '/type/edition',
        'authors': akey,
        '*': None,
    }
    return query_iter(q)
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}
    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    deletes += redirect_keys
    deletes += [wkey[7:]]  # strip /works/ from /works/OL1234W

    # Handle edition records as well.
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            # Use key as /works/OL1M.
            # In case of single-core solr, we are using the full path as key. So it
            # is required to be unique across all types of documents.
            # The website takes care of redirecting /works/OL1M to /books/OL1M.
            'key': edition['key'].replace("/books/", "/works/"),
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition],
        }
        # Hack to add subjects when indexing /books/ia:xxx
        if edition.get("subjects"):
            w['subjects'] = edition['subjects']

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w, obj_cache=obj_cache, resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                # XXX-Anand: The works in the in_library subject were getting wiped
                # off for unknown reasons. I suspect that this might be a cause.
                # Disabling temporarily.
                #if d.get('ia'):
                #    deletes += ["ia:" + iaid for iaid in d['ia']]

                # In single-core solr, we use the full path as key, not just the last part
                if is_single_core():
                    deletes = ["/works/" + k for k in deletes]
                requests.append(make_delete_query(deletes))

                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)
    return requests
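# make_delete_query() is referenced above but not defined in this snippet. A minimal
# sketch of what it could look like, modeled on the delete_xml string built by the
# sibling update_work() versions in this file (an assumption, not the confirmed
# implementation):
def make_delete_query(keys):
    # one <query>key:...</query> clause per key, with Solr's ':' metacharacter escaped
    queries = ''.join('<query>key:%s</query>' % k.replace(":", r"\:") for k in keys)
    return '<delete>%s</delete>' % queries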
def update_edition(e):
    if not is_single_core():
        return []
    ekey = e['key']
    logger.info("updating edition %s", ekey)

    wkey = e.get('works') and e['works'][0]['key']
    w = wkey and withKey(wkey)
    authors = []
    if w:
        authors = [withKey(a['author']['key']) for a in w.get("authors", []) if 'author' in a]

    request_set = SolrRequestSet()
    request_set.delete(ekey)

    q = {'type': '/type/redirect', 'location': ekey}
    redirect_keys = [r['key'] for r in query_iter(q)]
    for k in redirect_keys:
        request_set.delete(k)

    doc = EditionBuilder(e, w, authors).build()
    request_set.add(doc)
    return request_set.get_requests()
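# Sketch of posting the requests returned by update_edition() to a Solr update
# handler. The endpoint URL is an assumption for illustration, not part of this module:
#
#     import requests as http
#     for r in update_edition(e):
#         http.post('http://localhost:8983/solr/update',
#                   data=r, headers={'Content-Type': 'text/xml;charset=utf-8'})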
def is_loaded(loc):
    assert loc.startswith('marc:')
    vars = {'loc': loc[5:]}
    db_iter = marc_index.query('select * from machine_comment where v=$loc', vars)
    if list(db_iter):
        return True
    iter = query_iter({'type': '/type/edition', 'source_records': loc})
    return bool(list(iter))
def switch_author(ol, old, new, other, debug=False):
    q = {
        'authors': old,
        'type': '/type/edition',
    }
    for e in query_iter(q):
        if debug:
            print('switch author:', e['key'])
            print(e)
        e = ol.get(e['key'])
        update_edition(ol, e, other, new, debug)
def title_search(self, v):
    q = {'type': '/type/edition', 'isbn_10': None, 'title': v}
    editions = []
    for e in query_iter(q):
        e['title'] = v
        editions.append(e)
    yield 'searching for title "' + web.htmlquote(v) + '": '
    yield from self.search(editions)
def add_cover_to_work(w):
    if 'cover_edition' in w:
        return
    q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    global ol  # module-level connection, created lazily
    if ol is None:
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot'])
    print ol.save(w['key'], w, 'added cover to work')
def isbn_search(self, v):
    q = {'type': '/type/edition', 'isbn_10': v, 'title': None, 'subtitle': None}
    editions = []
    for e in query_iter(q):
        e['isbn_10'] = v
        editions.append(e)
    yield 'searching for ISBN ' + web.htmlquote(v) + ': '
    for i in self.search(editions):
        yield i
def oclc_search(self, v):
    q = {'type': '/type/edition', 'oclc_numbers': v, 'title': None, 'subtitle': None, 'isbn_10': None}
    editions = []
    print(q)
    for e in query_iter(q):
        e['oclc_numbers'] = v
        editions.append(e)
    yield 'searching for OCLC ' + web.htmlquote(v) + ': '
    for i in self.search(editions):
        yield i
def books_query(akey):  # live version
    q = {
        'type': '/type/edition',
        'authors': akey,
        'source_records': None,
        'title': None,
        'work_title': None,
        'languages': None,
        'title_prefix': None,
        'subtitle': None,
    }
    return query_iter(q)
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}
    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    deletes += redirect_keys
    deletes += [wkey[7:]]  # strip /works/ from /works/OL1234W

    # Handle edition records as well.
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            'key': edition['key'],
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition],
        }

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w, obj_cache=obj_cache, resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                if d.get('ia'):
                    deletes += ["ia:" + iaid for iaid in d['ia']]
                requests.append(make_delete_query(deletes))
                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)
    return requests
def build_data(w, obj_cache=None, resolve_redirects=False):
    wkey = w['key']

    # Anand - Oct 2013
    # For /works/ia:xxx, editions are already supplied. Querying will give an empty response.
    if "editions" in w:
        editions = w['editions']
    else:
        q = {
            'type': '/type/edition',
            'works': wkey,
            '*': None,
        }
        editions = list(query_iter(q))

    authors = SolrProcessor().extract_authors(w)
    iaids = [e["ocaid"] for e in editions if "ocaid" in e]
    ia = dict((iaid, get_ia_collection_and_box_id(iaid)) for iaid in iaids)
    duplicates = {}
    return build_data2(w, editions, authors, ia, duplicates)
def merge_authors(keys):
    # print 'merge author %s:"%s" and %s:"%s"' % (author['key'], author['name'], merge_with['key'], merge_with['name'])
    # print 'becomes: "%s"' % repr(new_name)
    authors = [a for a in (withKey(k) for k in keys) if a['type']['key'] != '/type/redirect']
    not_redirect = set(a['key'] for a in authors)
    for a in authors:
        print a
    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])
    best_key = pick_best_author(authors)['key']
    imgs = [a['key'] for a in authors if has_image(a['key'])]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        # Molière and O. J. O. Ferreira
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        if len(imgs) != 0:
            print 'imgs:', imgs
            return  # skip
        if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A']
                or imgs == [u'/a/OL325189A', u'/a/OL266422A']
                or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']):
            print imgs
        assert len(imgs) == 0
    do_normalize(new_key, best_key, authors)
    old_keys = set(k for k in keys if k != new_key)
    print 'old keys:', old_keys
    for old in old_keys:
        # /b/OL21291659M
        switch_author(old, new_key, old_keys)
        if old in not_redirect:
            make_redirect(old, new_key)
        q = {
            'authors': old,
            'type': '/type/edition',
        }
        if list(get_things(q)) != []:
            switch_author(old, new_key, old_keys)
        l = list(query_iter(q))
        print old, l
        assert l == []
def by_authors():
    skip = '/a/OL25755A'
    q = {'type': '/type/author', 'name': None}
    for a in query_iter(q):
        akey = a['key']
        if skip:
            if akey == skip:
                skip = None
            else:
                continue
        write_log('author', akey, a.get('name', 'name missing'))
        works = find_works(akey, get_books(akey, books_query(akey)))
        print((akey, repr(a['name'])))
        for w in works:
            w['author'] = akey
            wkey = get_work_key(w['title'], akey)
            if wkey:
                w['key'] = wkey
            yield w
def by_authors():
    work_queue = []
    q = {
        'type': '/type/author',
        'name': None,
        'works': None,
    }
    for a in query_iter(q):
        akey = a['key']
        write_log('author', akey, a.get('name', 'name missing'))
        q = {
            'type': '/type/work',
            'authors': akey,
        }
        works = find_works(akey)
        print akey, repr(a['name'])
        for w in works:
            w['author'] = akey
            work_queue.append(w)
        if len(work_queue) > 1000:
            for e in run_queue(work_queue):
                yield e
            work_queue = []
    for e in run_queue(work_queue):
        yield e
def update_work(w):
    wkey = w['key']
    assert wkey.startswith('/works')
    assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:], redirects)
    requests = [delete_xml]
    if w['type']['key'] == '/type/work' and w.get('title', None):
        try:
            doc = build_doc(w)
        except:
            print w
            raise
        if doc is not None:
            add = Element("add")
            add.append(doc)
            add_xml = tostring(add).encode('utf-8')
            requests.append(add_xml)
    return requests
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}
    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:].replace(":", r"\:"), redirects)
    requests = [delete_xml]

    # Handle edition records as well.
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            'key': edition['key'],
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition],
        }

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            doc = build_doc(w, obj_cache, resolve_redirects=resolve_redirects)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if doc is not None:
                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)
    return requests
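# Illustration of the request order update_work() produces: a <delete> covering the
# work key (with ':' escaped) plus any redirect keys, followed by an <add> holding
# the rebuilt document, e.g. for /works/OL123W (example output shape, not captured
# from a real run):
#
#     <delete><query>key:OL123W</query></delete>
#     <add><doc>...</doc></add>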
akey = sys.argv[1]

# out = open('book_cache', 'w')
# for b in books_query(akey):
#     print >> out, b
# out.close()
# sys.exit(0)

works = find_works(akey, get_books(akey, books_query(akey)))
#works = find_works(akey, get_books(akey, books_from_cache()))
do_updates = False

while True:  # until redirects repaired
    q = {'type': '/type/edition', 'authors': akey, 'works': None}
    work_to_edition = defaultdict(set)
    edition_to_work = defaultdict(set)
    for e in query_iter(q):
        if e.get('works', None):
            for w in e['works']:
                work_to_edition[w['key']].add(e['key'])
                edition_to_work[e['key']].add(w['key'])

    work_title = {}
    fix_redirects = []
    for k, editions in work_to_edition.items():
        w = withKey(k)
        if w['type']['key'] == '/type/redirect':
            print 'redirect found'
            wkey = w['location']
            assert re_work_key.match(wkey)
            for ekey in editions:
                e = withKey(ekey)
def build_data(w, obj_cache=None, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    p = SolrProcessor(obj_cache, resolve_redirects)
    get_pub_year = p.get_pub_year

    # Load stuff if not already provided
    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
    #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)
    editions = p.process_editions(w, identifiers)
    authors = p.extract_authors(w)
    has_fulltext = any(e.get('ocaid', None) for e in editions)
    subjects = p.get_subject_counts(w, editions, has_fulltext)

    def add_field(doc, name, value):
        doc[name] = value

    def add_field_list(doc, name, field_list):
        doc[name] = list(field_list)

    doc = p.build_data(w, editions, subjects, has_fulltext)

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()
    last_modified_i = datetimestr_to_int(w.get('last_modified'))

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], basestring):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], basestring)
                except AssertionError:
                    print e.get('ia')
                    print e['ia_loaded_id']
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], basestring):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], basestring)
                except AssertionError:
                    print e['key']
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    #if lending_edition or in_library_edition:
    #    add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)
    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)

    return doc
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
    #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)
    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        ia = None
        if 'ocaid' in e:
            ia = e['ocaid']
        elif 'ia_loaded_id' in e:
            loaded = e['ia_loaded_id']
            ia = loaded if isinstance(loaded, basestring) else loaded[0]
        if ia:
            ia_meta_fields = get_ia_collection_and_box_id(ia)
            collection = ia_meta_fields['collection']
            if 'ia_box_id' in e and isinstance(e['ia_box_id'], basestring):
                e['ia_box_id'] = [e['ia_box_id']]
            if ia_meta_fields.get('boxid'):
                box_id = list(ia_meta_fields['boxid'])[0]
                e.setdefault('ia_box_id', [])
                if box_id.lower() not in [x.lower() for x in e['ia_box_id']]:
                    e['ia_box_id'].append(box_id)
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        if 'identifiers' in e:
            for k, id_list in e['identifiers'].iteritems():
                k_orig = k
                k = k.replace('.', '_').replace(',', '_').replace('(', '').replace(')', '').replace(':', '_').replace('/', '').replace('#', '').lower()
                m = re_solr_field.match(k)
                if not m:
                    print (k_orig, k)
                assert m
                for v in id_list:
                    v = v.strip()
                    if v not in identifiers[k]:
                        identifiers[k].append(v)
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))
    #print len(w['editions']), 'editions found'
    #print w['key']

    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            continue  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)
    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
    #print w['key'], subjects['subject']

    doc = Element("doc")
    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                v = v.replace('-', '')
                isbn.add(v)
                alt = opposite_isbn(v)
                if alt:
                    isbn.add(alt)
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], basestring):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], basestring)
                except AssertionError:
                    print e.get('ia')
                    print e['ia_loaded_id']
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], basestring):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], basestring)
                except AssertionError:
                    print e['key']
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e.get('ia_collection', []):
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e.get('ia_collection', []))
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e.get('public_scan'):
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))
    if lending_edition or in_library_edition:
        add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)
    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)

    return doc
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        print 'bad key:', akey
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print a['type']['key']
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        except AttributeError:
            print 'redirects:', [r['key'] for r in query_iter(q)]
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')
    requests.append(tostring(add).encode('utf-8'))
    return requests
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
    print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))
    #print len(w['editions']), 'editions found'
    #print w['key']

    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            continue  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print
            for a in authors:
                print 'author:', a
            print w['key']
            print
            raise AuthorRedirect
    for a in authors:
        print 'author:', a
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print 'bad work: ', w['key']
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)
    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
    #print w['key'], subjects['subject']

    doc = Element("doc")
    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e['ia_collection']:
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
def find_author(name):
    q = {'type': '/type/author', 'name': name}
    return [a['key'] for a in query_iter(q)]
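# Example: find_author('Mark Twain') would return the matching author keys, e.g. a
# list like ['/a/OL...A'] (illustrative shape only; actual keys depend on the dataset).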
def build_doc(w):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
    print 'editions:', [e['key'] for e in w['editions']]

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', None))
    print len(w['editions']), 'editions found'
    print w['key']

    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:
            continue
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print 'invalid author key:', akey
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    #subjects = four_types(find_subjects(get_marc_subjects(w)))
    subjects = {}
    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)
    print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.iteritems():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print 'v:', v
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
    print w['key'], subjects['subject']

    doc = Element("doc")
    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]
    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], basestring)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])
    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
#        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
def switch_author(old, new, other):
    q = {
        'authors': old,
        'type': '/type/edition',
    }
    for e in query_iter(q):
        print 'switch author:', e['key']
        update_edition(e['key'], other, new)
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)

    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w

        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

#    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
#    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
#    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)
    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)
    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].items():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.items():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()
    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = list(w['existing_works'].keys())[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].keys())
        best_match = max(w['existing_works'].items(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        logger.error('bad key: %s', akey)
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        logger.error("AssertionError: %s", a['type']['key'])
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    if is_single_core():
        base_url = 'http://' + get_solr('works') + '/solr/select'
    else:
        base_url = 'http://' + get_solr('works') + '/solr/works/select'

    url = base_url + '?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)

    logger.info("urlopen %s", url)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    if is_single_core():
        add_field(doc, 'key', "/authors/" + author_id)
        add_field(doc, 'type', "author")
    else:
        add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        except AttributeError:
            logger.error('AttributeError: redirects: %r', [r['key'] for r in query_iter(q)])
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')
    requests.append(tostring(add).encode('utf-8'))
    return requests
set_staging(True)
rc = read_rc()
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []
for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'], w['title'])  # , ol.get(w['authors'][0]['key'])['name']
    full = ol.get(w['key'])
    authors = full['authors']
    assert all(isinstance(a, Reference) for a in authors)
    full['authors'] = [{'author': a} for a in authors]
    queue.append(full)
    if len(queue) > 1000:
        print('saving')
        print(ol.save_many(queue, 'update format of authors in works to provide roles'))
        queue = []

# save any records still queued when the loop ends
if queue:
    print('saving')
    print(ol.save_many(queue, 'update format of authors in works to provide roles'))
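
# For reference, the loop above rewrites the authors field of each work from
# a bare list of author references to a list of role dicts, e.g. (keys are
# illustrative):
#   before: ['/a/OL18319A']
#   after:  [{'author': '/a/OL18319A'}]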
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print >> fh_log, akey
    print >> fh_log, 'works:'
    pprint(works, fh_log)

    while True:  # until redirects are repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print >> fh_log, 'redirect found', w['key'], '->', wkey, editions
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w

        if not fix_redirects:
            print >> fh_log, 'no redirects left'
            break
        print >> fh_log, 'save redirects'
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print r
            raise

    all_existing = set()
    work_keys = []
    print >> fh_log, 'edition_to_work:'
    print >> fh_log, repr(dict(edition_to_work))
    print >> fh_log
    print >> fh_log, 'work_to_edition'
    print >> fh_log, repr(dict(work_to_edition))
    print >> fh_log

    #open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
    #open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
    #open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print 'bad work:', wkey
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)
    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)
    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print >> fh_log, wkey, 'already updated!'
                    print wkey, 'already updated!'
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
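
# A minimal driver sketch for update_works(). The author key and works list
# below are hypothetical; each works entry needs the 'title' and 'editions'
# shape consumed above. Note the assert near the top requires do_updates=True,
# so running this logs in as WorkBot and writes to the work_finder log.
def run_update_works_example():
    akey = '/a/OL1A'  # hypothetical author key
    works = [{'title': 'Example Title', 'editions': ['/b/OL1M', '/b/OL2M']}]
    for w in update_works(akey, works, do_updates=True):
        print w['key']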
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []

    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]

    deletes += redirect_keys
    deletes += [wkey[7:]]  # strip /works/ from /works/OL1234W

    # Handle edition records as well.
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            # Use key as /works/OL1M.
            # In case of single-core-solr, we use the full path as key, so it
            # is required to be unique across all types of documents.
            # The website takes care of redirecting /works/OL1M to /books/OL1M.
            'key': edition['key'].replace("/books/", "/works/"),
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition]
        }
        # Hack to add subjects when indexing /books/ia:xxx
        if edition.get("subjects"):
            w['subjects'] = edition['subjects']

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w, obj_cache=obj_cache, resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                # XXX-Anand: The works in the in_library subject were getting
                # wiped off for unknown reasons. I suspect that this might be
                # a cause. Disabling temporarily.
                #if d.get('ia'):
                #    deletes += ["ia:" + iaid for iaid in d['ia']]

                # In single-core solr, we use the full path as key, not just the last part
                if is_single_core():
                    deletes = ["/works/" + k for k in deletes]
                requests.append(make_delete_query(deletes))

                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)
    elif w['type']['key'] == '/type/delete':
        # In single-core solr, we use the full path as key, not just the last part
        if is_single_core():
            deletes = ["/works/" + k for k in deletes]
        requests.append(make_delete_query(deletes))

    return requests
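
# Sketch: update_work() also accepts an orphan edition and indexes it as a
# fake work via the /type/edition branch above. The record below is
# hypothetical; the returned requests are the usual delete query plus an
# <add> document keyed under /works/.
def index_orphan_edition_example():
    orphan = {
        'key': '/books/OL1M',
        'type': {'key': '/type/edition'},
        'title': 'An orphan edition',
    }
    return update_work(orphan)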