def run_update():
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot'])
                    ol.save(w['key'], w, 'avoid author redirect')
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
        last_update = time()
        print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset

def add_cover_to_work(w):
    global ol  # `ol` is rebound below when not yet initialised, so it must be declared global
    if 'cover_edition' in w:
        return
    q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    if ol is None:
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot'])
    print ol.save(w['key'], w, 'added cover to work')

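# Hypothetical usage of add_cover_to_work(), not part of the original script.
# It assumes `ol` has already been initialised and that the (invented) work key
# exists; the helper is a no-op for works that already have a cover_edition.
w = ol.get('/works/OL123W')
add_cover_to_work(w)
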
def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break  # success, stop retrying
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
        last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', repr(akey)
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', repr(akey)
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)
        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)
        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)
    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True)
    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset

import re, web, sys
import simplejson as json
from urllib2 import urlopen, URLError
from openlibrary.catalog.read_rc import read_rc
from openlibrary.catalog.importer.db_read import get_mc
from time import sleep
from openlibrary.catalog.title_page_img.load import add_cover_image
from openlibrary.api import OpenLibrary, unmarshal, marshal
from pprint import pprint

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

re_meta_mrc = re.compile(r'^([^/]*)_meta.mrc:0:\d+$')

def make_redirect(old, new, msg='replace with redirect'):
    r = {'type': {'key': '/type/redirect'}, 'location': new}
    ol.save(old, r, msg)

def fix_toc(e):
    toc = e.get('table_of_contents', None)
    if not toc:
        return
    print e['key']
    pprint(toc)
    # http://openlibrary.org/books/OL789133M - /type/toc_item missing from table_of_contents
    if isinstance(toc[0], dict) and ('pagenum' in toc[0] or toc[0]['type'] == '/type/toc_item'):
        return
    return [{'title': unicode(i), 'type': '/type/toc_item'} for i in toc if i != u'']

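# Hypothetical usage of the helpers above, not part of the original module.
# The edition key comes from the comment in fix_toc(); whether that record
# still needs fixing is an assumption.
e = ol.get('/books/OL789133M')
new_toc = fix_toc(e)
if new_toc:
    e['table_of_contents'] = new_toc
    ol.save(e['key'], e, 'repair table of contents')
# make_redirect() points a duplicate record at its replacement, e.g.
# make_redirect('/books/OL1M', '/books/OL2M')  -- both keys invented.
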
from __future__ import print_function
from openlibrary.catalog.merge.merge_marc import *
from openlibrary.catalog.read_rc import read_rc
import openlibrary.catalog.merge.amazon as amazon
from openlibrary.catalog.get_ia import *
from openlibrary.catalog.importer.db_read import withKey, get_mc
from openlibrary.api import OpenLibrary, Reference
import openlibrary.catalog.marc.fast_parse as fast_parse
import xml.parsers.expat
import re  # used by re.compile below; not reliably provided by the * imports
import web, sys
from time import sleep
import six

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

ia_db = web.database(dbn='mysql', db='archive', user=rc['ia_db_user'], pw=rc['ia_db_pass'], host=rc['ia_db_host'])
ia_db.printing = False

re_meta_marc = re.compile(r'([^/]+)_(meta|marc)\.(mrc|xml)')

threshold = 875
amazon.set_isbn_match(225)

def run_update():
    global authors_to_update, works_to_update
    subjects_to_update = set()
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            work_to_update = withKey(wkey)
            for attempt in range(5):
                try:
                    requests += update_work(work_to_update)
                    break  # success, stop retrying
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    if need_update:
                        if not done_login:
                            rc = read_rc()
                            ol.login('EdwardBot', rc['EdwardBot'])
                        ol.save(w['key'], w, 'avoid author redirect')
            if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'):
                subjects = get_work_subjects(work_to_update)
                print subjects
                for subject_type, values in subjects.iteritems():
                    subjects_to_update.update((subject_type, v) for v in values)
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
#            if num % 1000 == 0:
#                solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        if not args.no_commit:
            solr_update(['<commit/>'], debug=True)
        last_update = time()
    if not args.no_author_updates and authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', `akey`
            try:
                request = update_author(akey)
                if request:
                    requests += request
            except AttributeError:
                print 'akey:', `akey`
                raise
        if not args.no_commit:
            solr_update(requests + ['<commit/>'], index='authors', debug=True)
    subject_add = Element("add")
    print subjects_to_update
    for subject_type, subject_name in subjects_to_update:
        key = subject_type + '/' + subject_name
        count = subject_count(subject_type, subject_name)
        if not subject_need_update(key, count):
            print 'no update needed:', (subject_type, subject_name, count)
            continue
        print 'update needed:', (subject_type, subject_name, count)
        doc = Element("doc")
        add_field(doc, 'key', key)
        add_field(doc, 'name', subject_name)
        add_field(doc, 'type', subject_type)
        add_field(doc, 'count', count)
        subject_add.append(doc)
    if len(subject_add):
        print 'updating subjects'
        add_xml = tostring(subject_add).encode('utf-8')
        solr_update([add_xml, '<commit />'], debug=True, index='subjects')
    authors_to_update = set()
    works_to_update = set()
    subjects_to_update = set()
    print >> open(state_file, 'w'), offset

def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates
    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)
#    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
#    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
#    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)
    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)
    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

#    if not do_updates:
#        return []

    return [withKey(key) for key in works_updated_this_session]

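# Hypothetical call illustrating the input shape update_works() expects,
# inferred from the field accesses above ('title', 'editions', edition 'key');
# the author key, title and edition keys are invented.  do_updates=True is
# required by the assert and will log in as WorkBot, write to Open Library and
# append to the log directory used above.
candidate_works = [
    {'title': 'An Example Title',
     'editions': ['/b/OL1M', {'key': '/b/OL2M'}]},
]
updated = update_works('/a/OL1A', candidate_works, do_updates=True)
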
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates
    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print >> fh_log, akey
    print >> fh_log, 'works:'
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print >> fh_log, 'redirect found', w['key'], '->', wkey, editions
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print >> fh_log, 'no redirects left'
            break
        print >> fh_log, 'save redirects'
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print r
            raise

    all_existing = set()
    work_keys = []
    print >> fh_log, 'edition_to_work:'
    print >> fh_log, `dict(edition_to_work)`
    print >> fh_log
    print >> fh_log, 'work_to_edition'
    print >> fh_log, `dict(work_to_edition)`
    print >> fh_log
#    open('edition_to_work', 'w').write(`dict(edition_to_work)`)
#    open('work_to_edition', 'w').write(`dict(work_to_edition)`)
#    open('work_by_key', 'w').write(`dict(work_by_key)`)

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print 'bad work:', wkey
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)
    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)
    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].iteritems():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.iteritems():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = w['existing_works'].keys()[0]
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print >> fh_log, wkey, 'already updated!'
                    print wkey, 'already updated!'
            works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
            works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'].iterkeys())
        best_match = max(w['existing_works'].iteritems(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print >> fh_log, wkey, 'already updated!'
                print wkey, 'already updated!'
        works_updated_this_session.update(updated)

#    if not do_updates:
#        return []

    return [withKey(key) for key in works_updated_this_session]