def test_update_author(self, monkeypatch):
    """A live author should produce exactly one Solr <add> request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL25A', name='Somebody')])
    # Canned Solr reply: zero hits, every facet list empty.
    solr_no_results = MockResponse({
        "facet_counts": {
            "facet_fields": {
                "place_facet": [],
                "person_facet": [],
                "subject_facet": [],
                "time_facet": [],
            }
        },
        "response": {"numFound": 0},
    })
    monkeypatch.setattr(update_work.requests, 'get',
                        lambda url, **kwargs: solr_no_results)

    reqs = update_work.update_author('/authors/OL25A')

    assert isinstance(reqs, list)
    assert len(reqs) == 1
    only = reqs[0]
    assert isinstance(only, update_work.UpdateRequest)
    xml = only.toxml()
    assert xml.startswith('<add>')
    assert '<field name="key">/authors/OL25A</field>' in xml
def test_update_author(self):
    """A live author should produce exactly one Solr <add> request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL25A', name='Somebody')])
    # Canned Solr reply: zero hits, every facet list empty.
    solr_no_results = MockResponse({
        "facet_counts": {
            "facet_fields": {
                "place_facet": [],
                "person_facet": [],
                "subject_facet": [],
                "time_facet": [],
            }
        },
        "response": {"numFound": 0},
    })
    with mock.patch('openlibrary.solr.update_work.urlopen',
                    return_value=solr_no_results):
        reqs = update_work.update_author('/authors/OL25A')

    assert isinstance(reqs, list)
    assert len(reqs) == 1
    only = reqs[0]
    assert isinstance(only, update_work.UpdateRequest)
    xml = only.toxml()
    assert xml.startswith('<add>')
    assert '<field name="key">/authors/OL25A</field>' in xml
def solr_update_authors():
    """Push every queued author to the 'authors' Solr core, then commit.

    Consumes the module-global ``authors_to_update`` queue: for each entry,
    deletes the documents of its redirect keys and re-indexes the master
    record, finally issuing one commit and emptying the queue.
    """
    global authors_to_update
    for author in authors_to_update:
        # One <delete> covering all redirect keys (id = bare OL author id).
        redirect_ids = ''.join(
            '<id>%s</id>' % re_author_key.match(akey).group(1)
            for akey in author['redirects'])
        updates = ['<delete>' + redirect_ids + '</delete>']
        updates += update_author(author['master_key'], a=author['master'],
                                 handle_redirects=False)
        solr_update(updates, index='authors', debug=False)
    solr_update(['<commit/>'], index='authors', debug=True)
    authors_to_update = []
def test_delete_author(self):
    """An author typed /type/delete should yield a delete-by-query request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL23A', type={'key': '/type/delete'})])
    reqs = update_work.update_author('/authors/OL23A')
    assert isinstance(reqs, list)
    head = reqs[0]
    assert isinstance(head, update_work.DeleteRequest)
    assert head.toxml() == '<delete><query>key:/authors/OL23A</query></delete>'
def test_redirect_author(self):
    """A redirected author resolves to a Solr delete-by-query request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL24A', type={'key': '/type/redirect'})])
    reqs = update_work.update_author('/authors/OL24A')
    assert isinstance(reqs, list)
    head = reqs[0]
    assert isinstance(head, update_work.DeleteRequest)
    assert head.toxml() == '<delete><query>key:/authors/OL24A</query></delete>'
def test_redirect_author(self):
    """A redirected author resolves to a Solr delete-by-id request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL24A', type={'key': '/type/redirect'})])
    reqs = update_work.update_author('/authors/OL24A')
    assert isinstance(reqs, list)
    head = reqs[0]
    assert isinstance(head, update_work.DeleteRequest)
    assert head.toxml() == '<delete><id>/authors/OL24A</id></delete>'
def solr_update_authors(authors_to_update): for a in authors_to_update: try: author_updates = ['<delete>' + ''.join('<id>%s</id>' % re_author_key.match(akey).group(1) for akey in a['redirects']) + '</delete>'] except: print 'redirects' print a['redirects'] raise author_updates += update_author(a['master_key'], a=a['master'], handle_redirects=False) solr_update(author_updates, index='authors', debug=False) solr_update(['<commit/>'], index='authors', debug=True)
def run_update():
    """Flush the queued work and author keys to Solr, then reset the queues.

    Reads and writes the module globals works_to_update, authors_to_update
    and last_update; also uses ol, state_file, offset, done_login defined
    elsewhere in the file (not visible here).
    """
    # NOTE(review): re-indented from a collapsed one-line source; verify
    # block nesting against upstream history before relying on it.
    global authors_to_update
    global works_to_update
    global last_update
    print "running update: %s works %s authors" % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print "update work: %s %d/%d" % (wkey, num, total)
            # An extra '/' past the 7-char "/works/" prefix means a malformed key.
            if "/" in wkey[7:]:
                print "bad wkey:", wkey
                continue
            # Retry up to 5 times: each AuthorRedirect is repaired in place
            # by rewriting the work's author links, then the update retried.
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print "fixing author redirect"
                    w = ol.get(wkey)
                    need_update = False
                    for a in w["authors"]:
                        r = ol.get(a["author"])
                        if r["type"] == "/type/redirect":
                            a["author"] = {"key": r["location"]}
                            need_update = True
                    assert need_update
                    print w
                    # presumably done_login is a module global set on first
                    # login — TODO confirm against the rest of the file.
                    if not done_login:
                        rc = read_rc()
                        ol.login("EdwardBot", rc["EdwardBot"])
                    ol.save(w["key"], w, "avoid author redirect")
            # Flush in batches of 100 to bound the request size.
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        # if num % 1000 == 0:
        # solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(["<commit/>"], debug=True)
        last_update = time()
        # Persist the feed offset so a restart resumes from here.
        print >> open(state_file, "w"), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print "update author:", akey
            requests += update_author(akey)
        solr_update(requests + ["<commit/>"], index="authors", debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, "w"), offset
def run_update():
    """Flush the queued work and author keys to Solr, then reset the queues.

    Reads and writes the module globals works_to_update, authors_to_update
    and last_update; also uses ol, state_file, offset, done_login defined
    elsewhere in the file (not visible here).
    """
    # NOTE(review): re-indented from a collapsed one-line source; verify
    # block nesting against upstream history before relying on it.
    global authors_to_update
    global works_to_update
    global last_update
    print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update))
    if works_to_update:
        requests = []
        num = 0
        total = len(works_to_update)
        for wkey in works_to_update:
            num += 1
            print 'update work: %s %d/%d' % (wkey, num, total)
            # An extra '/' past the 7-char "/works/" prefix means a malformed key.
            if '/' in wkey[7:]:
                print 'bad wkey:', wkey
                continue
            # Retry up to 5 times: each AuthorRedirect is repaired in place
            # by rewriting the work's author links, then the update retried.
            for attempt in range(5):
                try:
                    requests += update_work(withKey(wkey))
                    break
                except AuthorRedirect:
                    print 'fixing author redirect'
                    w = ol.get(wkey)
                    need_update = False
                    for a in w['authors']:
                        r = ol.get(a['author'])
                        if r['type'] == '/type/redirect':
                            a['author'] = {'key': r['location']}
                            need_update = True
                    assert need_update
                    print w
                    # presumably done_login is a module global set on first
                    # login — TODO confirm against the rest of the file.
                    if not done_login:
                        rc = read_rc()
                        ol.login('EdwardBot', rc['EdwardBot'])
                    ol.save(w['key'], w, 'avoid author redirect')
            # Flush in batches of 100 to bound the request size.
            if len(requests) >= 100:
                solr_update(requests, debug=True)
                requests = []
        # if num % 1000 == 0:
        # solr_update(['<commit/>'], debug=True)
        if requests:
            solr_update(requests, debug=True)
        solr_update(['<commit/>'], debug=True)
        last_update = time()
        # Persist the feed offset so a restart resumes from here.
        print >> open(state_file, 'w'), offset
    if authors_to_update:
        requests = []
        for akey in authors_to_update:
            print 'update author:', akey
            requests += update_author(akey)
        solr_update(requests + ['<commit/>'], index='authors', debug=True)
    authors_to_update = set()
    works_to_update = set()
    print >> open(state_file, 'w'), offset
def test_update_author(self, monkeypatch):
    """A live author should produce exactly one Solr <add> request."""
    update_work.data_provider = FakeDataProvider([
        make_author(key='/authors/OL25A', name='Somebody')
    ])
    # Minimal Solr response, author not found in Solr
    canned_solr_json = """{ "facet_counts": { "facet_fields": { "place_facet": [], "person_facet": [], "subject_facet": [], "time_facet": [] } }, "response": {"numFound": 0} }"""
    monkeypatch.setattr(update_work, 'urlopen',
                        lambda url: StringIO(canned_solr_json))

    reqs = update_work.update_author('/authors/OL25A')

    assert isinstance(reqs, list)
    assert len(reqs) == 1
    only = reqs[0]
    assert isinstance(only, update_work.UpdateRequest)
    xml = only.toxml()
    assert xml.startswith('<add>')
    assert '<field name="key">/authors/OL25A</field>' in xml
def test_update_author(self, monkeypatch):
    """A live author should produce exactly one Solr <add> request."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL25A', name='Somebody')])
    # Minimal Solr response, author not found in Solr
    canned_solr_json = """{ "facet_counts": { "facet_fields": { "place_facet": [], "person_facet": [], "subject_facet": [], "time_facet": [] } }, "response": {"numFound": 0} }"""
    monkeypatch.setattr(update_work, 'urlopen',
                        lambda url: StringIO(canned_solr_json))

    reqs = update_work.update_author('/authors/OL25A')

    assert isinstance(reqs, list)
    assert len(reqs) == 1
    only = reqs[0]
    assert isinstance(only, update_work.UpdateRequest)
    xml = only.toxml()
    assert xml.startswith('<add>')
    assert '<field name="key">/authors/OL25A</field>' in xml
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type'][ 'key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update( (subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print('update author:', repr(akey)) try: request = update_author(akey) if request: requests += request except AttributeError: print('akey:', repr(akey)) raise if not args.no_commit: solr_update(requests + ['<commit/>'], debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True) authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
def test_redirect_author(self):
    """A redirected author becomes a JSON delete command."""
    update_work.data_provider = FakeDataProvider(
        [make_author(key='/authors/OL24A', type={'key': '/type/redirect'})])
    reqs = update_work.update_author('/authors/OL24A')
    assert reqs[0].to_json_command() == '"delete": ["/authors/OL24A"]'
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print 'update author:', `akey` try: request = update_author(akey) if request: requests += request except AttributeError: print 'akey:', `akey` raise if not args.no_commit: solr_update(requests + ['<commit/>'], index='authors', debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', 
(subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True, index='subjects') authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
# NOTE(review): this line holds the tail of a function whose `def` lies
# outside this view (picks the existing work with the most matches, updates
# it, and returns the refreshed records) fused with the script entry point
# (reindex one author's works and the author itself in Solr).  Left
# verbatim: the original nesting cannot be recovered from the collapsed
# source — restore indentation from upstream history before running.
best_match = max(w['existing_works'].iteritems(), key=lambda i:i[1])[0] w['best_match'] = work_by_key[best_match] updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log) for wkey in updated: if wkey in works_updated_this_session: print(wkey, 'already updated!', file=fh_log) print(wkey, 'already updated!') works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)
# NOTE(review): this line holds the tail of a function whose `def` lies
# outside this view (Python 2 variant: reports already-updated works and
# returns the refreshed records) fused with the script entry point
# (reindex one author's works and the author itself in Solr).  Left
# verbatim: the original nesting cannot be recovered from the collapsed
# source — restore indentation from upstream history before running.
for wkey in updated: if wkey in works_updated_this_session: print >> fh_log, wkey, 'already updated!' print wkey, 'already updated!' works_updated_this_session.update(updated) #if not do_updates: # return [] return [withKey(key) for key in works_updated_this_session] if __name__ == '__main__': akey = '/authors/' + sys.argv[1] title_redirects = find_title_redirects(akey) works = find_works(akey, get_books(akey, books_query(akey)), existing=title_redirects) to_update = update_works(akey, works, do_updates=True) requests = [] for w in to_update: requests += update_work(w) if to_update: solr_update(requests + ['<commit />'], debug=True) requests = update_author(akey) solr_update(requests + ['<commit/>'], debug=True)