def solr_update_subjects(): global subjects_to_update print subjects_to_update subject_add = Element("add") for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', (subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml], debug=False, index='subjects') solr_update(['<commit />'], debug=True, index='subjects') subjects_to_update = set()
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type'][ 'key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update( (subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print('update author:', repr(akey)) try: request = update_author(akey) if request: requests += request except AttributeError: print('akey:', repr(akey)) raise if not args.no_commit: solr_update(requests + ['<commit/>'], debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', (subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True) authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset
def run_update(): global authors_to_update, works_to_update subjects_to_update = set() global last_update print 'running update: %s works %s authors' % (len(works_to_update), len(authors_to_update)) if works_to_update: requests = [] num = 0 total = len(works_to_update) for wkey in works_to_update: num += 1 print 'update work: %s %d/%d' % (wkey, num, total) if '/' in wkey[7:]: print 'bad wkey:', wkey continue work_to_update = withKey(wkey) for attempt in range(5): try: requests += update_work(work_to_update) except AuthorRedirect: print 'fixing author redirect' w = ol.get(wkey) need_update = False for a in w['authors']: r = ol.get(a['author']) if r['type'] == '/type/redirect': a['author'] = {'key': r['location']} need_update = True if need_update: if not done_login: rc = read_rc() ol.login('EdwardBot', rc['EdwardBot']) ol.save(w['key'], w, 'avoid author redirect') if work_to_update['type']['key'] == '/type/work' and work_to_update.get('title'): subjects = get_work_subjects(work_to_update) print subjects for subject_type, values in subjects.iteritems(): subjects_to_update.update((subject_type, v) for v in values) if len(requests) >= 100: solr_update(requests, debug=True) requests = [] # if num % 1000 == 0: # solr_update(['<commit/>'], debug=True) if requests: solr_update(requests, debug=True) if not args.no_commit: solr_update(['<commit/>'], debug=True) last_update = time() if not args.no_author_updates and authors_to_update: requests = [] for akey in authors_to_update: print 'update author:', `akey` try: request = update_author(akey) if request: requests += request except AttributeError: print 'akey:', `akey` raise if not args.no_commit: solr_update(requests + ['<commit/>'], index='authors', debug=True) subject_add = Element("add") print subjects_to_update for subject_type, subject_name in subjects_to_update: key = subject_type + '/' + subject_name count = subject_count(subject_type, subject_name) if not subject_need_update(key, count): print 'no updated needed:', (subject_type, subject_name, count) continue print 'updated needed:', (subject_type, subject_name, count) doc = Element("doc") add_field(doc, 'key', key) add_field(doc, 'name', subject_name) add_field(doc, 'type', subject_type) add_field(doc, 'count', count) subject_add.append(doc) if len(subject_add): print 'updating subjects' add_xml = tostring(subject_add).encode('utf-8') solr_update([add_xml, '<commit />'], debug=True, index='subjects') authors_to_update = set() works_to_update = set() subjects_to_update = set() print >> open(state_file, 'w'), offset