def merge_works(works):
    """Merge a list of work records into the first one.

    Pops the first work off `works` as the master, folds selected fields
    from the remaining works into it, repoints every edition of the merged
    works at the master, and replaces each merged work with a redirect.
    All resulting documents are saved in one ol.save_many() call.
    (Python 2 code: print statements, `basestring`.)
    """
    master = works.pop(0)
    master_first_publish_year = get_publish_year(
        master.get('first_publish_date'))
    # Keep the longest subtitle found across all works being merged.
    subtitles = sorted((w['subtitle'] for w in works if w.get('subtitle')),
                       key=lambda s: len(s))
    if subtitles and len(subtitles[-1]) > len(master.get('subtitle', '')):
        master['subtitle'] = subtitles[-1]
    updates = []
    for w in works:
        wkey = w.pop('key')
        # Repoint every edition of this work at the master work.
        q = {'type': '/type/edition', 'works': wkey}
        for ekey in ol.query(q):
            e = ol.get(ekey)
            assert len(e['works']) == 1 and e['works'][0] == wkey
            e['works'] = [Reference(master['key'])]
            updates.append(e)
        assert w['type'] != Reference('/type/redirect')
        # Replace the merged work itself with a redirect to the master.
        updates.append({
            'key': wkey,
            'type': Reference('/type/redirect'),
            'location': master['key'],
        })
        # Union list-valued fields into the master, preserving order.
        for f in 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number':
            if not w.get(f):
                continue
            assert not isinstance(w[f], basestring)
            for i in w[f]:
                if i not in master.setdefault(f, []):
                    master[f].append(i)
        if w.get('first_sentence') and not master.get('first_sentence'):
            master['first_sentence'] = w['first_sentence']
        # Keep the earliest first_publish_date across all merged works.
        if w.get('first_publish_date'):
            if not master.get('first_publish_date'):
                master['first_publish_date'] = w['first_publish_date']
            else:
                publish_year = get_publish_year(w['first_publish_date'])
                if publish_year < master_first_publish_year:
                    master['first_publish_date'] = w['first_publish_date']
                    master_first_publish_year = publish_year
        # NOTE(review): 'exceprts' is spelled differently from the 'excerpts'
        # field deleted below. If the stored field is really 'excerpts', this
        # copy never fires and any leftover 'exceprts' key would trip the
        # `assert not w` -- confirm against the data before changing spelling.
        for excerpt in w.get('exceprts', []):
            master.setdefault('exceprts', []).append(excerpt)
        # Drop every field that has been merged or deliberately discarded;
        # anything remaining afterwards is unexpected (checked by the assert).
        for f in 'title', 'subtitle', 'created', 'last_modified', 'latest_revision', 'revision', 'number_of_editions', 'type', 'first_sentence', 'authors', 'first_publish_date', 'excerpts', 'covers', 'subjects', 'subject_places', 'subject_people', 'subject_times', 'lc_classifications', 'dewey_number':
            try:
                del w[f]
            except KeyError:
                pass
        print w
        assert not w
    updates.append(master)
    print len(updates), [(doc['key'], doc['type']) for doc in updates]
    # update master
    # update editions to point at master
    # replace works with redirects
    print ol.save_many(updates, 'merge works')
def try_merge(e1, edition_key, thing):
    """Check whether edition `thing` (stored at `edition_key`) matches e1.

    Presumably True means "treat as the same edition" and False means
    "clearly different / unusable" -- confirm against callers.
    NOTE(review): Python 2 variant; this copy ends right after the IA
    lookup, while the fuller implementation elsewhere also falls back to
    archive/Amazon MARC sources.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print thing['key'], 'is', str(thing['type'])
        if thing_type == Reference('/type/delete'):
            return False
        assert thing_type == Reference('/type/edition')
    # If the edition already carries source_records, compare on those directly.
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        # Derive an Internet Archive identifier from the MARC source string.
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            print error.code
            # Anything other than not-found/forbidden is unexpected.
            assert error.code in (404, 403)
        if not rec2:
            # No MARC record could be fetched -- fall through to True.
            return True
def run_merge(ia):
    """Merge all edition records associated with one Internet Archive item.

    Looks up the space-separated edition keys stored for `ia` in the
    `merge` table, merges their fields into the lowest-numbered edition,
    and replaces the other editions with redirects.  Fields that
    build_merged() left unresolved (None) must be chosen via the submitted
    form; otherwise the user is redirected back to the merge page.
    Returns a redirect response in both cases.
    """
    cur = g.db.cursor()
    # DB-API parameters are a sequence; the update at the bottom already
    # passes [ia], so be consistent here instead of passing a bare string.
    cur.execute('select editions from merge where ia=%s', [ia])
    [ekeys] = cur.fetchone()
    # Sort numerically by OL id so the oldest (lowest) key becomes master.
    ekeys = [
        '/books/OL%dM' % x
        for x in sorted(
            int(re_edition_key.match(ekey).group(1))
            for ekey in ekeys.split(' ')
        )
    ]
    min_ekey = ekeys[0]
    editions = [ol.get(ekey) for ekey in ekeys]
    # Keyed by the bare "OL...M" part (strip the '/books/' prefix).
    editions_by_key = {e['key'][7:]: e for e in editions}
    merged = build_merged(editions)

    # Any field build_merged() could not resolve needs a manual choice
    # from the submitted form.
    missing = []
    for k, v in merged.items():
        if v is not None:
            continue
        use_ekey = request.form.get(k)
        if use_ekey is None:
            missing.append(k)
            continue
        merged[k] = editions_by_key[use_ekey][k]
    if missing:
        flash('please select: ' + ', '.join(missing))
        return redirect(url_for('merge', ia=ia))

    master = ol.get(min_ekey)
    for k, v in merged.items():
        master[k] = v

    updates = [master]
    for ekey in ekeys:
        if ekey == min_ekey:
            continue
        updates.append({
            'type': Reference('/type/redirect'),
            'location': min_ekey,
            'key': ekey,
        })
    ol.save_many(updates, 'merge lending editions')
    cur.execute('update merge set done=now() where ia=%s', [ia])
    flash(ia + ' merged')
    return redirect(url_for('index'))
def add_work(akey, w):
    """Create a work page for author `akey` and yield its updated editions.

    Creates a new /type/work titled from `w`, logs it, then yields each
    edition listed in w['editions'] with its 'works' field pointed at the
    new work.  Callers are expected to save the yielded editions.
    """
    new_work = {
        'authors': [{'author': Reference(akey)}],
        'type': '/type/work',
        'title': w['title'],
    }
    try:
        wkey = ol.new(new_work, comment='create work page')
    except:
        # Dump the request body before re-raising so failures are debuggable.
        print(new_work)
        raise
    write_log('work', wkey, w['title'])
    assert isinstance(wkey, six.string_types)
    for ekey in w['editions']:
        edition = ol.get(ekey)
        fix_edition(ekey, edition, ol)
        write_log('edition', ekey, edition.get('title', 'title missing'))
        edition['works'] = [Reference(wkey)]
        yield edition
def update_edition(ekey, wkey):
    """Point edition `ekey` at work `wkey`.

    Returns the modified edition record, or None when the edition is
    already linked to a different work (the conflict is printed and left
    for manual follow-up).
    """
    edition = ol.get(ekey)
    fix_edition(ekey, edition, ol)
    write_log('edition', ekey, edition.get('title', 'title missing'))
    if edition.get('works', []):
        assert len(edition['works']) == 1
        if edition['works'][0] != wkey:
            # Conflicting work link: report, do not overwrite.
            print('e:', edition)
            print('wkey:', wkey)
            print('ekey:', ekey)
            print('e["works"]:', edition['works'])
            return None
    edition['works'] = [Reference(wkey)]
    return edition
def update_work_edition(ekey, wkey, use):
    """Rewrite edition `ekey`'s work list, replacing `wkey` with `use`.

    Duplicate entries are dropped.  The edition is saved only when the
    work list actually changes.
    """
    print((ekey, wkey, use))
    edition = ol.get(ekey)
    new_works = []
    for work in edition['works']:
        if work == wkey:
            # NOTE(review): membership is tested on the raw `use` value but a
            # Reference(use) is appended -- presumably these compare equal;
            # confirm Reference's equality semantics.
            if use not in new_works:
                new_works.append(Reference(use))
        elif work not in new_works:
            new_works.append(work)
    if edition['works'] == new_works:
        return
    print('before:', edition['works'])
    print('after:', new_works)
    edition['works'] = new_works
    print(ol.save(edition['key'], edition, 'remove duplicate work page'))
def add_works(works):
    """Create work pages in bulk.

    Each item in `works` supplies 'author', 'title', and optionally
    'subjects'.  Builds one /type/work query dict per item and submits
    them in a single ol.new() call, returning its result.  On failure the
    submitted payload is printed before the exception is re-raised.
    """
    q = []
    for w in works:
        cur = {
            'authors': [{'author': Reference(w['author'])}],
            'type': '/type/work',
            'title': w['title'],
        }
        if 'subjects' in w:
            cur['subjects'] = w['subjects']
        q.append(cur)
    try:
        return ol.new(q, comment='create work page')
    # Narrowed from a bare `except:` -- still re-raises, but no longer
    # intercepts KeyboardInterrupt/SystemExit just to print the payload.
    except Exception:
        print(q)
        raise
# NOTE(review): fragment -- this chunk starts mid-function (`e`, `all_keys`
# and `editions` are defined earlier, outside this view) and the final
# statement below is cut off, so indentation here is reconstructed.
all_keys.update(k for k, v in e.items() if v)
# Bookkeeping/system fields are never candidates for merging.
for k in 'latest_revision', 'revision', 'created', 'last_modified', 'key', 'type', 'genres':
    if k in all_keys:
        all_keys.remove(k)
# Subject-like fields are excluded as well.
for k in all_keys.copy():
    if k.startswith('subject'):
        all_keys.remove(k)
for e in editions:  # resolve redirects
    if 'authors' not in e:
        continue
    new_authors = []
    for akey in e['authors']:
        a = ol.get(akey)
        if a['type'] == Reference('/type/redirect'):
            # Follow the redirect to the real author record.
            akey = Reference(a['location'])
        else:
            assert a['type'] == Reference('/type/author')
        new_authors.append(akey)
    e['authors'] = new_authors
k = 'publish_date'
# Collect full publish dates; 4-character values look like bare years
# and are skipped -- TODO confirm intent.
publish_dates = set(e[k] for e in editions if k in e and len(e[k]) != 4)
k = 'pagination'
all_pagination = set(e[k].strip(':.') for e in editions if e.get(k))
one_item_lists = {}
for k in 'lc_classifications', 'publishers', 'contributions', 'series':
    # NOTE(review): truncated here -- the set(...) call below is missing its
    # closing parenthesis and, presumably, a filter clause on e.get(k).
    one_item_lists[k] = set(e[k][0].strip('.') for e in editions
def try_merge(e1, edition_key, thing):
    """Check whether edition `thing` (stored at `edition_key`) matches e1.

    Presumably True means "treat as the same edition" and False means
    "clearly different / unusable" -- confirm against callers.  MARC data
    for the candidate is fetched from the Internet Archive when possible,
    otherwise from the archive catalog or Amazon, then compared with
    attempt_merge().
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
        if thing_type == Reference('/type/delete'):
            return False
        assert thing_type == Reference('/type/edition')
    # If the edition already carries source_records, compare on those directly.
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        # Derive an Internet Archive identifier from the MARC source string.
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            print(error.code)
            # Anything other than not-found/forbidden is unexpected.
            assert error.code in (404, 403)
        if not rec2:
            # IA item exists but yielded no MARC record -- treat as a match.
            return True
    if not rec2:
        # No IA record: fall back to the MARC catalog entry for this edition.
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # Dump enough context to reproduce the failure, then re-raise.
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
def toc_items(toc_list):
    """Wrap each table-of-contents entry in a /type/toc_item dict."""
    items = []
    for entry in toc_list:
        items.append({
            'title': six.text_type(entry),
            'type': Reference('/type/toc_item'),
        })
    return items
def toc_items(toc_list):
    """Wrap each table-of-contents entry in a /type/toc_item dict.

    Entries are coerced to text with six.text_type instead of the
    Python-2-only `unicode` builtin, so this runs on Python 3 as well and
    matches the six-based toc_items implementation used elsewhere in this
    codebase.
    """
    return [{
        'title': six.text_type(item),
        'type': Reference('/type/toc_item'),
    } for item in toc_list]