def get_work_subjects(w):
    """Collect and combine subjects for a work from its editions' MARC/IA records.

    Gathers 'ia:' and 'marc:' source-record identifiers from each edition;
    when an edition has no source_records, falls back to the MARC-source
    lookup (get_mc) keyed on the old-style '/b/' edition key. Records
    ending in 'initial import' are reported via bad_source_record() and
    skipped, as are 'amazon:' and IA-style MARC entries.

    :param w: work dict with an 'editions' list of edition dicts
    :return: result of combine_subjects() over the per-record subject lists
    """
    found = set()
    for e in w['editions']:
        # NOTE: renamed from 'sr' to avoid clashing with the loop variable
        # reused for found-set members below.
        source_records = e.get('source_records', [])
        if source_records:
            for rec_id in source_records:
                # 'initial import' records carry no usable MARC data
                if rec_id.endswith('initial import'):
                    bad_source_record(e, rec_id)
                    continue
                if rec_id.startswith(('ia:', 'marc:')):
                    found.add(rec_id)
        else:
            # No source_records: consult the MARC-source table under the
            # old-style /b/OL...M key.
            m = re_edition_key.match(e['key'])
            mc = get_mc('/b/' + m.group(1))
            if mc:
                if mc.endswith('initial import'):
                    bad_source_record(e, mc)
                    continue
                if not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                    found.add('marc:' + mc)
    subjects = []
    for src in found:
        if src.startswith('marc:ia:'):
            subjects.append(get_subjects_from_ia(src[8:]))
        elif src.startswith('marc:'):
            loc = src[5:]
            data = get_from_archive(loc)
            rec = MarcBinary(data)
            subjects.append(read_subjects(rec))
        else:
            assert src.startswith('ia:')
            subjects.append(get_subjects_from_ia(src[3:]))
    return combine_subjects(subjects)
def get_marc_src(e):
    """Yield ('ia' | 'marc', identifier) pairs for every MARC source of an edition.

    Consults the MARC-source lookup (get_mc) first, then the edition's own
    source_records, skipping entries that duplicate the lookup result.
    Amazon entries are discarded entirely.
    """
    mc = get_mc(e['key'])
    if mc and mc.startswith('amazon:'):
        mc = None  # amazon records are not MARC sources
    if mc:
        if mc.startswith('ia:'):
            yield 'ia', mc[3:]
        else:
            ia_match = re_ia_marc.match(mc)
            if ia_match:
                yield 'ia', ia_match.group(1)
            else:
                yield 'marc', mc
    for src in e.get('source_records', []):
        if src.startswith('ia:'):
            if not mc or src != mc:
                yield 'ia', src[3:]
        elif src.startswith('marc:'):
            if not mc or src != 'marc:' + mc:
                yield 'marc', src[5:]
def get_work_subjects(w, do_get_mc=True):
    """Collect and combine subjects from the MARC/IA source records of a work.

    :param w: work dict with an 'editions' list
    :param do_get_mc: when False, skip the MARC-source table fallback for
        editions without source_records
    :return: result of combine_subjects()
    """
    found = set()
    for edition in w['editions']:
        records = edition.get('source_records', [])
        if records:
            for rec in records:
                if rec.endswith('initial import'):
                    continue  # carries no usable MARC data
                if rec.startswith(('ia:', 'marc:')):
                    found.add(rec)
        else:
            mc = None
            if do_get_mc:
                # fall back to the lookup table via the old-style /b/ key
                m = re_edition_key.match(edition['key'])
                mc = get_mc('/b/' + m.group(1))
            if mc and not mc.endswith('initial import'):
                if not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                    found.add('marc:' + mc)
    subjects = []
    for src in found:
        if src.startswith('marc:ia:'):
            subjects.append(get_subjects_from_ia(src[8:]))
        elif src.startswith('marc:'):
            data = get_from_archive(src[5:])
            subjects.append(read_subjects(MarcBinary(data)))
        else:
            assert src.startswith('ia:')
            subjects.append(get_subjects_from_ia(src[3:]))
    return combine_subjects(subjects)
def try_merge(edition, ekey, thing):
    """Decide whether an Amazon-sourced edition matches an existing edition record.

    :param edition: Amazon edition data (must carry isbn_10 or asin)
    :param ekey: key of the existing edition
    :param thing: existing edition record (must be /type/edition)
    :return: True/False merge decision
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = [a['name'] for a in edition['authors']] if 'authors' in edition else []
    amazon_rec = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'
    if 'source_records' in thing:
        # exact source-record hit short-circuits the comparison
        return ('amazon:' + asin in thing['source_records']
                or source_records_match(amazon_rec, thing))
    mc = get_mc(ekey)
    if mc == 'amazon:' + asin:
        return True
    if not mc:
        return False
    marc_data = get_from_local(mc)
    marc_edition = build_marc(fast_parse.read_edition(marc_data))
    return amazon_merge.attempt_merge(amazon_rec, marc_edition, threshold, debug=False)
def add_source_records(key, ia, v=None):
    """Add an 'ia:<ia>' source record to the edition at *key* and save it.

    Python 2 code (print statements). Converts the old-style '/b/' key to
    a '/books/' key, ensures e['ocaid'] is set, seeds source_records from
    the MARC-source lookup table when the field is missing, then fixes the
    TOC, subjects and author redirects before saving and adding a cover.

    :param key: old-style edition key, e.g. '/b/OL123M'
    :param ia: Internet Archive identifier
    :param v: optional revision passed through to ol.get()
    """
    new = 'ia:' + ia
    sr = None
    m = re_edition_key.match(key)
    old_style_key = '/b/' + m.group(1)
    key = '/books/' + m.group(1)
    e = ol.get(key, v=v)
    need_update = False
    if 'ocaid' not in e:
        need_update = True
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records'] and not need_update:
            return  # already recorded and ocaid present: nothing to do
        e['source_records'].append(new)
    else:
        # seed source_records from the MARC-source lookup table
        existing = get_mc(old_style_key)
        print 'get_mc(%s) == %s' % (old_style_key, existing)
        if existing is None:
            sr = []
        elif existing.startswith('ia:') or existing.startswith('amazon:'):
            sr = [existing]
        else:
            # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        print 'ocaid:', e['ocaid']
        if 'ocaid' in e and 'ia:' + e['ocaid'] not in sr:
            sr.append('ia:' + e['ocaid'])
        print 'sr:', sr
        print 'ocaid:', e['ocaid']
        if new not in sr:
            e['source_records'] = sr + [new]
        else:
            e['source_records'] = sr
    assert 'source_records' in e
    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    # strip trailing dots from subject headings
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        assert not any(a=='None' for a in e['authors'])
        print e['authors']
        authors = [ol.get(akey) for akey in e['authors']]
        # follow one level of author redirects
        authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a \
            for a in authors]
        for a in authors:
            if a['type'] == '/type/redirect':
                # only one level was followed above; report, don't fix
                print 'double redirect on:', e['key']
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print 'saving', key
    assert 'source_records' in e
    print ol.save(key, e, 'found a matching MARC record')
    add_cover_image(key, ia)
def add_source_records(key, ia, v=None):
    """Attach an 'ia:<id>' source record (and ocaid) to the edition at *key*,
    tidy the record, save it, and add a cover image.

    :param key: old-style edition key, e.g. '/b/OL123M'
    :param ia: Internet Archive identifier
    :param v: optional revision passed through to ol.get()
    """
    new = 'ia:' + ia
    sr = None
    m = re_edition_key.match(key)
    old_style_key = '/b/' + m.group(1)
    key = '/books/' + m.group(1)
    e = ol.get(key, v=v)
    need_update = 'ocaid' not in e
    if need_update:
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records'] and not need_update:
            return  # already recorded and ocaid present
        e['source_records'].append(new)
    else:
        # seed source_records from the MARC-source lookup table
        existing = get_mc(old_style_key)
        print('get_mc(%s) == %s' % (old_style_key, existing))
        if existing is None:
            sr = []
        elif existing.startswith(('ia:', 'amazon:')):
            sr = [existing]
        else:
            meta = re_meta_mrc.match(existing)
            sr = ['ia:' + meta.group(1) if meta else 'marc:' + existing]
        print('ocaid:', e['ocaid'])
        ocaid_rec = 'ia:' + e['ocaid']
        if 'ocaid' in e and ocaid_rec not in sr:
            sr.append(ocaid_rec)
        print('sr:', sr)
        print('ocaid:', e['ocaid'])
        e['source_records'] = sr if new in sr else sr + [new]
    assert 'source_records' in e
    # tidy the rest of the record before saving
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        e['subjects'] = [s[:-1] if has_dot(s) else s for s in e['subjects']]
    if 'authors' in e:
        assert not any(a == 'None' for a in e['authors'])
        print(e['authors'])
        authors = [ol.get(akey) for akey in e['authors']]
        # follow one level of author redirects
        authors = [
            ol.get(a['location']) if a['type'] == '/type/redirect' else a
            for a in authors
        ]
        for a in authors:
            if a['type'] == '/type/redirect':
                print('double redirect on:', e['key'])
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print('saving', key)
    assert 'source_records' in e
    print(ol.save(key, e, 'found a matching MARC record'))
    add_cover_image(key, ia)
def get_marc_source(w):
    """Return the set of MARC source locations for a work's editions.

    Editions that carry source_records contribute the text after the
    'marc:' prefix; otherwise the MARC-source table (get_mc) is consulted,
    skipping 'amazon:' and IA-style MARC entries.

    :param w: work dict with an 'editions' list
    :return: set of MARC location strings
    """
    found = set()
    for e in w['editions']:
        # BUG FIX: key is 'source_records' (plural), matching the sibling
        # implementations and the rest of the codebase; the singular
        # 'source_record' never matched, so this branch was always skipped.
        sr = e.get('source_records', [])
        if sr:
            found.update(i[5:] for i in sr if i.startswith('marc:'))
        else:
            mc = get_mc(e['key'])
            if mc and not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                found.add(mc)
    return found
def add_source_records(key, new, thing, data):
    """Add source record *new* to the edition at *key*, tidy the record and save.

    Seeds source_records from the MARC-source table (get_mc) when the
    edition has none, fixes the TOC, trailing dots in subjects and author
    redirects, then saves via save_with_retry.

    :param key: edition key
    :param new: source-record identifier to add (e.g. 'ia:...' / 'marc:...')
    :param thing: unused here -- kept for interface compatibility
    :param data: passed to author_from_data() when the author is the 'None'
        placeholder
    """
    sr = None
    e = get_with_retry(key)
    if 'source_records' in e:
        if new in e['source_records']:
            return  # already recorded
        e['source_records'].append(new)
    else:
        # NOTE(review): unlike sibling versions, this assumes get_mc()
        # does not return None here -- confirm with callers.
        existing = get_mc(key)
        amazon = 'amazon:'
        if existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        assert new not in sr
        e['source_records'] = sr + [new]
    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    # strip trailing dots from subject headings
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        if any(a == 'None' for a in e['authors']):
            # placeholder author: rebuild from the source-record data
            assert len(e['authors']) == 1
            new_author = author_from_data(new, data)
            e['authors'] = [new_author]
        else:
            print(e['authors'])
            authors = [get_with_retry(akey) for akey in e['authors']]
            while any(a['type'] == '/type/redirect' for a in authors):
                print('following redirects')
                authors = [
                    ol.get(a['location']) if a['type'] == '/type/redirect' else a
                    for a in authors
                ]
            e['authors'] = [{'key': a['key']} for a in authors]
            undelete_authors(authors)
    try:
        print(save_with_retry(key, e, 'found a matching MARC record'))
    except Exception:  # narrowed from bare except; still logs and re-raises
        print(e)
        raise
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
def get_marc_source(w):
    """Return the set of MARC source locations backing the editions of a work."""
    marc_prefix = 'marc:'
    found = set()
    for edition in w['editions']:
        records = edition.get('source_records', [])
        if records:
            for rec in records:
                if rec.startswith(marc_prefix):
                    found.add(rec[len(marc_prefix):])
        else:
            # fall back to the MARC-source table via the old-style /b/ key
            m = re_edition_key.match(edition['key'])
            if not m:
                print(edition['key'])
            mc = get_mc('/b/' + m.group(1))
            if mc and not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                found.add(mc)
    return found
def get_marc_source(w):
    """Return the set of MARC source locations for the editions of work *w*.

    Python 2 code (print statement). Editions with source_records
    contribute the text after the 'marc:' prefix; otherwise the old-style
    '/b/' key is looked up via get_mc(), skipping 'amazon:' and IA-style
    MARC entries.
    """
    found = set()
    for e in w['editions']:
        sr = e.get('source_records', [])
        if sr:
            found.update(i[5:] for i in sr if i.startswith('marc:'))
        else:
            m = re_edition_key.match(e['key'])
            if not m:
                # unexpected key shape -- logged; m.group() below will fail
                print e['key']
            mc = get_mc('/b/' + m.group(1))
            if mc and not mc.startswith('amazon:') and not re_ia_marc.match(mc):
                found.add(mc)
    return found
def add_source_records(key, new, thing, data):
    """Add source record *new* to the edition at *key*, tidy the record and save.

    Python 2 code (print statements). Seeds source_records from the
    MARC-source table (get_mc) when the edition has none, fixes the TOC,
    trailing dots in subjects and author redirects, then saves via
    save_with_retry.

    :param key: edition key
    :param new: source-record identifier to add (e.g. 'ia:...' / 'marc:...')
    :param thing: unused here -- kept for interface compatibility
    :param data: passed to author_from_data() for the 'None' placeholder author
    """
    sr = None
    e = get_with_retry(key)
    if 'source_records' in e:
        if new in e['source_records']:
            return  # already recorded
        e['source_records'].append(new)
    else:
        # NOTE(review): assumes get_mc() does not return None here
        existing = get_mc(key)
        amazon = 'amazon:'
        if existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        assert new not in sr
        e['source_records'] = sr + [new]
    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    # strip trailing dots from subject headings
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        if any(a=='None' for a in e['authors']):
            # placeholder author: rebuild from the source-record data
            assert len(e['authors']) == 1
            new_author = author_from_data(new, data)
            e['authors'] = [new_author]
        else:
            print e['authors']
            authors = [get_with_retry(akey) for akey in e['authors']]
            # keep resolving until no redirects remain
            while any(a['type'] == '/type/redirect' for a in authors):
                print 'following redirects'
                authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a for a in authors]
            e['authors'] = [{'key': a['key']} for a in authors]
            undelete_authors(authors)
    try:
        print save_with_retry(key, e, 'found a matching MARC record')
    except:
        # log the record that failed to save, then re-raise
        print e
        raise
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
def try_merge(e1, edition_key, thing):
    """Decide whether parsed MARC edition *e1* matches the record *thing*.

    Python 2 code. Compares against the record's source records when
    present, otherwise locates an IA identifier (from ocaid or the
    MARC-source table) and fetches its MARC data.

    :param e1: parsed/built MARC edition data
    :param edition_key: key of the existing edition
    :param thing: existing record dict
    :return: True/False merge decision (visible portion may be truncated)
    """
    thing_type = thing['type']['key']
    if thing_type != '/type/edition':
        print thing['key'], 'is', thing['type']['key']
    if thing_type == '/type/delete':
        # return False
        # NOTE(review): the early return is commented out, so deletes hit
        # this always-failing assert instead -- confirm this is intended.
        assert thing_type == '/type/edition'
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # IA-style path: the IA id is the leading path component
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            print error.code
            assert error.code in (404, 403)
    if not rec2:
        return True
def try_merge(e1, edition_key, thing):
    """Decide whether parsed MARC edition *e1* matches the record *thing*.

    Python 2 code; this variant compares types as Reference objects.
    Deleted records are rejected outright.

    :param e1: parsed/built MARC edition data
    :param edition_key: key of the existing edition
    :param thing: existing record dict (type is a Reference)
    :return: True/False merge decision (visible portion may be truncated)
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print thing['key'], 'is', str(thing['type'])
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # IA-style path: the IA id is the leading path component
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print thing
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print 'no MARCXML'
            pass
        except urllib2.HTTPError, error:
            print error.code
            assert error.code in (404, 403)
    if not rec2:
        return True
def try_merge(e1, edition_key, thing):
    """Decide whether parsed MARC edition *e1* matches the record *thing*.

    Python 2 code (double-quoted variant). Compares against the record's
    source records when present, otherwise locates an IA identifier and
    fetches its MARC data.

    :param e1: parsed/built MARC edition data
    :param edition_key: key of the existing edition
    :param thing: existing record dict
    :return: True/False merge decision (visible portion may be truncated)
    """
    thing_type = thing["type"]["key"]
    if thing_type != "/type/edition":
        print thing["key"], "is", thing["type"]["key"]
    if thing_type == "/type/delete":
        # return False
        # NOTE(review): early return commented out; deletes hit this
        # always-failing assert instead -- confirm intended.
        assert thing_type == "/type/edition"
    if "source_records" in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get("ocaid", None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        if mc.startswith("ia:"):
            ia = mc[3:]
        elif mc.endswith(".xml") or mc.endswith(".mrc"):
            # IA-style path: the IA id is the leading path component
            ia = mc[: mc.find("/")]
        if "_meta.mrc:" in mc:
            assert "ocaid" in thing
            ia = thing["ocaid"]
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print "no MARCXML"
            pass
        except urllib2.HTTPError, error:
            print error.code
            assert error.code in (404, 403)
    if not rec2:
        return True
def fix_edition(key, e, ol):
    """Normalize edition *e* in place (source_records, TOC, subjects, authors)
    and return it.

    :param key: edition key, used for the MARC-source table lookup
    :param e: edition dict, mutated in place
    :param ol: Open Library client, passed through to fix_authors()
    :return: the (mutated) edition dict
    """
    existing = get_mc(key)
    if 'source_records' not in e and existing:
        prefix = 'amazon:'
        if existing.startswith('ia:'):
            records = [existing]
        elif existing.startswith(prefix):
            records = amazon_source_records(existing[len(prefix):]) or [existing]
        else:
            print('existing:', existing)
            # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
            meta = re_meta_mrc.search(existing)
            records = ['ia:' + meta.group(1) if meta else 'marc:' + existing]
        e['source_records'] = records
    if 'ocaid' in e:
        ia_record = 'ia:' + e['ocaid']
        if 'source_records' not in e:
            e['source_records'] = [ia_record]
        elif ia_record not in e['source_records']:
            e['source_records'].append(ia_record)
    fix_toc(e)
    fix_subject(e)
    fix_authors(e, ol)
    return e
def get_books(akey, query, do_get_mc=True):
    """Yield a normalized book dict for each titled edition in *query*.

    Python 2 code (print statement, basestring). Builds title (with
    prefix, trailing-dot and parenthetical cleanup), normalized forms,
    language codes, TOC, and -- when available -- the work title.

    :param akey: author key (unused in the visible body)
    :param query: iterable of edition dicts
    :param do_get_mc: when False, skip the MARC-source table lookup
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            # log editions that are not dict-like, then fall through
            print e
        # if len(e.get('authors', [])) != 1:
        #     continue
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]  # drop trailing dot
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        n = mk_norm(title)
        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }
        lang = e.get('languages', [])
        if lang:
            # '/languages/eng' -> 'eng'
            book['lang'] = [re_lang_key.match(l['key']).group(1) for l in lang]
        if e.get('table_of_contents', None):
            if isinstance(e['table_of_contents'][0], basestring):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [
                        i['value'] for i in e['table_of_contents']
                    ]
        if 'subtitle' in e:
            book['subtitle'] = e['subtitle']
        if 'source_records' in e:
            book['source_records'] = e['source_records']
        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt:
            yield book
            continue
        if wt in bad_titles:
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
def try_merge(e1, edition_key, thing):
    """Decide whether parsed MARC edition *e1* matches the record *thing*.

    Fullest variant: tries the record's own source records first, then an
    IA MARCXML fetch, then an Amazon merge or raw MARC from the archive,
    and finally a field-by-field attempt_merge.

    :param e1: built MARC edition data
    :param edition_key: key of the existing edition
    :param thing: existing record dict (type is a Reference)
    :return: True if the records should merge, False otherwise
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # IA-style path: the IA id is the leading path component
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            # IA item exists but no usable MARC: treat as a match
            return True
    if not rec2:
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # dump both records before re-raising for diagnosis
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            # unexpected parse failure: log context, then re-raise
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
return False try: rec2 = get_ia(ia) except xml.parsers.expat.ExpatError: return False except NoMARCXML: print 'no MARCXML' pass except urllib2.HTTPError, error: print error.code assert error.code in (404, 403) if not rec2: return True if not rec2: if not mc: mc = get_mc(thing['key']) if not mc or mc == 'initial import': return False if mc.startswith('amazon:'): try: a = try_amazon(thing) except IndexError: print thing['key'] raise except AttributeError: return False if not a: return False try: return amazon.attempt_merge(a, e1, threshold, debug=False) except:
def get_books(akey, query, do_get_mc=True):
    """Yield a normalized book dict for each titled edition in *query*.

    :param akey: author key (unused in the visible body)
    :param query: iterable of edition dicts
    :param do_get_mc: when False, skip the MARC-source table lookup
    """
    for e in query:
        try:
            if not e.get('title', None):
                continue
        except:
            print(e)
        # if len(e.get('authors', [])) != 1:
        #     continue
        if e.get('title_prefix'):
            prefix = e['title_prefix']
            if not prefix.endswith(' '):
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        title = title.strip(' ')
        if has_dot(title):
            title = title[:-1]  # drop trailing dot
        paren_match = re_parens.match(title)
        if paren_match:
            title = paren_match.group(1)
        book = {
            'title': title,
            'norm_title': mk_norm(title),
            'key': e['key'],
        }
        langs = e.get('languages', [])
        if langs:
            # '/languages/eng' -> 'eng'
            book['lang'] = [re_lang_key.match(lang['key']).group(1) for lang in langs]
        toc = e.get('table_of_contents', None)
        if toc:
            if isinstance(toc[0], six.string_types):
                book['table_of_contents'] = toc
            else:
                assert isinstance(toc[0], dict)
                if toc[0].get('type', None) == '/type/text':
                    book['table_of_contents'] = [item['value'] for item in toc]
        for field in ('subtitle', 'source_records'):
            if field in e:
                book[field] = e[field]
        mc = get_mc(e['key']) if do_get_mc else None
        wt = get_work_title(e, mc)
        if not wt or wt in bad_titles:
            yield book
            continue
        book['work_title'] = wt
        book['norm_wt'] = mk_norm(wt)
        yield book
return False try: loc2, rec2 = get_ia(ia) except xml.parsers.expat.ExpatError: return False except NoMARCXML: print 'no MARCXML' pass except urllib2.HTTPError, error: print error.code assert error.code in (404, 403) if not rec2: return True if not rec2: if not mc: mc = get_mc(thing['key']) if not mc or mc == 'initial import': return False if mc.startswith('amazon:'): try: a = try_amazon(thing) except IndexError: print thing['key'] raise except AttributeError: return False if not a: return False try: return amazon.attempt_merge(a, e1, threshold, debug=False) except:
# Python 2 maintenance script: scan test_data edition keys and backfill
# source_records from the MARC-source table.
ol.login('EdwardBot', rc['EdwardBot'])

test_dir = '/home/edward/ol/test_data'

re_edition = re.compile('^/b/OL\d+M$')
re_meta_mrc = re.compile('^([^/]*)_meta.mrc:0:\d+$')

#out = open('source_records', 'w')

for f in os.listdir(test_dir):
    # filenames encode keys with '_' in place of '/'
    key = f.replace('_', '/')
    if not re_edition.match(key):
        continue
    print key
    # NOTE(review): this bare 'continue' short-circuits the loop -- all the
    # backfill code below is currently dead (debug run?). Confirm intent.
    continue
    mc = get_mc(key)
    print key, mc
    if not mc:
        continue
    e = ol.get(key)
    if e.get('source_records', []):
        continue  # already has source records
    if mc.startswith('ia:') or mc.startswith('amazon:'):
        sr = mc
    else:
        # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
        m = re_meta_mrc.match(mc)
        sr = 'marc:' + mc if not m else 'ia:' + m.group(1)
    e['source_records'] = [sr]
    print >> out, (key, sr)
    print ol.save(key, e, 'add source record')
#out.close()
def add_source_records(key, ia):
    """Add an 'ia:<ia>' source record to the edition at *key* and save with retries.

    Python 2 code (print statements). Ensures e['ocaid'] is set, seeds
    source_records from the MARC-source table when missing, fixes the TOC,
    subjects and author redirects, then saves with up to 50 attempts
    (sleeping between URLError failures).

    :param key: edition key
    :param ia: Internet Archive identifier
    """
    new = 'ia:' + ia
    sr = None
    e = ol.get(key)
    need_update = False
    if 'ocaid' not in e:
        need_update = True
        e['ocaid'] = ia
    if 'source_records' in e:
        if new in e['source_records'] and not need_update:
            return  # already recorded and ocaid present
        e['source_records'].append(new)
    else:
        # seed source_records from the MARC-source lookup table
        existing = get_mc(key)
        amazon = 'amazon:'
        if existing is None:
            sr = []
        elif existing.startswith('ia:'):
            sr = [existing]
        elif existing.startswith(amazon):
            sr = amazon_source_records(existing[len(amazon):]) or [existing]
        else:
            # '<id>_meta.mrc:0:NNN' entries map back to plain IA ids
            m = re_meta_mrc.match(existing)
            sr = ['marc:' + existing if not m else 'ia:' + m.group(1)]
        if 'ocaid' in e and 'ia:' + e['ocaid'] not in sr:
            sr.append('ia:' + e['ocaid'])
        if new not in sr:
            e['source_records'] = sr + [new]
    # fix other bits of the record as well
    new_toc = fix_toc(e)
    if new_toc:
        e['table_of_contents'] = new_toc
    # strip trailing dots from subject headings
    if e.get('subjects', None) and any(has_dot(s) for s in e['subjects']):
        subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
        e['subjects'] = subjects
    if 'authors' in e:
        assert not any(a=='None' for a in e['authors'])
        print e['authors']
        authors = [ol.get(akey) for akey in e['authors']]
        # follow one level of author redirects
        authors = [ol.get(a['location']) if a['type'] == '/type/redirect' else a \
            for a in authors]
        e['authors'] = [{'key': a['key']} for a in authors]
        undelete_authors(authors)
    print 'saving', key
    print marshal(e)
    for attempt in range(50):
        try:
            print ol.save(key, e, 'found a matching MARC record')
            break
        except KeyboardInterrupt:
            raise
        except URLError:
            # transient network error: retry, but give up after 50 tries
            if attempt == 49:
                raise
        except:
            # anything else: log the record and re-raise
            print e
            raise
        print 'attempt %d failed' % attempt
        sleep(30)
    if new_toc:
        new_edition = ol.get(key)
        # [{u'type': <ref: u'/type/toc_item'>}, ...]
        assert 'title' in new_edition['table_of_contents'][0]
return False try: loc2, rec2 = get_ia(ia) except xml.parsers.expat.ExpatError: return False except NoMARCXML: print "no MARCXML" pass except urllib2.HTTPError, error: print error.code assert error.code in (404, 403) if not rec2: return True if not rec2: if not mc: mc = get_mc(thing["key"]) if not mc or mc == "initial import": return False if mc.startswith("amazon:"): try: a = try_amazon(thing) except IndexError: print thing["key"] raise except AttributeError: return False if not a: return False try: return amazon.attempt_merge(a, e1, threshold, debug=False) except: