def get_src(key):
    e = withKey(key)
    if 'source_records' in e:
        return e['source_records']
    src = get_mc(key)
    if src:
        return [src]
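# Minimal usage sketch for get_src (not part of the original module; the
# edition key is hypothetical): source_records on the edition itself win,
# otherwise the single machine-comment value from get_mc() is wrapped in a list.
src = get_src('/b/OL1234567M')
if src:
    print 'source records:', src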
def load_part(archive_id, part, start_pos=0):
    # Read one MARC part file from an archive.org item, try to merge each
    # record into an existing edition, and yield (loc, data) for records
    # that could not be matched.
    print 'load_part:', archive_id, part
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        if not index_fields or 'title' not in index_fields:
            continue
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # no candidate editions to merge with; emit the record as unmatched
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                # resolve redirects before attempting the merge
                thing = None
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
def follow_redirects(key):
    keys = []
    thing = None
    while not thing or thing['type']['key'] == '/type/redirect':
        keys.append(key)
        thing = withKey(key)
        assert thing
        if thing['type']['key'] == '/type/redirect':
            print 'following redirect %s => %s' % (key, thing['location'])
            key = thing['location']
    return (keys, thing)
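# Usage sketch for follow_redirects (not part of the original module; the
# author key is hypothetical). The helper returns every key visited along the
# redirect chain plus the record the chain ends on, so a caller can repoint or
# merge all of the stale keys in one pass.
keys, final = follow_redirects('/a/OL2162284A')
print 'visited:', keys
print 'resolved to:', final['key']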
def try_amazon(thing):
    if 'isbn_10' not in thing:
        return None
    if 'authors' in thing:
        authors = []
        for a in thing['authors']:
            author_thing = withKey(a['key'])
            if 'name' in author_thing:
                authors.append(author_thing['name'])
    else:
        authors = []
    return amazon.build_amazon(thing, authors)
def load_part(archive_id, part, start_pos=0):
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing)
                    match = True
        if not match:
            yield loc, data
def switch_author(old, new):
    # Repoint every edition that lists `old` as an author to `new`, keeping
    # any other authors on the edition unchanged.
    q = {
        'authors': old['key'],
        'type': '/type/edition',
    }
    for key in get_things(q):
        edition = withKey(key)
        authors = []
        for author in edition['authors']:
            if author['key'] == old['key']:
                author_key = new['key']
            else:
                author_key = author['key']
            authors.append({'key': author_key})
        q = {
            'key': key,
            'authors': {'connect': 'update_list', 'value': authors},
        }
        print ol.write(q, comment='merge authors')
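# Shape of the write query switch_author sends for one edition (keys are
# hypothetical). 'connect': 'update_list' replaces the edition's whole authors
# list, so the old author key is swapped out while any co-authors are kept.
example_q = {
    'key': '/b/OL1234567M',
    'authors': {
        'connect': 'update_list',
        'value': [{'key': '/a/OL999A'}, {'key': '/a/OL42A'}],
    },
}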
import urllib2
import xml.parsers.expat

def try_merge(e1, edition_key, thing):
    thing_type = thing['type']['key']
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)
    ia = thing.get('ocaid', None)
    print edition_key
    mc = get_mc(edition_key)
    print mc
    if mc:
        # derive an archive.org identifier from the machine comment
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            assert 'ocaid' in thing
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            loc2, rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except urllib2.HTTPError, error:
            print error.code
            assert error.code in (404, 403)
    if not rec2:
        return True
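# Standalone sketch of the machine-comment handling in try_merge above.
# _extract_ia_id is not part of the original module and the formats noted in
# the comments are assumptions; the branch order mirrors the code above.
def _extract_ia_id(mc, ocaid=None):
    ia = None
    if mc.startswith('ia:'):                          # 'ia:<archive.org item id>'
        ia = mc[3:]
    elif mc.endswith('.xml') or mc.endswith('.mrc'):  # '<item id>/<marc file>' style paths
        ia = mc[:mc.find('/')]
    if '_meta.mrc:' in mc:                            # meta MARC pointer: trust the edition's ocaid
        ia = ocaid
    return ia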
# Fragment: tail of the author-merge routine. This branch keeps `author` and
# redirects `merge_with` to it; the else branch below does the reverse.
        do_normalize(author, new_name)
        switch_author(merge_with, author)
        # print "delete merge_with"
        make_redirect(merge_with, author)
    else:
        new_key = merge_with['key']
        print "copy fields from author to", new_key
        # new = copy_fields(merge_with, author, new_name)
        # update_author(new_key, new)
        do_normalize(merge_with, new_name)
        switch_author(author, merge_with)
        # print "delete author"
        make_redirect(author, merge_with)
    print

author = withKey(sys.argv[1])
merge_with = withKey(sys.argv[2])
print author
print merge_with

def norm(s):
    return normalize('NFC', s)

name1 = author['name']
name2 = merge_with['name']
print sys.argv
if len(sys.argv) > 3:
    name = norm(sys.argv[3].decode('utf8'))
else:
rc = read_rc()
infogami = Infogami()
infogami.login('edward', rc['edward'])

for line in open('works_for_staging'):
    work_key, title, authors, editions = eval(line)
    q = {
        'create': 'unless_exists',
        'type': {'key': '/type/work'},
        'key': work_key,
        'title': title,
        'authors': [{'key': '/a/' + a} for a in authors],
    }
    print q
    ret = infogami.write(q, comment='create work')
    print ret
    for edition_key in editions:
        edition = db_read.withKey(edition_key)
        if not edition:
            continue
        if 'works' in edition:
            continue
        q = {
            'key': edition_key,
            'works': {'connect': 'update_list', 'value': [{'key': work_key}]},
        }
        ret = infogami.write(q, comment='add work to edition')
        print edition
        print q
        print edition_key, ret
        assert ret['result']['updated']
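# Shape of the two writes the loop above issues, with hypothetical keys: a
# 'create': 'unless_exists' query for the work itself, then an 'update_list'
# query attaching the work to each edition that does not already have one.
example_work_q = {
    'create': 'unless_exists',
    'type': {'key': '/type/work'},
    'key': '/w/OL123W',            # hypothetical work key
    'title': 'An Example Title',
    'authors': [{'key': '/a/OL42A'}],
}
example_edition_q = {
    'key': '/b/OL1234567M',        # hypothetical edition key
    'works': {'connect': 'update_list', 'value': [{'key': '/w/OL123W'}]},
}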
import catalog.importer.db_read as db_read
import re
import sys
import codecs

db_read.set_staging(True)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

rc = read_rc()
infogami = Infogami()
infogami.login('edward', rc['edward'])

for line in open('works_for_staging'):
    work_key, title, authors, editions = eval(line)
    if not all(db_read.withKey('/a/' + a) for a in authors):
        continue
    work = db_read.withKey(work_key)
    print(work_key)
    if work:
        continue
    if not work:
        q = {
            'create': 'unless_exists',
            'type': {'key': '/type/work'},
            'key': work_key,
            'title': title,
            'authors': [{'key': '/a/' + a} for a in authors],
        }
        ret = infogami.write(q, comment='create work')
        print(ret)