import os
import urllib.error
from time import time

# Helpers such as amazon_merge, fast_parse, pool, marc, build_marc, withKey,
# get_ia, get_mc, get_from_local, get_from_archive, get_things, read_marc_file,
# add_source_records, source_records_match, is_loaded, progress, and
# make_index_fields, along with the rc / cur / chunk / threshold / marc_path /
# rec_no / t_prev / t0 / total globals, come from the surrounding Open Library
# catalog code and are assumed to be in scope.
def try_merge(edition, ekey, thing):
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    if 'authors' in edition:
        authors = [i['name'] for i in edition['authors']]
    else:
        authors = []
    a = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'
    if 'source_records' in thing:
        if 'amazon:' + asin in thing['source_records']:
            return True
        return source_records_match(a, thing)
    mc = get_mc(ekey)
    if mc == 'amazon:' + asin:
        return True
    if not mc:
        return False
    data = get_from_local(mc)
    e1 = build_marc(fast_parse.read_edition(data))
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
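# Usage sketch (values are made up; `thing` would normally come from withKey()
# and `edition` from a parsed Amazon record):
#
#   thing = withKey('/b/OL1M')
#   edition = {'asin': '0140328726', 'authors': [{'name': 'Roald Dahl'}]}
#   if try_merge(edition, '/b/OL1M', thing):
#       print('Amazon record matches existing edition')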
def load_part(archive_id, part, start_pos=0):
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # follow redirects until the target edition is reached
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' % (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
def ia_match(a, ia):
    try:
        loc, rec = get_ia(ia)
    except urllib.error.HTTPError:
        return False
    if rec is None or 'full_title' not in rec:
        return False
    try:
        e1 = build_marc(rec)
    except TypeError:
        print(rec)
        raise
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
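# Usage sketch: `a` is an Amazon record built with amazon_merge.build_amazon()
# and the second argument is an archive.org identifier ('nybc200715' is an
# example identifier):
#
#   if ia_match(a, 'nybc200715'):
#       print('Amazon record matches this archive.org scan')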
def get_record(key, mc):
    data = get_from_archive(mc)
    try:
        rec = fast_parse.read_edition(data)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        print(mc)
        print(key)
        return False
    try:
        return marc.build_marc(rec)
    except TypeError:
        print(rec)
        raise
def get_marc(loc):
    # `loc` packs a MARC file name with a byte offset and record length
    try:
        filename, p, l = loc.split(':')
    except ValueError:
        return None
    if not os.path.exists(marc_path + filename):
        return None
    with open(marc_path + filename) as f:
        f.seek(int(p))
        buf = f.read(int(l))
    rec = fast_parse.read_edition(buf)
    if rec:
        return build_marc(rec)
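# Example of the "filename:offset:length" pointer format (the file name and
# numbers here are made up):
#
#   rec = get_marc('marc_records_scriblio_net/part01.dat:4687:735')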
# Variant of load_part(): skips (rather than yields) records with an empty
# edition pool and does not follow redirects.
def load_part(archive_id, part, start_pos=0):
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        if not index_fields or 'title' not in index_fields:
            continue
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.items():
            for edition_key in v:
                if edition_key in seen:
                    continue
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing)
                    match = True
        if not match:
            yield loc, data
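# Both load_part() variants are generators that yield records which could not
# be matched to an existing edition; a driver might consume one like this
# (archive id, part name, and the consumer are hypothetical):
#
#   for loc, data in load_part('marc_records_scriblio_net', 'part01.dat'):
#       handle_unmatched(loc, data)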
def marc_match(a, loc):
    assert loc
    rec = fast_parse.read_edition(get_from_local(loc))
    e1 = build_marc(rec)
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
def load():
    global rec_no, t_prev
    skipping = False
    cur.execute(
        "select identifier from metadata"
        " where scanner is not null and scanner != 'google'"
        " and noindex is null and mediatype='texts'"
        " and curatestate='approved'"
    )
    for (ia,) in cur.fetchall():
        rec_no += 1
        if rec_no % chunk == 0:
            # report loading speed and estimated time remaining
            t = time() - t_prev
            t_prev = time()
            t1 = time() - t0
            rec_per_sec = chunk / t
            rec_per_sec_total = rec_no / t1
            remaining = total - rec_no
            sec = remaining / rec_per_sec_total
            print("%8d current: %9.3f overall: %9.3f" % (rec_no, rec_per_sec, rec_per_sec_total), end=' ')
            hours = sec / 3600
            print("%6.3f hours" % hours)
        print(ia)
        if get_things({'type': '/type/edition', 'ocaid': ia}):
            print('already loaded')
            continue
        try:
            loc, rec = get_ia(ia)
        except (KeyboardInterrupt, NameError):
            raise
        except urllib.error.HTTPError:
            continue
        if loc is None:
            continue
        print(loc, rec)
        if not loc.endswith('.xml'):
            print("not XML")
            continue
        if 'full_title' not in rec:
            print("full_title missing")
            continue
        index_fields = make_index_fields(rec)
        if not index_fields:
            print("no index_fields")
            continue
        edition_pool = pool.build(index_fields)
        print(edition_pool)
        if not edition_pool:
            yield loc, ia
            continue
        e1 = build_marc(rec)
        match = False
        for k, v in edition_pool.items():
            if k == 'title' and len(v) > 50:
                continue
            for edition_key in v:
                # unescape slashes in the key before merging
                if try_merge(e1, edition_key.replace('\\/', '/')):
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, ia
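# load() is likewise a generator; each yielded pair names the MARC location
# and the archive.org identifier of a scan with no matching edition
# (the consumer below is hypothetical):
#
#   for loc, ia in load():
#       log_unmatched(loc, ia)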
# Fragment of another loader's per-record loop; the enclosing function is not
# part of this excerpt, and the two-argument load(loc, ia) it calls is not the
# zero-argument generator defined above.
    continue
if 'full_title' not in rec:
    print("full_title missing")
    continue
index_fields = make_index_fields(rec)
if not index_fields:
    print("no index_fields")
    continue
edition_pool = pool.build(index_fields)
if not edition_pool:
    load(loc, ia)
    continue
e1 = build_marc(rec)
match = False
seen = set()
for k, v in edition_pool.items():
    for edition_key in v:
        if edition_key in seen:
            continue
        thing = None
        # follow redirects until the target edition is reached
        while not thing or thing['type']['key'] == '/type/redirect':
            seen.add(edition_key)
            thing = withKey(edition_key)
            assert thing
            if thing['type']['key'] == '/type/redirect':
                print('following redirect %s => %s' % (edition_key, thing['location']))
                edition_key = thing['location']