def import_file(filename):
    """Import editions from an Amazon dump file, merging each record
    against existing editions found via the pool index.

    For every (asin, edition) pair in the dump, build index fields,
    query the pool for candidate edition keys, and attempt a merge with
    each candidate (following redirects first).

    :param filename: path to a dump file readable by read_amazon_file
    """
    # Fix: use a context manager so the dump file is closed even if a
    # merge raises part-way through (original leaked the handle).
    with open(filename) as f:
        for asin, edition in read_amazon_file(f):
            index_fields = build_index_fields(asin, edition)
            found = pool.build(index_fields)
            if 'title' not in found:
                # Unexpected pool result shape; dump context for debugging.
                print(found)
                print(asin)
                print(edition)
                print(index_fields)
                print()
            if not found['title'] and not found['isbn']:
                # print 'no pool load book:', asin
                # TODO load book
                continue
            # print asin, found
            # print(repr(edition['title'], edition.get('subtitle', None), edition.get('flags', None), edition.get('binding', None)))
            if 'sims' in edition:
                # Similar-items data is Amazon-specific; drop before merging.
                del edition['sims']
            # print edition
            # print
            seen = set()
            for k, v in found.items():
                for ekey in v:
                    if ekey in seen:
                        continue
                    # Resolve redirect chains; mark every key on the chain
                    # as seen so we never process the same edition twice.
                    keys, thing = follow_redirects(ekey)
                    seen.update(keys)
                    assert thing
                    try:
                        # Fix: result was bound to an unused local `m`.
                        try_merge(edition, ekey, thing)
                    except Exception:
                        # Narrowed from a bare `except:`; print the failing
                        # record's context, then re-raise unchanged.
                        print(asin)
                        print(edition)
                        print(ekey)
                        print(found)
                        raise
if format.startswith('[graphic') or format.startswith( '[cartograph'): continue print(rec) if 'full_title' not in rec: print("full_title missing") write_log(ia, when, "error: full_title missing") continue index_fields = make_index_fields(rec) if not index_fields: print("no index_fields") write_log(ia, when, "error: no index fields") continue edition_pool = pool.build(index_fields) if not edition_pool: load(ia, use_binary=use_binary) write_log(ia, when, "loaded") continue e1 = build_marc(rec) match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: if edition_key in seen: continue thing = None
def load_part(archive_id, part, start_pos=0):
    # Stream one MARC part file and merge its records against existing
    # editions.  Yields (loc, data) for each record that matches nothing,
    # so the caller can load it as a new edition.  Records that do merge
    # get their source record attached via add_source_records().
    # NOTE: Python 2 code (print statements, dict.iteritems) — the
    # Python 3 port of this function also exists in this file.
    print 'load_part:', archive_id, part
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        # Resume a previously interrupted run at the given byte offset.
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            # Periodic progress report every `chunk` records.
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue  # already imported on a previous run
        # Control/identifier MARC tags used to build pool index fields.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            # Dump the title field (245) lines of the bad record, re-raise.
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        except fast_parse.NotBook:
            continue  # non-book records are skipped entirely
        if not index_fields or 'title' not in index_fields:
            continue
        print loc
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # No candidate editions exist: hand back for loading as new.
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow /type/redirect chains until we reach the real
                # edition; every key on the chain is marked seen.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    # Merged into an existing edition: record provenance
                    # and stop scanning candidates.
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
filename = '/2/edward/20century/scans/' + item[:2] + '/' + item + '/' + item + '_marc.xml' rec = read_xml.read_edition(open(filename)) if 'full_title' not in rec: print "full_title missing", item continue if 'physical_format' in rec: format = rec['physical_format'].lower() if format.startswith('[graphic') or format.startswith('[cartograph'): print item, format index_fields = make_index_fields(rec) if not index_fields: print "no index_fields" continue #print index_fields edition_pool = pool.build(index_fields) if not edition_pool or not any(v for v in edition_pool.itervalues()): print >> new_book, rec continue print item, edition_pool e1 = build_marc(rec) print e1 match = False seen = set() for k, v in edition_pool.iteritems(): for edition_key in v: # edition_key = '/books/' + re_edition_key.match(edition_key).match(1) if edition_key in seen: continue
def load_part(archive_id, part, start_pos=0):
    """Stream one MARC part file and merge its records against existing
    editions.

    Yields ``(loc, data)`` for each record that matches no existing
    edition, so the caller can load it as a new edition.  Records that
    do merge get their source record attached via add_source_records().

    :param archive_id: archive item identifier (directory of part files)
    :param part: name of the MARC part file within the archive item
    :param start_pos: byte offset to resume from within the part file
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # Fix: context manager closes the file even if parsing or merging
    # raises part-way through (original leaked the handle).
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            # Resume a previously interrupted run at the given byte offset.
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                # Periodic progress report every `chunk` records.
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue  # already imported on a previous run
            # Control/identifier MARC tags used to build pool index fields.
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                # Dump the title field (245) lines of the bad record, re-raise.
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue  # non-book records are skipped entirely
            if not index_fields or 'title' not in index_fields:
                continue
            print(loc)
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                # No candidate editions exist: hand back for loading as new.
                yield loc, data
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            # Fix: dict.iteritems() was removed in Python 3 and would raise
            # AttributeError here; the rest of this function already uses
            # Python 3 print() calls, so .items() is the correct form.
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow /type/redirect chains until we reach the real
                    # edition; every key on the chain is marked seen.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        # Merged into an existing edition: record provenance
                        # and stop scanning candidates.
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break
            if not match:
                yield loc, data