def try_merge(edition, ekey, thing):
    """Decide whether the Amazon-sourced *edition* matches existing *thing*.

    Returns True on a match (by source record or by attempt_merge score),
    False otherwise.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    # Prefer the ISBN-10 as the ASIN; fall back to the explicit 'asin' field.
    asin = edition.get('isbn_10', None) or edition['asin']
    author_names = [author['name'] for author in edition.get('authors', [])]
    amazon_rec = amazon_merge.build_amazon(edition, author_names)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'
    if 'source_records' in thing:
        # Already imported from this exact Amazon record, or compare against
        # the listed source records.
        if 'amazon:' + asin in thing['source_records']:
            return True
        return source_records_match(amazon_rec, thing)
    mc = get_mc(ekey)
    if mc == 'amazon:' + asin:
        return True
    if not mc:
        return False
    marc_rec = build_marc(fast_parse.read_edition(get_from_local(mc)))
    return amazon_merge.attempt_merge(amazon_rec, marc_rec, threshold, debug=False)
def test_read_oclc():
    """Both index_fields() and read_edition() should extract an OCLC number.

    DEPRECATED: data was 'oregon_27194315', triggers exception.
    """
    for f in ('scrapbooksofmoun03tupp_meta.mrc', ):
        # Fix: the original leaked the file handle via open(...).read();
        # a context manager closes it deterministically.
        # NOTE(review): mode left as text default to match original behavior —
        # confirm whether binary mode ('rb') is intended for .mrc data.
        with open(test_data + f) as fh:
            data = fh.read()
        i = index_fields(data, ['001', '003', '010', '020', '035', '245'])
        assert 'oclc' in i
        e = read_edition(data)
        assert 'oclc' in e
def marc_match(e1, loc):
    """Compare edition dict *e1* against the MARC record stored at *loc*.

    Returns the attempt_merge() result; re-raises TypeError from build_marc
    after printing the offending record for diagnosis.
    """
    rec = fast_parse.read_edition(get_from_local(loc))
    try:
        e2 = build_marc(rec)
    except TypeError:
        # Fix: print statement -> print() call, consistent with the sibling
        # marc_match variant; valid under both Python 2 and 3 for one value.
        print(rec)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
def marc_match(e1, loc):
    """Match edition dict *e1* against the archived MARC record at *loc*."""
    print('loc:', loc)
    marc_rec = fast_parse.read_edition(get_from_archive(loc))
    print('rec:', marc_rec)
    try:
        candidate = build_marc(marc_rec)
    except TypeError:
        # Dump the unconvertible record before propagating the error.
        print(marc_rec)
        raise
    return attempt_merge(e1, candidate, threshold, debug=False)
def get_marc_ia(ia): ia = ia.strip() # 'cyclopdiaofedu00kidd ' url = base + ia + "/" + ia + "_meta.mrc" data = urlopen_keep_trying(url).read() length = int(data[0:5]) if len(data) != length: data = data.decode('utf-8').encode('raw_unicode_escape') assert len(data) == length assert 'Internet Archive: Error' not in data print 'leader:', data[:24] return data return fast_parse.read_edition(data, accept_electronic = True)
def get_record(key, mc):
    """Fetch the MARC data at *mc* and build a merge-ready edition record.

    Returns False when the data cannot be parsed as a book edition;
    re-raises TypeError from build_marc after printing the parsed record.
    """
    raw = get_from_archive(mc)
    try:
        parsed = fast_parse.read_edition(raw)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        print(mc)
        print(key)
        return False
    try:
        return marc.build_marc(parsed)
    except TypeError:
        print(parsed)
        raise
def get_marc_ia(ia): ia = ia.strip() # 'cyclopdiaofedu00kidd ' url = base + ia + "/" + ia + "_meta.mrc" data = urlopen_keep_trying(url).read() length = int(data[0:5]) if len(data) != length: data = data.decode('utf-8').encode('raw_unicode_escape') assert len(data) == length assert 'Internet Archive: Error' not in data print 'leader:', data[:24] return data return fast_parse.read_edition(data, accept_electronic=True)
# NOTE(review): fragment from inside an import loop (has `continue`); the
# enclosing function/loop and the trailing `except NoMARCXML:` body are
# outside this view. Indentation reconstructed from collapsed source.
try:
    # NOTE(review): result bound to `marc_marc_data` but never read in this
    # excerpt — looks like it should be `marc_data = ...`; confirm upstream.
    marc_marc_data = marc_data.decode('utf-8').encode(
        'raw_unicode_escape')
except:
    bad_binary = "double UTF-8 decode error"
if not bad_binary and len(marc_data) != length:
    bad_binary = 'MARC length mismatch: %d != %d' % (
        len(marc_data), length)
if not bad_binary and 'Internet Archive: Error' in marc_data:
    bad_binary = 'Internet Archive: Error'
if not bad_binary:
    # Leader bytes 6-7 'am' = monograph language material.
    if str(marc_data)[6:8] != 'am':  # only want books
        print('not a book!')
        continue
    try:
        rec = fast_parse.read_edition(marc_data, accept_electronic=True)
    except:
        bad_binary = "MARC parse error"
# Bad binary and no XML alternative: report and skip the item.
if bad_binary and not formats['xml']:
    load_error_mail(ia, bad_binary, 'bad MARC binary, no MARC XML')
    continue
if not use_binary and formats['xml']:
    if bad_ia_xml(ia) and bad_binary:
        load_error_mail(ia, bad_binary, 'bad MARC binary, bad MARC XML')
        continue
    try:
        rec = get_ia(ia)
    except (KeyboardInterrupt, NameError):
        raise
    except NoMARCXML:
def try_merge(e1, edition_key, thing):
    # NOTE(review): indentation reconstructed from a collapsed one-line source.
    """Decide whether MARC edition dict *e1* matches the stored record *thing*.

    Returns True on a match (or when no comparison record can be fetched for
    an existing ocaid/mc pointer), False otherwise.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
        if thing_type == Reference('/type/delete'):
            return False
    assert thing_type == Reference('/type/edition')
    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload after repair
        return source_records_match(e1, thing)
    # No source records: fall back to the machine-comment (mc) pointer.
    ia = thing.get('ocaid', None)
    print(edition_key)
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        # NOTE(review): py2-style urllib2 alongside print() — confirm the
        # import situation in the full file.
        except urllib2.HTTPError as error:
            print(error.code)
            assert error.code in (404, 403)
        if not rec2:
            # IA record exists but could not be fetched: treat as a match.
            return True
    if not rec2:
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # Dump full context before propagating unexpected errors.
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
def load_part(archive_id, part, start_pos=0):
    """Scan one MARC part file and yield (loc, data) for unmatched records.

    Records already loaded or matched to an existing edition are consumed
    (source records added); everything else is yielded for import.
    Updates the module-level progress counters.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # Fix: use a context manager so the part file is closed when the
    # generator finishes (the original never closed it).
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)
            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue
            print(loc)
            edition_pool = pool.build(index_fields)
            if not edition_pool:
                # No candidate editions: hand the record straight to import.
                yield loc, data
                continue
            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)
            match = False
            seen = set()
            # Fix: dict.iteritems() does not exist on Python 3 (this function
            # already uses print() calls); .items() works on both.
            for k, v in edition_pool.items():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects until we land on a real edition.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break
            if not match:
                yield loc, data
# NOTE(review): fragment from the interior of a Python 2 try_merge variant
# (compare the complete py3 version elsewhere in this file); the enclosing
# function is outside this view. Indentation reconstructed.
if not a:
    return False
try:
    return amazon.attempt_merge(a, e1, threshold, debug=False)
except:
    # Dump full context before propagating unexpected errors.
    print a
    print e1
    print thing['key']
    raise
print 'mc:', mc
try:
    assert not mc.startswith('ia:')
    data = get_from_local(mc)
    if not data:
        return True
    rec2 = fast_parse.read_edition(data)
except (fast_parse.SoundRecording, IndexError, AssertionError):
    print mc
    print edition_key
    return False
except:
    print mc
    print edition_key
    raise
if not rec2:
    return False
try:
    e2 = build_marc(rec2)
except TypeError:
    print rec2
    raise
# NOTE(review): fragment — the enclosing function (which defines f, ia, url)
# starts outside this view, and `def files` is truncated at the trailing
# `except:`. Indentation reconstructed from collapsed source.
if not f:
    return None
data = f.read()
length = data[0:5]  # MARC leader bytes 0-4: record length (kept as str here)
loc = ia + "/" + ia + "_meta.mrc:0:" + length
if len(data) == 0:
    print 'zero length MARC for', url
    return None
if 'Internet Archive: Error' in data:
    print 'internet archive error for', url
    return None
if data.startswith('<html>\n<head>'):
    # An HTML page came back instead of MARC binary.
    print 'internet archive error for', url
    return None
try:
    return fast_parse.read_edition(data, accept_electronic = True)
except (ValueError, AssertionError, fast_parse.BadDictionary):
    print `data`
    raise

def files(archive_id):
    # Fetch and parse the item's _files.xml, retrying on malformed XML.
    url = base + archive_id + "/" + archive_id + "_files.xml"
    for i in range(5):
        try:
            tree = etree.parse(urlopen_keep_trying(url))
            break
        except xml.parsers.expat.ExpatError:
            sleep(2)
    # Final attempt after the retry loop.
    try:
        tree = etree.parse(urlopen_keep_trying(url))
    except:
def marc_match(a, loc):
    """Compare Amazon record *a* with the MARC record stored at *loc*."""
    assert loc
    marc_edition = build_marc(fast_parse.read_edition(get_from_local(loc)))
    return amazon_merge.attempt_merge(a, marc_edition, threshold, debug=False)
# NOTE(review): fragment — the enclosing function (which defines f, ia, url)
# starts outside this view, and `def files` is truncated mid-statement.
# Indentation reconstructed from collapsed source.
if not f:
    return None
data = f.read()
length = data[0:5]  # MARC leader bytes 0-4: record length (kept as str here)
loc = ia + "/" + ia + "_meta.mrc:0:" + length
if len(data) == 0:
    print 'zero length MARC for', url
    return None
if 'Internet Archive: Error' in data:
    print 'internet archive error for', url
    return None
if data.startswith('<html>\n<head>'):
    # An HTML page came back instead of MARC binary.
    print 'internet archive error for', url
    return None
try:
    return fast_parse.read_edition(data, accept_electronic=True)
except (ValueError, AssertionError, fast_parse.BadDictionary):
    print ` data `
    raise

def files(archive_id):
    # Fetch and parse the item's _files.xml, retrying on malformed XML.
    url = base + archive_id + "/" + archive_id + "_files.xml"
    for i in range(5):
        try:
            tree = etree.parse(urlopen_keep_trying(url))
            break
        except xml.parsers.expat.ExpatError:
            sleep(2)
    # Final attempt after the retry loop (truncated here).
    try:
        tree = etree.parse(urlopen_keep_trying(url))
def load_part(archive_id, part, start_pos=0):
    # NOTE(review): Python 2 variant; indentation reconstructed from a
    # collapsed one-line source.
    """Scan one MARC part file and yield (loc, data) for unmatched records.

    Records already loaded or matched to an existing edition are consumed
    (source records added); everything else is yielded for import.
    Updates the module-level progress counters.
    """
    print 'load_part:', archive_id, part
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # NOTE(review): file handle is never closed for the generator's lifetime.
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            progress(archive_id, rec_no, start_pos, pos)
        if is_loaded(loc):
            continue
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        except fast_parse.NotBook:
            continue
        if not index_fields or 'title' not in index_fields:
            continue
        print loc
        edition_pool = pool.build(index_fields)
        if not edition_pool:
            # No candidate editions: hand the record straight to import.
            yield loc, data
            continue
        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)
        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow redirects until we land on a real edition.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break
        if not match:
            yield loc, data
# NOTE(review): Python 2 fragment from inside an import loop (has `continue`);
# the enclosing function/loop and the trailing except body are outside this
# view. Indentation reconstructed from collapsed source.
bad_binary = "MARC doesn't start with number"
if not bad_binary and len(marc_data) != length:
    try:
        # NOTE(review): result bound to `marc_marc_data` but never read in
        # this excerpt — looks like it should be `marc_data = ...`; confirm.
        marc_marc_data = marc_data.decode('utf-8').encode('raw_unicode_escape')
    except:
        bad_binary = "double UTF-8 decode error"
if not bad_binary and len(marc_data) != length:
    bad_binary = 'MARC length mismatch: %d != %d' % (len(marc_data), length)
if not bad_binary and 'Internet Archive: Error' in marc_data:
    bad_binary = 'Internet Archive: Error'
if not bad_binary:
    # Leader bytes 6-7 'am' = monograph language material.
    if str(marc_data)[6:8] != 'am':  # only want books
        print 'not a book!'
        continue
    try:
        rec = fast_parse.read_edition(marc_data, accept_electronic = True)
    except:
        bad_binary = "MARC parse error"
# Bad binary and no XML alternative: report and skip the item.
if bad_binary and not formats['xml']:
    load_error_mail(ia, bad_binary, 'bad MARC binary, no MARC XML')
    continue
if not use_binary and formats['xml']:
    if bad_ia_xml(ia) and bad_binary:
        load_error_mail(ia, bad_binary, 'bad MARC binary, bad MARC XML')
        continue
    try:
        rec = get_ia(ia)
    except (KeyboardInterrupt, NameError):
        raise
    except NoMARCXML:
        write_log(ia, when, "no MARCXML")
# NOTE(review): fragment from the interior of a Python 2 try_merge variant
# (this one reads via get_from_archive rather than get_from_local); the
# enclosing function is outside this view. Indentation reconstructed.
if not a:
    return False
try:
    return amazon.attempt_merge(a, e1, threshold, debug=False)
except:
    # Dump full context before propagating unexpected errors.
    print a
    print e1
    print thing['key']
    raise
print 'mc:', mc
try:
    assert not mc.startswith('ia:')
    data = get_from_archive(mc)
    if not data:
        return True
    rec2 = fast_parse.read_edition(data)
except (fast_parse.SoundRecording, IndexError, AssertionError):
    print mc
    print edition_key
    return False
except:
    print mc
    print edition_key
    raise
if not rec2:
    return False
try:
    e2 = build_marc(rec2)
except TypeError:
    print rec2
    raise