def find_title(item_id):
    (ia_host, ia_path) = find_item(item_id)
    if not ia_host:
        return
    # first try the standalone scandata XML
    url = 'http://' + ia_host + ia_path + '/' + item_id + '_scandata.xml'
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except:
        pass
    if not scandata or '<book>' not in scandata[:100]:
        # fall back to scandata.xml inside scandata.zip
        url = 'http://' + ia_host + '/zipview.php?zip=' + ia_path + '/scandata.zip&file=scandata.xml'
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return
    # Google scans store page images as TIFFs, everything else as JP2
    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        return  # page-image zip is missing or unreadable
    (cover, title) = parse_scandata_xml(scandata)
    return title
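# A minimal usage sketch for find_title(); the identifier is a made-up
# example, not a real archive.org item. find_title() returns the title
# string parsed from the item's scandata, or None when the scandata or
# the page-image zip cannot be reached.
title = find_title('exampleitem00exam')
if title:
    print('title from scandata:', title)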
def find_img(item_id):
    # find the Open Library edition for this IA item, then locate and
    # post its cover image
    e = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(e) != 1:
        print('no source_records:', e)
        e = query({'type': '/type/edition', 'ocaid': item_id})
        if len(e) != 1:
            print('no ocaid:', e)
            return
    ol = e[0]['key']
    (ia_host, ia_path) = find_item(item_id)
    if not ia_host:
        print('no host', item_id, ia_host)
        return
    if ia_host in bad_hosts:
        print('bad_host')
    try:
        url = scandata_url(ia_host, ia_path, item_id)
        if not url:
            return
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    try:
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        print('jp2 not found:', (ol, item_id))
        return
    try:
        (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    except (KeyboardInterrupt, SystemExit, NameError):
        raise
    if not cover or not title:
        return
    print(ol, item_id, ia_host, ia_path, cover, title)
    post(ol, item_id, ia_host, ia_path, cover, title)
def get_from_archive(locator):
    # locator format: "marc:<item>/<file>:<offset>:<length>"
    if locator.startswith('marc:'):
        locator = locator[5:]
    filename, offset, length = locator.split(':')
    offset = int(offset)
    length = int(length)
    ia, rest = filename.split('/', 1)
    for attempt in range(5):
        try:
            host, path = find_item(ia)
            break
        except socket.timeout:
            if attempt == 4:
                raise
            print('retry, attempt', attempt)
    r0, r1 = offset, offset + length - 1
    url = 'http://' + host + path + '/' + rest
    assert 0 < length < 100000
    # request only the byte range holding the record we want
    ureq = urllib2.Request(url, None, {'Range': 'bytes=%d-%d' % (r0, r1)})
    f = None
    for i in range(3):
        try:
            f = urllib2.urlopen(ureq)
        except urllib2.HTTPError as error:
            if error.code == 416:  # requested range not satisfiable
                raise
            elif error.code == 404:
                print("404 for '%s'" % url)
                raise
            else:
                print(url)
                print('error:', error.code, error.msg)
        except urllib2.URLError:
            pass
        if f:
            # success: return just the requested bytes
            return f.read()
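# A minimal usage sketch for get_from_archive(); the item name, filename,
# offset and length are hypothetical. The locator (after an optional
# "marc:" prefix) has the form "<item>/<file>:<offset>:<length>", and the
# record is fetched with an HTTP Range request so that only the bytes of
# that one MARC record are transferred.
rec = get_from_archive('marc:exampleitem00/exampleitem00_meta.mrc:0:742')
if rec:
    print('fetched a %d byte MARC record' % len(rec))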
def head(host, path, ia):
    # HEAD request: does this item have a MARC XML file?
    conn = httplib.HTTPConnection(host)
    conn.request('HEAD', path + '/' + ia + '_marc.xml')
    return conn.getresponse()

bad_machine = set()
out = open('has_marc', 'w')
no = open('no_marc', 'w')
later = open('later', 'w')
for line in open('to_load'):
    ia = line[:-1]
    if line.startswith('('):
        print(ia, file=no)
        continue
    (host, path) = find_item(ia)
    if not host:
        print(ia, file=no)
        continue
    if host in bad_machine:
        print(ia, file=later)
        continue
    try:
        r1 = head(host, path, ia)
    except socket.error:
        print('socket error')
        print('http://' + host + path + '/' + ia + '_marc.xml')
        print('try later')
        bad_machine.add(host)  # remember the unreachable host; membership is tested on host above
        print(ia, file=later)
def run_find_item():
    global find_item_book
    while True:
        (num, ia) = item_queue.get()
        find_item_book = ia
        t0_find_item = time()
        try:
            (host, path) = find_item(ia)
        except FindItemError:
            item_queue.task_done()
            done(ia, False)
            continue
        t1_find_item = time() - t0_find_item
        # keep a rolling window of the last 100 locator lookup times
        if len(locator_times) == 100:
            locator_times.pop(0)
        locator_times.append((t1_find_item, host))
        body = None
        if False:  # disabled: pull the body text from the solr 'inside' index
            url = 'http://' + solr_src_host + '/solr/inside/select?wt=json&rows=10&q=ia:' + ia
            response = json.load(urllib2.urlopen(url))['response']
            if response['numFound']:
                doc = response['docs'][0]
                for doc_lang in ['eng', 'fre', 'deu', 'spa', 'other']:
                    if doc.get('body_' + doc_lang):
                        body = doc['body_' + doc_lang]
                        break
                assert body
        filename = '/1/abbyy_text/data/' + ia[:2] + '/' + ia
        if os.path.exists(filename):
            body = codecs.open(filename, 'r', 'utf-8').read()
        if body:
            try:
                meta_xml = urlread_keep_trying('http://%s%s/%s_meta.xml' % (host, path, ia))
            except urllib2.HTTPError as error:
                if error.code != 403:
                    raise
                print('403 on meta XML for:', ia)
                item_queue.task_done()  # skip
                done(ia, False)
                continue
            try:
                root = fromstring(meta_xml)
            except:
                print('identifier:', ia)
            collection = [e.text for e in root.findall('collection')]
            # noindex items are only indexed when they are in the
            # printdisabled or lendinglibrary collections
            elem_noindex = root.find('noindex')
            if elem_noindex is not None and elem_noindex.text == 'true' and (
                    'printdisabled' not in collection and 'lendinglibrary' not in collection):
                item_queue.task_done()  # skip
                done(ia, False)
                continue
            lang_elem = root.find('language')
            if lang_elem is None:
                print(meta_xml)
                lang = 'other'
            else:
                lang = tidy_lang(lang_elem.text) or 'other'
            solr_queue.put((ia, body, lang, page_counts[ia], collection))
        else:
            # no local text: hand the item to a per-host reader thread
            host_queues[host].put((num, ia, path))
            if host not in host_threads:
                host_threads[host] = spawn_link_exception(read_text_from_node, host)
        item_queue.task_done()
while True:
    if args.item_id:
        db_iter = db.query(
            "select identifier, contributor, updated, noindex, collection, format"
            " from metadata"
            " where scanner is not null and mediatype='texts'"
            " and (not curatestate='dark' or curatestate is null)"
            " and scandate is not null and format is not null"
            " and identifier=$item_id",
            {'item_id': args.item_id})
    else:
        print('start:', start)
        db_iter = db.query(
            "select identifier, contributor, updated, noindex, collection, format"
            " from metadata"
            " where scanner is not null and mediatype='texts'"
            " and (not curatestate='dark' or curatestate is null)"
            " and scandate is not null and format is not null"
            " and updated between $start and date_add($start, interval 2 day)"
            " order by updated",
            {'start': start})
    t_start = time()
    for row in db_iter:
        # alert if more than 10 bad MARC records have piled up, or if any
        # have been waiting more than four hours
        if len(bad_marc) > 10 or (bad_marc and time() - bad_marc_last_sent > 4 * 60 * 60):
            bad_marc_alert(bad_marc)
            bad_marc = []
            bad_marc_last_sent = time()
        ia = row.identifier
        host, path = find_item(ia)
        if 'pdf' not in row.format.lower():
            # scan center and billing staff often use a format like "%pdf%"
            # as a proxy for the item having been derived
            continue
        if row.contributor == 'Allen County Public Library Genealogy Center':
            print('skipping Allen County Public Library Genealogy Center')
            continue
        if row.collection:
            collections = set(i.lower().strip() for i in row.collection.split(';'))
        else:
            collections = set()
        if row.noindex:
            if not row.collection:
                continue
            collections = set(i.lower().strip() for i in row.collection.split(';'))
            if not ignore_noindex & collections:
                continue
def load_book(ia, collections, boxid, scanned=True):
    if ia.startswith('annualreportspri'):
        print('skipping:', ia)
        return
    if 'shenzhentest' in collections:
        return
    if any('census' in c for c in collections):
        print('skipping census')
        return
    if (re_census.match(ia) or ia.startswith('populationschedu')
            or ia.startswith('michigancensus') or 'census00reel' in ia
            or ia.startswith('populationsc1880')):
        print('ia:', ia)
        print('collections:', list(collections))
        print('census not marked correctly')
        return
    try:
        host, path = find_item(ia)
    except socket.timeout:
        print('socket timeout:', ia)
        return
    except FindItemError:
        print('find item error:', ia)
        return  # host and path are unset if the item was not found
    bad_binary = None
    try:
        formats = marc_formats(ia, host, path)
    except urllib2.HTTPError as error:
        return
    if formats['bin']:  # binary MARC
        marc_data = get_marc_ia_data(ia, host, path)
        assert isinstance(marc_data, str)
        marc_error = check_marc_data(marc_data)
        if marc_error == 'double encode':
            marc_data = marc_data.decode('utf-8').encode('raw_unicode_escape')
            marc_error = None
        if marc_error:
            return
        contenttype = 'application/marc'
    elif formats['xml']:  # MARC XML
        return  # waiting for Raj to fix the MARC XML loader
        marc_data = urllib2.urlopen('http://' + host + path + '/' + ia + '_meta.xml').read()
        contenttype = 'text/xml'
    else:
        return
    subjects = []
    if scanned:
        if 'lendinglibrary' in collections:
            subjects += ['Protected DAISY', 'Lending library']
        elif 'inlibrary' in collections:
            subjects += ['Protected DAISY', 'In library']
        elif 'printdisabled' in collections:
            subjects.append('Protected DAISY')
    if not boxid:
        boxid = None
    try:
        post_to_import_api(ia, marc_data, contenttype, subjects, boxid, scanned=scanned)
    except BadImport:
        print(ia, file=bad)
        bad.flush()
    except BadLang:
        print(ia, file=bad_lang)
        bad_lang.flush()
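# A minimal usage sketch for load_book(); the identifier, collections and
# boxid are invented for illustration. Collections arrive lower-cased, as
# produced by the metadata loop above; an empty boxid is normalized to
# None before the record is posted to the import API.
load_book('exampleitem00exam', {'americana', 'printdisabled'}, '', scanned=True)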