def find_img(item_id):
    """Find the cover/title leaves for an Internet Archive item and post them.

    The edition is looked up first by ``source_records`` ('ia:' + item_id)
    and, failing that, by ``ocaid``.  Exactly one matching edition is
    required.  The function then locates the item's host and path, checks
    the scandata URL and the jp2 zip, extracts the cover and title leaves and
    hands everything to ``post``.

    Every failure path prints a short diagnostic and returns ``None``; hosts
    that raise ``socket.error`` are added to the module-level ``bad_hosts``.

    :param item_id: Internet Archive item identifier.
    :returns: None
    """
    e = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(e) != 1:
        print('no source_records:', e)
        # Fall back to matching the edition on its ocaid field.
        e = query({'type': '/type/edition', 'ocaid': item_id})
        if len(e) != 1:
            print('no ocaid:', e)
            return
    ol = e[0]['key']

    (ia_host, ia_path) = find_item(item_id)
    if not ia_host:
        print('no host', item_id, ia_host)
        return
    if ia_host in bad_hosts:
        # NOTE(review): this only logs; a known-bad host is still contacted
        # below.  Confirm whether an early return was intended.
        print('bad_host')

    # Both network probes share the same failure handling: remember the host
    # as bad and give up on this item.
    try:
        url = scandata_url(ia_host, ia_path, item_id)
        if not url:
            return
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return

    if status in (403, 404):
        print('jp2 not found:', (ol, item_id))
        return

    # Errors from find_title_leaf_et propagate to the caller.
    (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    if not cover or not title:
        return

    print(ol, item_id, ia_host, ia_path, cover, title)
    post(ol, item_id, ia_host, ia_path, cover, title)
def by_authors():
    """Walk author records and create works for authors that have none.

    Authors are read with ``query_iter`` starting at offset 215000.  Records
    are skipped (and logged) up to and including the author whose key is
    '/a/OL218496A'.  After that, an author is skipped if a work already
    references it; otherwise the candidate works from ``find_works`` that
    have more than two editions are passed to ``add_works``.
    """
    find_new_work_key()

    resume_after = '/a/OL218496A'
    skipping = True
    author_query = {'type': '/type/author', 'name': None, 'works': None}

    for author in query_iter(author_query, offset=215000):
        akey = author['key']

        if skipping:
            print('skipping:', akey, author['name'])
            # The resume marker itself is still skipped; processing starts
            # with the record after it.
            skipping = skipping and akey != resume_after
            continue

        if query({'type': '/type/work', 'authors': akey}):
            print((akey, repr(author['name']), 'has works'))
            continue

        candidates = find_works(akey)
        works = [w for w in candidates if len(w['editions']) > 2]
        if not works:
            continue

        print((akey, repr(author['name'])))
        add_works(akey, works)
        print()
def by_authors():
    """Create works for authors that do not have any yet.

    Iterates over author records (``query_iter`` with offset 215000), logging
    and skipping each one until the author '/a/OL218496A' has been passed.
    For every later author without an existing work, the results of
    ``find_works`` are filtered to those with more than two editions and, if
    any remain, given to ``add_works``.
    """
    find_new_work_key()

    still_skipping = True
    authors = query_iter(
        {'type': '/type/author', 'name': None, 'works': None},
        offset=215000,
    )
    for rec in authors:
        akey = rec['key']

        if still_skipping:
            print('skipping:', akey, rec['name'])
            if akey == '/a/OL218496A':
                # Last record to skip; the next iteration is processed.
                still_skipping = False
            continue

        work_query = {
            'type': '/type/work',
            'authors': akey,
        }
        if query(work_query):
            print((akey, repr(rec['name']), 'has works'))
            continue

        # Only keep candidate works backed by more than two editions.
        works = []
        for item in find_works(akey):
            if len(item['editions']) > 2:
                works.append(item)

        if works:
            print((akey, repr(rec['name'])))
            add_works(akey, works)
            print()
def hide_books(start):
    """Remove ``ocaid`` links from editions whose archive item is hidden.

    Selects text items from the ``metadata`` table that are noindex or dark
    and were updated after ``start``, skips items in the 'printdisabled'
    collection, deletes the ``ocaid`` field from every edition pointing at
    the remaining items, saves those editions and re-indexes the affected
    works.

    :param start: lower bound (exclusive) for the ``updated`` column; bound
        to ``$start`` in the SQL query.
    :returns: None
    """
    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata"
        " where (noindex is not null or curatestate='dark')"
        " and mediatype='texts' and scandate is not null"
        " and updated > $start order by updated",
        {'start': start},
    )
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(
                i.lower().strip() for i in row.collection.split(';')
            )
            if 'printdisabled' in collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                # The fetched edition no longer carries the link.
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)

    print('removing links from %d editions' % len(mend))
    print(ol.save_many(mend, 'remove link'))

    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
def find_img(item_id):
    """Locate an item's cover and title leaves and post them to its edition.

    Looks the edition up by ``source_records`` ('ia:' + item_id), then by
    ``ocaid`` if that does not yield exactly one result.  With a single
    edition in hand it resolves the item's host/path, fetches the scandata
    URL, probes the jp2 zip, extracts the cover and title leaves and calls
    ``post``.

    Each failure prints a diagnostic and returns ``None``.  A host that
    raises ``socket.error`` is recorded in the module-level ``bad_hosts``.

    :param item_id: Internet Archive item identifier.
    :returns: None
    """
    e = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(e) != 1:
        print('no source_records:', e)
        # Second attempt: match on the ocaid field instead.
        e = query({'type': '/type/edition', 'ocaid': item_id})
        if len(e) != 1:
            print('no ocaid:', e)
            return
    ol = e[0]['key']

    (ia_host, ia_path) = find_item(item_id)
    if not ia_host:
        print('no host', item_id, ia_host)
        return
    if ia_host in bad_hosts:
        # NOTE(review): logged only; processing continues against a host
        # already marked bad.  Confirm whether this should return.
        print('bad_host')

    # One handler covers both network calls; they failed identically before.
    try:
        url = scandata_url(ia_host, ia_path, item_id)
        if not url:
            return
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return

    if status in (403, 404):
        print('jp2 not found:', (ol, item_id))
        return

    # Exceptions raised here are deliberately left to propagate.
    (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    if not cover or not title:
        return

    # Printed as a single tuple, matching the original statement's output.
    print((ol, item_id, ia_host, ia_path, cover, title))
    post(ol, item_id, ia_host, ia_path, cover, title)
def hide_books(start):
    """Remove ``ocaid`` links from editions whose archive item is hidden.

    The lower time bound is read from the first line of ``hide_state_file``.
    Text items that are noindex or dark and were updated after that bound are
    selected; items whose collections intersect ``ignore_noindex`` are
    skipped.  For the rest, the ``ocaid`` field is deleted from every edition
    that references the item.  If anything changed, the editions are saved,
    affected works are re-indexed, and the newest ``updated`` value processed
    is written back to ``hide_state_file``.

    :param start: unused; kept for interface compatibility.  The effective
        bound comes from ``hide_state_file``.
    :returns: None
    """
    # rstrip rather than [:-1] so a state file without a trailing newline
    # does not lose the last character of its timestamp.
    with open(hide_state_file) as state:
        hide_start = state.readline().rstrip('\n')
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata"
        " where (noindex is not null or curatestate='dark')"
        " and mediatype='texts' and scandate is not null"
        " and updated > $start",
        {'start': hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(
                i.lower().strip() for i in row.collection.split(';')
            )
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        # The query has no ORDER BY, so keep the newest timestamp seen
        # instead of whichever row happened to come last.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated

    print('removing links from %d editions' % len(mend))
    if not mend:
        # Nothing to save; the state file is left untouched.
        return
    print(ol.save_many(mend, 'remove link'))

    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)

    # Close the handle explicitly so the new bound is flushed to disk.
    with open(hide_state_file, 'w') as state:
        print(last_updated, file=state)
def hide_books(start):
    """Remove ``ocaid`` links from editions whose archive item is hidden.

    Reads the lower time bound from the first line of ``hide_state_file``,
    selects noindex/dark text items updated after it, and skips items in the
    'printdisabled' or 'lendinglibrary' collections.  The ``ocaid`` field is
    deleted from every edition referencing a remaining item.  When at least
    one edition changed, the editions are saved, affected works are
    re-indexed and the newest ``updated`` value processed is written back to
    ``hide_state_file``.

    :param start: unused; kept for interface compatibility.  The effective
        bound comes from ``hide_state_file``.
    :returns: None
    """
    # rstrip instead of [:-1]: safe when the line has no trailing newline.
    with open(hide_state_file) as state:
        hide_start = state.readline().rstrip("\n")
    print("hide start:", hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata"
        " where (noindex is not null or curatestate='dark')"
        " and mediatype='texts' and scandate is not null"
        " and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(
                i.lower().strip() for i in row.collection.split(";")
            )
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print(eq["key"])
            e = ol.get(eq["key"])
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print(e["key"], repr(e.get("title", None)))
            del e["ocaid"]
            mend.append(e)
        # Rows arrive ordered by scandate_dt, not by updated, so the last
        # row is not necessarily the newest; track the maximum explicitly.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated

    print("removing links from %d editions" % len(mend))
    if not mend:
        # Nothing to save; the state file is left untouched.
        return
    print(ol.save_many(mend, "remove link"))

    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)

    # Use a context manager so the state file is closed and flushed.
    with open(hide_state_file, "w") as state:
        print(last_updated, file=state)
def hide_books(start):
    """Remove ``ocaid`` links from editions whose archive item is hidden.

    The lower time bound comes from the first line of ``hide_state_file``.
    Noindex/dark text items updated after it are selected, except those whose
    collections intersect ``ignore_noindex``.  Every edition referencing a
    selected item has its ``ocaid`` field deleted.  If any edition changed,
    the editions are saved, affected works are re-indexed and the newest
    ``updated`` value processed is stored in ``hide_state_file``.

    :param start: unused; kept for interface compatibility.  The effective
        bound comes from ``hide_state_file``.
    :returns: None
    """
    # rstrip instead of [:-1]: safe when the line has no trailing newline.
    with open(hide_state_file) as state:
        hide_start = state.readline().rstrip('\n')
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata"
        " where (noindex is not null or curatestate='dark')"
        " and mediatype='texts' and scandate is not null"
        " and updated > $start",
        {'start': hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = set(
                i.lower().strip() for i in row.collection.split(';')
            )
            if ignore_noindex & collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
        # No ORDER BY in the query: remember the newest timestamp seen, not
        # simply the last row returned.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated

    print('removing links from %d editions' % len(mend))
    if not mend:
        # Nothing to save; the state file is left untouched.
        return
    print(ol.save_many(mend, 'remove link'))

    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)

    # Use a context manager so the state file is closed and flushed.
    with open(hide_state_file, 'w') as state:
        print(last_updated, file=state)
) or 'census00reel' in ia or ia.startswith( 'populationsc1880'): print('ia:', ia) print('collections:', list(collections)) print('census not marked correctly') continue assert 'passportapplicat' not in ia and 'passengerlistsof' not in ia if 'passportapplicat' in ia: print('skip passport applications for now:', ia) continue if 'passengerlistsof' in ia: print('skip passenger lists', ia) continue print((repr(ia), row.updated)) when = str(row.updated) if query({'type': '/type/edition', 'ocaid': ia}): print('already loaded') continue if query({'type': '/type/edition', 'source_records': 'ia:' + ia}): print('already loaded') continue try: formats = marc_formats(ia, host, path) except urllib2.HTTPError as error: write_log(ia, when, "error: HTTPError: " + str(error)) continue use_binary = False bad_binary = None print(formats) rec = {}
if re_census.match(ia) or ia.startswith('populationschedu') or ia.startswith('michigancensus') or 'census00reel' in ia or ia.startswith('populationsc1880'): print 'ia:', ia print 'collections:', list(collections) print 'census not marked correctly' continue assert 'passportapplicat' not in ia and 'passengerlistsof' not in ia if 'passportapplicat' in ia: print 'skip passport applications for now:', ia continue if 'passengerlistsof' in ia: print 'skip passenger lists', ia continue print(repr(ia), row.updated) when = str(row.updated) if query({'type': '/type/edition', 'ocaid': ia}): print 'already loaded' continue if query({'type': '/type/edition', 'source_records': 'ia:' + ia}): print 'already loaded' continue try: formats = marc_formats(ia, host, path) except urllib2.HTTPError as error: write_log(ia, when, "error: HTTPError: " + str(error)) continue use_binary = False bad_binary = None print formats rec = {}
# Script: for each candidate in the possible_match2 file, add the Internet
# Archive item as a source record of the Open Library edition when the title
# and first publisher agree.
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

with open('/1/edward/imagepdf/possible_match2') as candidates:
    for line in candidates:
        # NOTE(review): eval() executes whatever the input file contains.
        # Only run this against a trusted file; consider ast.literal_eval if
        # the lines are plain literals.
        doc = eval(line)
        if 'publisher' not in doc:
            continue

        item_id = doc['item_id']
        # Skip items already recorded as a source of some edition.
        if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
            continue

        e = withKey(doc['ol'])
        if 'publishers' not in e:
            continue

        # Accept the title with or without the edition's prefix/subtitle.
        prefix = e.get('title_prefix', '')
        subtitle = e.get('subtitle', '')
        title_match = (
            doc['title'] == e['title']
            or doc['title'] == prefix + e['title']
            or doc['title'] == prefix + e['title'] + subtitle
            or doc['title'] == e['title'] + subtitle
        )
        if not title_match:
            continue
        if doc['publisher'] != e['publishers'][0]:
            continue

        print('match:', item_id, doc['ol'])
        add_source_records(doc['ol'], item_id)
or ia.startswith("populationsc1880") ): print "ia:", ia print "collections:", list(collections) print "census not marked correctly" continue assert "passportapplicat" not in ia and "passengerlistsof" not in ia if "passportapplicat" in ia: print "skip passport applications for now:", ia continue if "passengerlistsof" in ia: print "skip passenger lists", ia continue print ` ia `, row.updated when = str(row.updated) if query({"type": "/type/edition", "ocaid": ia}): print "already loaded" continue if query({"type": "/type/edition", "source_records": "ia:" + ia}): print "already loaded" continue try: formats = marc_formats(ia) except urllib2.HTTPError as error: write_log(ia, when, "error: HTTPError: " + str(error)) continue use_binary = False bad_binary = None print formats rec = {}
# Script: remove the ocaid link from every edition whose identifier is listed
# (one per line) in the local 'no_index' file, saving all changes in a single
# batch as ImportBot.
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
with open('no_index') as no_index:
    for line in no_index:
        # rstrip rather than [:-1]: a final line without a newline must not
        # lose its last character.
        ocaid = line.rstrip('\n')
        if not ocaid:
            # A blank line would otherwise query for an empty identifier.
            continue
        for e in query({'type': '/type/edition', 'title': None, 'ocaid': ocaid}):
            num += 1
            print(num, e['key'], repr(e['title']), ocaid)
            e2 = ol.get(e['key'])
            del e2['ocaid']
            to_fix.append(e2)

# Avoid an empty save request when nothing matched.
if to_fix:
    ol.save_many(to_fix, 'remove link')
# Script: for each identifier listed (one per line) in the local 'no_index'
# file, drop the ocaid field from the matching editions and save them all in
# one batch as ImportBot.
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
with open('no_index') as no_index:
    for line in no_index:
        # Strip only the newline; [:-1] would eat a real character on a last
        # line that has no trailing newline.
        ocaid = line.rstrip('\n')
        if not ocaid:
            # Ignore blank lines instead of querying for an empty ocaid.
            continue
        matches = query({
            'type': '/type/edition',
            'title': None,
            'ocaid': ocaid,
        })
        for e in matches:
            num += 1
            print(num, e['key'], repr(e['title']), ocaid)
            e2 = ol.get(e['key'])
            del e2['ocaid']
            to_fix.append(e2)

# Nothing to send when no edition matched.
if to_fix:
    ol.save_many(to_fix, 'remove link')
# Script: read candidate matches from the possible_match2 file and, where the
# candidate's title and publisher agree with the Open Library edition, record
# the Internet Archive item as one of the edition's source records.
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

with open('/1/edward/imagepdf/possible_match2') as match_file:
    for line in match_file:
        # NOTE(review): eval() runs arbitrary code from the input file; the
        # file must be trusted.  ast.literal_eval may be a safer choice if
        # every line is a plain literal.
        doc = eval(line)
        if 'publisher' not in doc:
            continue

        item_id = doc['item_id']
        already_linked = query(
            {'type': '/type/edition', 'source_records': 'ia:' + item_id}
        )
        if already_linked:
            continue

        e = withKey(doc['ol'])
        if 'publishers' not in e:
            continue

        # The candidate title may or may not include the edition's title
        # prefix and/or subtitle; any of the four combinations counts.
        prefix = e.get('title_prefix', '')
        subtitle = e.get('subtitle', '')
        title_match = (
            doc['title'] == e['title']
            or doc['title'] == prefix + e['title']
            or doc['title'] == prefix + e['title'] + subtitle
            or doc['title'] == e['title'] + subtitle
        )
        # Only the edition's first publisher is compared.
        if not title_match or doc['publisher'] != e['publishers'][0]:
            continue

        print('match:', item_id, doc['ol'])
        add_source_records(doc['ol'], item_id)