def find_img(item_id):
    """Find the cover and title leaves of an item and hand them to ``post``.

    The edition is looked up first by ``source_records`` (``'ia:' + item_id``)
    and, if that does not yield exactly one record, by ``ocaid``.  The item's
    host and path come from ``find_item``; the scan data URL comes from
    ``scandata_url`` and the jp2 zip status from ``jp2_zip_test``.

    The function always returns ``None``.  It gives up early when no unique
    edition is found, when no host is known, when a socket error occurs
    (the host is then added to ``bad_hosts``), when no scan data URL is
    returned, when the jp2 zip status is 403/404, or when either the cover
    or the title leaf is missing.

    :param item_id: identifier of the item to process.
    """
    editions = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(editions) != 1:
        print('no source_records:', editions)
        editions = query({'type': '/type/edition', 'ocaid': item_id})
        if len(editions) != 1:
            print('no ocaid:', editions)
            return
    edition_key = editions[0]['key']
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        print('no host', item_id, ia_host)
        return
    if ia_host in bad_hosts:
        # NOTE(review): a known-bad host is only reported here and processing
        # continues; confirm whether this branch was meant to return.
        print('bad_host')

    try:
        url = scandata_url(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    if not url:
        return

    try:
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        print('jp2 not found:', (edition_key, item_id))
        return

    # Any exception raised here propagates to the caller unchanged.
    (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    if not cover or not title:
        return

    # Printed as a single tuple, matching the output of the original code.
    print((edition_key, item_id, ia_host, ia_path, cover, title))
    post(edition_key, item_id, ia_host, ia_path, cover, title)
Пример #2
0
def by_authors():
    """Call ``add_works`` for authors that have no work records yet.

    Iterates over author records starting at a fixed offset and ignores every
    author up to and including ``/a/OL218496A``.  For each later author that
    has no ``/type/work`` record, the results of ``find_works`` with more
    than two editions are passed to ``add_works``.
    """
    find_new_work_key()

    # Hard-coded resume point: nothing is processed until this key was seen.
    resume_after = '/a/OL218496A'
    waiting = True
    author_query = {'type': '/type/author', 'name': None, 'works': None}
    for author in query_iter(author_query, offset=215000):
        akey = author['key']
        if waiting:
            print('skipping:', akey, author['name'])
            waiting = akey != resume_after
            continue

        if query({'type': '/type/work', 'authors': akey}):
            print((akey, repr(author['name']), 'has works'))
            continue

        # Keep only the found works that have more than two editions.
        candidates = [w for w in find_works(akey) if len(w['editions']) > 2]
        if not candidates:
            continue
        print((akey, repr(author['name'])))
        add_works(akey, candidates)
        print()
Пример #3
0
def by_authors():
    """Pass the larger found works of work-less authors to ``add_works``.

    Authors are read with ``query_iter`` from offset 215000.  Everything up
    to and including the author ``/a/OL218496A`` is only reported as skipped.
    After that, authors that already have a ``/type/work`` record are
    reported and left alone; for the others, the ``find_works`` results with
    more than two editions are handed to ``add_works``.
    """
    find_new_work_key()

    skipping = True
    authors = query_iter(
        {'type': '/type/author', 'name': None, 'works': None},
        offset=215000,
    )
    for a in authors:
        akey = a['key']
        if skipping:
            print('skipping:', akey, a['name'])
            # The marker author itself is still skipped; the next one is not.
            if akey == '/a/OL218496A':
                skipping = False
        elif query({'type': '/type/work', 'authors': akey}):
            print((akey, repr(a['name']), 'has works'))
        else:
            works = []
            for found in find_works(akey):
                if len(found['editions']) > 2:
                    works.append(found)
            if works:
                print((akey, repr(a['name'])))
                add_works(akey, works)
                print()
Пример #4
0
def hide_books(start):
    """Remove the ``ocaid`` link from editions of noindex/dark text items.

    Selects text items updated after ``start`` that have ``noindex`` set or
    ``curatestate='dark'``, skipping items whose collections include
    ``printdisabled``.  Every edition found for such an item that still has
    an ``ocaid`` gets it deleted.  The changed editions are saved in one
    batch, and update requests for the works they reference are sent to
    ``solr_update`` followed by a commit.

    :param start: exclusive lower bound for the ``updated`` column.
    """
    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by updated", {'start': start})
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = {i.lower().strip() for i in row.collection.split(';')}
            if 'printdisabled' in collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            # The query result may list an edition whose link is already gone.
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print(e['key'], repr(e.get('title', None)))
            del e['ocaid']
            mend.append(e)
    print('removing links from %d editions' % len(mend))
    if not mend:
        # Nothing changed: skip the empty save.  fix_works is only filled
        # alongside mend, so there is nothing to re-index either.
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
def find_img(item_id):
    """Look up an item's cover and title leaves and pass them to ``post``.

    Exactly one edition must match the item, either through
    ``source_records`` (``'ia:' + item_id``) or, failing that, through
    ``ocaid``.  ``find_item`` supplies the host and path, ``scandata_url``
    the scan data URL and ``jp2_zip_test`` the status of the jp2 zip.

    Always returns ``None``.  Processing stops early when there is no unique
    edition, no host, a socket error (the host is then recorded in
    ``bad_hosts``), no scan data URL, a 403/404 jp2 status, or a missing
    cover or title leaf.

    :param item_id: identifier of the item to process.
    """
    matches = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(matches) != 1:
        print('no source_records:', matches)
        matches = query({'type': '/type/edition', 'ocaid': item_id})
        if len(matches) != 1:
            print('no ocaid:', matches)
            return
    edition_key = matches[0]['key']
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        print('no host', item_id, ia_host)
        return
    if ia_host in bad_hosts:
        # NOTE(review): the bad host is reported but the item is still
        # processed; confirm whether a return was intended here.
        print('bad_host')

    try:
        url = scandata_url(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    if not url:
        return

    try:
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print('socket error:', ia_host)
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        print('jp2 not found:', (edition_key, item_id))
        return

    # Errors from the leaf lookup are not handled here; they reach the caller.
    (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    if not cover or not title:
        return

    # Printed as one tuple, as the original ``print (...)`` statement did.
    print((edition_key, item_id, ia_host, ia_path, cover, title))
    post(edition_key, item_id, ia_host, ia_path, cover, title)
Пример #6
0
def hide_books(start):
    """Remove the ``ocaid`` link from editions of noindex/dark text items.

    The lower bound for the ``updated`` column is read from the first line
    of ``hide_state_file``.  Items whose collections intersect
    ``ignore_noindex`` are skipped.  Every edition found for a remaining
    item that still has an ``ocaid`` gets it deleted; the changed editions
    are saved in one batch and update requests for the works they reference
    are sent to ``solr_update`` followed by a commit.  Finally the newest
    ``updated`` value among the processed rows is written back to
    ``hide_state_file``.

    When no edition needed changing the function returns early and leaves
    the state file untouched.

    :param start: not used; kept so existing callers keep working.  The
        starting point comes from ``hide_state_file`` instead.
    """
    with open(hide_state_file) as state:
        # rstrip instead of [:-1]: a line without a trailing newline must
        # not lose its last character.
        hide_start = state.readline().rstrip('\n')
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start",
        {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = {i.lower().strip()
                           for i in row.collection.split(';')}
            if ignore_noindex & collections:
                continue
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            # The query result may list an edition whose link is already gone.
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        # The query has no ORDER BY, so the last row is not necessarily the
        # newest one; keep the maximum as the watermark for the next run.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    with open(hide_state_file, 'w') as state:
        print(last_updated, file=state)
Пример #7
0
def hide_books(start):
    """Remove the ``ocaid`` link from editions of noindex/dark text items.

    The lower bound for the ``updated`` column is read from the first line
    of ``hide_state_file``.  Items in the ``printdisabled`` or
    ``lendinglibrary`` collections are skipped.  Every edition found for a
    remaining item that still has an ``ocaid`` gets it deleted; the changed
    editions are saved in one batch and update requests for the works they
    reference are sent to ``solr_update`` followed by a commit.  Finally the
    newest ``updated`` value among the processed rows is written back to
    ``hide_state_file``.

    When no edition needed changing the function returns early and leaves
    the state file untouched.

    :param start: not used; kept so existing callers keep working.  The
        starting point comes from ``hide_state_file`` instead.
    """
    with open(hide_state_file) as state:
        # rstrip instead of [:-1]: a line without a trailing newline must
        # not lose its last character.
        hide_start = state.readline().rstrip("\n")
    print("hide start:", hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query(
        "select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start order by scandate_dt",
        {"start": hide_start},
    )
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = {i.lower().strip() for i in row.collection.split(";")}
            if "printdisabled" in collections or "lendinglibrary" in collections:
                continue
        print(repr(ia), row.updated)
        for eq in query({"type": "/type/edition", "ocaid": ia}):
            print(eq["key"])
            e = ol.get(eq["key"])
            # The query result may list an edition whose link is already gone.
            if "ocaid" not in e:
                continue
            if "works" in e:
                fix_works.update(e["works"])
            print(e["key"], repr(e.get("title", None)))
            del e["ocaid"]
            mend.append(e)
        # Rows arrive ordered by scandate_dt, not by updated, so the last
        # row is not necessarily the newest; keep the maximum as watermark.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated
    print("removing links from %d editions" % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, "remove link"))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ["<commit/>"], debug=True)
    with open(hide_state_file, "w") as state:
        state.write("%s\n" % (last_updated,))
Пример #8
0
def hide_books(start):
    """Remove the ``ocaid`` link from editions of noindex/dark text items.

    The lower bound for the ``updated`` column is read from the first line
    of ``hide_state_file``.  Items whose collections intersect
    ``ignore_noindex`` are skipped.  Every edition found for a remaining
    item that still has an ``ocaid`` gets it deleted; the changed editions
    are saved in one batch and update requests for the works they reference
    are sent to ``solr_update`` followed by a commit.  Finally the newest
    ``updated`` value among the processed rows is written back to
    ``hide_state_file``.

    When no edition needed changing the function returns early and leaves
    the state file untouched.

    :param start: not used; kept so existing callers keep working.  The
        starting point comes from ``hide_state_file`` instead.
    """
    with open(hide_state_file) as state:
        # rstrip instead of [:-1]: a line without a trailing newline must
        # not lose its last character.
        hide_start = state.readline().rstrip('\n')
    print('hide start:', hide_start)

    mend = []
    fix_works = set()
    db_iter = db.query("select identifier, collection, updated from metadata where (noindex is not null or curatestate='dark') and mediatype='texts' and scandate is not null and updated > $start", {'start': hide_start})
    last_updated = None
    for row in db_iter:
        ia = row.identifier
        if row.collection:
            collections = {i.lower().strip() for i in row.collection.split(';')}
            if ignore_noindex & collections:
                continue
        # Printed as a tuple, matching the output of the original statement.
        print((repr(ia), row.updated))
        for eq in query({'type': '/type/edition', 'ocaid': ia}):
            print(eq['key'])
            e = ol.get(eq['key'])
            # The query result may list an edition whose link is already gone.
            if 'ocaid' not in e:
                continue
            if 'works' in e:
                fix_works.update(e['works'])
            print((e['key'], repr(e.get('title', None))))
            del e['ocaid']
            mend.append(e)
        # The query has no ORDER BY, so the last row is not necessarily the
        # newest one; keep the maximum as the watermark for the next run.
        if last_updated is None or row.updated > last_updated:
            last_updated = row.updated
    print('removing links from %d editions' % len(mend))
    if not mend:
        return
    print(ol.save_many(mend, 'remove link'))
    requests = []
    for wkey in fix_works:
        requests += update_work(withKey(wkey))
    if fix_works:
        solr_update(requests + ['<commit/>'], debug=True)
    with open(hide_state_file, 'w') as state:
        state.write('%s\n' % (last_updated,))
Пример #9
0
                    ) or 'census00reel' in ia or ia.startswith(
                        'populationsc1880'):
                print('ia:', ia)
                print('collections:', list(collections))
                print('census not marked correctly')
                continue
            assert 'passportapplicat' not in ia and 'passengerlistsof' not in ia
            if 'passportapplicat' in ia:
                print('skip passport applications for now:', ia)
                continue
            if 'passengerlistsof' in ia:
                print('skip passenger lists', ia)
                continue
            print((repr(ia), row.updated))
            when = str(row.updated)
            if query({'type': '/type/edition', 'ocaid': ia}):
                print('already loaded')
                continue
            if query({'type': '/type/edition', 'source_records': 'ia:' + ia}):
                print('already loaded')
                continue

            try:
                formats = marc_formats(ia, host, path)
            except urllib2.HTTPError as error:
                write_log(ia, when, "error: HTTPError: " + str(error))
                continue
            use_binary = False
            bad_binary = None
            print(formats)
            rec = {}
Пример #10
0
            if re_census.match(ia) or ia.startswith('populationschedu') or ia.startswith('michigancensus') or 'census00reel' in ia or ia.startswith('populationsc1880'):
                print 'ia:', ia
                print 'collections:', list(collections)
                print 'census not marked correctly'
                continue
            assert 'passportapplicat' not in ia and 'passengerlistsof' not in ia
            if 'passportapplicat' in ia:
                print 'skip passport applications for now:', ia
                continue
            if 'passengerlistsof' in ia:
                print 'skip passenger lists', ia
                continue
            print(repr(ia), row.updated)
            when = str(row.updated)
            if query({'type': '/type/edition', 'ocaid': ia}):
                print 'already loaded'
                continue
            if query({'type': '/type/edition', 'source_records': 'ia:' + ia}):
                print 'already loaded'
                continue

            try:
                formats = marc_formats(ia, host, path)
            except urllib2.HTTPError as error:
                write_log(ia, when, "error: HTTPError: " + str(error))
                continue
            use_binary = False
            bad_binary = None
            print formats
            rec = {}
Пример #11
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

# Add an item as a source record to an edition when the candidate's title
# and publisher match the edition exactly.
#
# NOTE(review): every input line is passed to eval().  That is only safe
# while the candidate file comes from a trusted local job; if the lines are
# plain literals, ast.literal_eval would be a safer replacement - confirm.
with open('/1/edward/imagepdf/possible_match2') as candidates:
    for line in candidates:
        doc = eval(line)
        if 'publisher' not in doc:
            continue
        item_id = doc['item_id']
        # Skip items that are already recorded as a source of some edition.
        if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
            continue
        e = withKey(doc['ol'])
        # A missing or empty publishers list cannot be compared below.
        if not e.get('publishers'):
            continue

        # The candidate title may or may not include the edition's title
        # prefix and subtitle; accept any of the four combinations.
        wanted = doc['title']
        title = e['title']
        prefix = e.get('title_prefix', '')
        subtitle = e.get('subtitle', '')
        title_match = (
            wanted == title
            or wanted == prefix + title
            or wanted == prefix + title + subtitle
            or wanted == title + subtitle
        )
        if not title_match:
            continue
        if doc['publisher'] != e['publishers'][0]:
            continue
        print('match:', item_id, doc['ol'])
        add_source_records(doc['ol'], item_id)

Пример #12
0
                or ia.startswith("populationsc1880")
            ):
                print "ia:", ia
                print "collections:", list(collections)
                print "census not marked correctly"
                continue
            assert "passportapplicat" not in ia and "passengerlistsof" not in ia
            if "passportapplicat" in ia:
                print "skip passport applications for now:", ia
                continue
            if "passengerlistsof" in ia:
                print "skip passenger lists", ia
                continue
            print ` ia `, row.updated
            when = str(row.updated)
            if query({"type": "/type/edition", "ocaid": ia}):
                print "already loaded"
                continue
            if query({"type": "/type/edition", "source_records": "ia:" + ia}):
                print "already loaded"
                continue

            try:
                formats = marc_formats(ia)
            except urllib2.HTTPError as error:
                write_log(ia, when, "error: HTTPError: " + str(error))
                continue
            use_binary = False
            bad_binary = None
            print formats
            rec = {}
Пример #13
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

# Remove the ``ocaid`` link from every edition whose identifier is listed,
# one per line, in the ``no_index`` file, then save all changes in one batch.
rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
with open('no_index') as identifiers:
    for line in identifiers:
        # rstrip instead of [:-1]: the last line may lack a trailing newline
        # and must not lose its final character.
        ocaid = line.rstrip('\n')
        if not ocaid:
            continue  # blank line, nothing to look up
        for e in query({'type': '/type/edition', 'title': None, 'ocaid': ocaid}):
            num += 1
            print(num, e['key'], repr(e['title']), ocaid)
            e2 = ol.get(e['key'])
            # The query result may list an edition whose link is already gone.
            if 'ocaid' not in e2:
                continue
            del e2['ocaid']
            to_fix.append(e2)

if to_fix:
    ol.save_many(to_fix, 'remove link')
Пример #14
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.api import OpenLibrary, unmarshal
from openlibrary.catalog.read_rc import read_rc

# Remove the ``ocaid`` link from every edition whose identifier is listed,
# one per line, in the ``no_index`` file, then save all changes in one batch.
rc = read_rc()
ol = OpenLibrary("http://openlibrary.org")
ol.login('ImportBot', rc['ImportBot'])

to_fix = []
num = 0
with open('no_index') as identifiers:
    for line in identifiers:
        # rstrip instead of [:-1]: the last line may lack a trailing newline
        # and must not lose its final character.
        ocaid = line.rstrip('\n')
        if not ocaid:
            continue  # blank line, nothing to look up
        for e in query({
                'type': '/type/edition',
                'title': None,
                'ocaid': ocaid
        }):
            num += 1
            print(num, e['key'], repr(e['title']), ocaid)
            e2 = ol.get(e['key'])
            # The query result may list an edition whose link is already gone.
            if 'ocaid' not in e2:
                continue
            del e2['ocaid']
            to_fix.append(e2)

if to_fix:
    ol.save_many(to_fix, 'remove link')
Пример #15
0
from openlibrary.catalog.utils.query import query, withKey
from openlibrary.catalog.importer.update import add_source_records

# Add an item as a source record to an edition when the candidate's title
# and publisher match the edition exactly.
#
# NOTE(review): every input line is passed to eval().  That is only safe
# while the candidate file comes from a trusted local job; if the lines are
# plain literals, ast.literal_eval would be a safer replacement - confirm.
with open('/1/edward/imagepdf/possible_match2') as candidates:
    for line in candidates:
        doc = eval(line)
        if 'publisher' not in doc:
            continue
        item_id = doc['item_id']
        # Skip items that are already recorded as a source of some edition.
        if query({'type': '/type/edition', 'source_records': 'ia:' + item_id}):
            continue
        e = withKey(doc['ol'])
        # A missing or empty publishers list cannot be compared below.
        if not e.get('publishers'):
            continue

        # The candidate title may or may not include the edition's title
        # prefix and subtitle; accept any of the four combinations.
        wanted = doc['title']
        title = e['title']
        prefix = e.get('title_prefix', '')
        subtitle = e.get('subtitle', '')
        title_match = (
            wanted == title
            or wanted == prefix + title
            or wanted == prefix + title + subtitle
            or wanted == title + subtitle
        )
        if not title_match:
            continue
        if doc['publisher'] != e['publishers'][0]:
            continue
        print('match:', item_id, doc['ol'])
        add_source_records(doc['ol'], item_id)