Exemplo n.º 1
0
def find_title(item_id):
    """Return the title of an archive.org item, or None if unavailable.

    Fetches the item's scandata XML (the standalone file first, then the
    zipped copy via zipview.php), verifies that the page-image zip exists,
    and parses the title from the scandata.  Hosts that raise socket
    errors are recorded in the module-level ``bad_hosts`` set.
    """
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        return
    # Preferred source: the standalone <item>_scandata.xml file.
    url = 'http://' + ia_host + ia_path + "/" + item_id + "_scandata.xml"
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except Exception:
        # Was a bare `except:` -- narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed; the fetch remains best-effort.
        pass
    if not scandata or '<book>' not in scandata[:100]:
        # Fallback: scandata.xml packed inside scandata.zip.
        url = "http://" + ia_host + "/zipview.php?zip=" + ia_path + "/scandata.zip&file=scandata.xml"
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return

    # Items with a 'goog' suffix ship TIFF zips; everything else uses JPEG 2000.
    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        # Page-image zip missing or forbidden: treat as no usable item.
        return

    (cover, title) = parse_scandata_xml(scandata)
    return title
Exemplo n.º 2
0
def find_title(item_id):
    """Look up an archive.org item's title via its scandata XML.

    Tries the item's standalone scandata file, falls back to the zipped
    scandata served by zipview.php, checks that the image zip is present,
    and returns the parsed title (or None on any failure).  Socket errors
    mark the host in the module-level ``bad_hosts`` set.
    """
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        return
    url = 'http://' + ia_host + ia_path + "/" + item_id + "_scandata.xml"
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except Exception:
        # Narrowed from a bare `except:` so fatal exits are not swallowed;
        # failure simply falls through to the zip fallback below.
        pass
    if not scandata or '<book>' not in scandata[:100]:
        # Direct file missing or not scandata: read it out of scandata.zip.
        url = "http://" + ia_host + "/zipview.php?zip=" + ia_path + "/scandata.zip&file=scandata.xml"
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return

    # Google scans use TIFF zips, all other items JPEG 2000.
    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        return

    (cover, title) = parse_scandata_xml(scandata)
    return title
def find_img(item_id):
    # Locate the Open Library edition matching an archive.org item, verify
    # that the item's scandata and jp2 image zip are reachable, then post
    # the cover leaf and title back via post().  (Python 2 code.)
    e = query({'type': '/type/edition', 'source_records': 'ia:' + item_id})
    if len(e) != 1:
        print 'no source_records:', e
        # Fall back to matching on the edition's ocaid field.
        e = query({'type': '/type/edition', 'ocaid': item_id})
        if len(e) != 1:
            print 'no ocaid:', e
            return
    ol = e[0]['key']  # Open Library key of the matched edition
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        print 'no host', item_id, ia_host
        return
    if ia_host in bad_hosts:
        # NOTE(review): only warns -- processing continues even for a host
        # previously marked bad; confirm whether a `return` was intended.
        print 'bad_host'
    try:
        url = scandata_url(ia_host, ia_path, item_id)
        if not url:
            return
    except socket.error:
        print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return

    try:
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        print 'jp2 not found:', (ol, item_id)
        return

    try:
        (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    except (KeyboardInterrupt, SystemExit, NameError):
        # Re-raise fatal exceptions; any other exception propagates
        # untouched since there is no other handler.
        raise
    if not cover or not title:
        return


#    except:
#        print 'skip error:', ol, item_id, ia_host, ia_path
#        return
    # NOTE(review): under Python 2 this print() call outputs a tuple,
    # unlike the print statements used above -- likely a py2/py3 mix-up.
    print(ol, item_id, ia_host, ia_path, cover, title)
    post(ol, item_id, ia_host, ia_path, cover, title)
def find_img(item_id):
    # Match an archive.org item to its Open Library edition, check that
    # scandata and the jp2 zip exist, and post() the cover/title info.
    # (Python 2 code.)
    e = query({'type':'/type/edition', 'source_records':'ia:' + item_id})
    if len(e) != 1:
        print 'no source_records:', e
        # Retry the lookup using the ocaid field.
        e = query({'type':'/type/edition', 'ocaid': item_id})
        if len(e) != 1:
            print 'no ocaid:', e
            return
    ol = e[0]['key']  # Open Library edition key
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        print 'no host', item_id, ia_host
        return
    if ia_host in bad_hosts:
        # NOTE(review): warning only; the item is still processed even on
        # a known-bad host -- verify whether this should skip instead.
        print 'bad_host'
    try:
        url = scandata_url(ia_host, ia_path, item_id)
        if not url:
            return
    except socket.error:
        print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return

    try:
        status = jp2_zip_test(ia_host, ia_path, item_id)
    except socket.error:
        print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        print 'jp2 not found:', (ol, item_id)
        return

    try:
        (cover, title) = find_title_leaf_et(ia_host, ia_path, url)
    except (KeyboardInterrupt, SystemExit, NameError):
        # Fatal exceptions re-raised; others propagate (no broad handler).
        raise
    if not cover or not title:
        return
#    except:
#        print 'skip error:', ol, item_id, ia_host, ia_path
#        return
    print (ol, item_id, ia_host, ia_path, cover, title)
    post(ol, item_id, ia_host, ia_path, cover, title)
Exemplo n.º 5
0
def get_from_archive(locator):
    """Fetch a byte range of a file stored on an archive.org datanode.

    `locator` has the form "marc:<item>/<file>:<offset>:<length>" (the
    "marc:" prefix is optional).  The item's host/path are resolved with
    find_item(), retrying socket timeouts, and the range is requested
    with an HTTP Range header.

    NOTE(review): as excerpted here the function ends after the retry
    loop without returning `f` or reading the response; the original
    presumably continues past this point.
    """
    if locator.startswith('marc:'):
        locator = locator[5:]
    filename, offset, length = locator.split(":")
    offset = int(offset)
    length = int(length)

    # Split "<item>/<file-within-item>".
    ia, rest = filename.split('/', 1)

    # Resolve the datanode, retrying timeouts up to five times.
    for attempt in range(5):
        try:
            host, path = find_item(ia)
            break
        except socket.timeout:
            if attempt == 4:
                raise
            print 'retry, attempt', attempt

    # Inclusive byte range for the HTTP Range header.
    r0, r1 = offset, offset + length - 1
    url = 'http://' + host + path + '/' + rest

    # Guard against absurd range sizes.
    assert 0 < length < 100000

    ureq = urllib2.Request(
        url,
        None,
        {'Range': 'bytes=%d-%d' % (r0, r1)},
    )

    f = None
    # NOTE(review): no break after a successful urlopen, so the URL may be
    # opened up to three times even when the first attempt succeeds.
    for i in range(3):
        try:
            f = urllib2.urlopen(ureq)
        except urllib2.HTTPError, error:
            # 416 (unsatisfiable range) and 404 are fatal; other HTTP
            # errors are logged and the request retried.
            if error.code == 416:
                raise
            elif error.code == 404:
                print "404 for '%s'" % url
                raise
            else:
                print url
                print 'error:', error.code, error.msg
        except urllib2.URLError:
            # Transient network failure: retry silently.
            pass
Exemplo n.º 6
0
def get_from_archive(locator):
    """Fetch a byte range from an archive.org file described by `locator`.

    `locator` is "marc:<item>/<file>:<offset>:<length>" with an optional
    "marc:" prefix.  find_item() resolves the datanode (timeouts retried),
    and the bytes are requested via an HTTP Range header.

    NOTE(review): this excerpt ends without returning `f` or reading the
    response; the original function likely continues beyond this point.
    """
    if locator.startswith('marc:'):
        locator = locator[5:]
    filename, offset, length = locator.split (":")
    offset = int (offset)
    length = int (length)

    # "<item>/<file-within-item>"
    ia, rest = filename.split('/', 1)

    # Resolve the item's host/path, retrying socket timeouts up to 5x.
    for attempt in range(5):
        try:
            host, path = find_item(ia)
            break
        except socket.timeout:
            if attempt == 4:
                raise
            print 'retry, attempt', attempt

    # Inclusive byte range for the Range header.
    r0, r1 = offset, offset+length-1
    url = 'http://' + host + path + '/' + rest 

    # Reject absurd range sizes.
    assert 0 < length < 100000

    ureq = urllib2.Request(url, None, {'Range':'bytes=%d-%d'% (r0, r1)},)

    f = None
    # NOTE(review): no break on success, so up to three requests are made.
    for i in range(3):
        try:
            f = urllib2.urlopen(ureq)
        except urllib2.HTTPError, error:
            # Range errors (416) and missing files (404) are fatal;
            # other codes are logged and retried.
            if error.code == 416:
                raise
            elif error.code == 404:
                print "404 for '%s'" % url
                raise
            else:
                print url
                print 'error:', error.code, error.msg
        except urllib2.URLError:
            # Transient network error: retry.
            pass
Exemplo n.º 7
0
        # (Fragment: the matching `if` branch and enclosing loop lie
        # outside this excerpt.)
        else:
            # No single item requested: sweep a two-day window of updated
            # texts starting at `start`.
            print('start:', start)
            db_iter = db.query(
                "select identifier, contributor, updated, noindex, collection, format from metadata where scanner is not null and mediatype='texts' and (not curatestate='dark' or curatestate is null) and scandate is not null and format is not null and updated between $start and date_add($start, interval 2 day) order by updated",
                {'start': start})
        t_start = time()
        for row in db_iter:
            # Send a bad-MARC alert when more than ten records accumulate,
            # or when any are pending and the last alert is >4 hours old.
            if len(bad_marc) > 10 or (bad_marc
                                      and time() - bad_marc_last_sent >
                                      (4 * 60 * 60)):
                bad_marc_alert(bad_marc)
                bad_marc = []
                bad_marc_last_sent = time()

            ia = row.identifier
            host, path = find_item(ia)
            if 'pdf' not in row.format.lower():
                continue  # scancenter and billing staff often use format like "%pdf%" as a proxy for having derived
            if row.contributor == 'Allen County Public Library Genealogy Center':
                print('skipping Allen County Public Library Genealogy Center')
                continue
            # Normalise the semicolon-separated collection list.
            if row.collection:
                collections = set(i.lower().strip()
                                  for i in row.collection.split(';'))
            else:
                collections = set()
            if row.noindex:
                if not row.collection:
                    continue
                # NOTE(review): rebuilds the same set computed just above;
                # the excerpt ends before the check that presumably uses it.
                collections = set(i.lower().strip()
                                  for i in row.collection.split(';'))
Exemplo n.º 8
0
def head(host, path, ia):
    """Issue an HTTP HEAD request for the item's MARC XML and return the response."""
    marc_xml_path = "%s/%s_marc.xml" % (path, ia)
    connection = httplib.HTTPConnection(host)
    connection.request("HEAD", marc_xml_path)
    return connection.getresponse()

# Scan each identifier in 'to_load', HEAD its MARC XML on the item's host,
# and sort identifiers into the no_marc / later output files.
# NOTE(review): `r1` (the HEAD response) is unused and `out` is never
# written in this excerpt; the success-path handling presumably follows.
bad_machine = set()  # hosts that raised socket errors; retry their items later
out = open('has_marc', 'w')
no = open('no_marc', 'w')
later = open('later', 'w')
for line in open('to_load'):
    ia = line[:-1]  # strip the trailing newline
    if line.startswith('('):
        print(ia, file=no)
        continue
    (host, path) = find_item(ia)
    if not host:
        print(ia, file=no)
        continue
    if host in bad_machine:
        print(ia, file=later)
        continue
    try:
        r1 = head(host, path, ia)
    except socket.error:
        print('socket error')
        print("http://" + host + path + "/" + ia + "_marc.xml")
        print('try later')
        # Bug fix: record the failing *host* -- membership is tested with
        # `host in bad_machine` above; previously the item id was added,
        # so the check could never match.
        bad_machine.add(host)
        print(ia, file=later)
Exemplo n.º 9
0
def load_book(ia, collections, boxid, scanned=True):
    if ia.startswith('annualreportspri'):
        print 'skipping:', ia
        return
    if 'shenzhentest' in collections:
        return

    if any('census' in c for c in collections):
        print 'skipping census'
        return

    if re_census.match(ia) or ia.startswith('populationschedu') or ia.startswith('michigancensus') or 'census00reel' in ia or ia.startswith('populationsc1880'):
        print 'ia:', ia
        print 'collections:', list(collections)
        print 'census not marked correctly'
        return
    try:
        host, path = find_item(ia)
    except socket.timeout:
        print 'socket timeout:', ia
        return
    except FindItemError:
        print 'find item error:', ia
    bad_binary = None
    try:
        formats = marc_formats(ia, host, path)
    except urllib2.HTTPError as error:
        return

    if formats['bin']: # binary MARC
        marc_data = get_marc_ia_data(ia, host, path)
        assert isinstance(marc_data, str)
        marc_error = check_marc_data(marc_data)
        if marc_error == 'double encode':
            marc_data = marc_data.decode('utf-8').encode('raw_unicode_escape')
            marc_error = None
        if marc_error:
            return
        contenttype = 'application/marc'
    elif formats['xml']: # MARC XML
        return # waiting for Raj to fox MARC XML loader
        marc_data = urllib2.urlopen('http://' + host + path + '/' + ia + '_meta.xml').read()
        contenttype = 'text/xml'
    else:
        return
    subjects = []
    if scanned:
        if 'lendinglibrary' in collections:
            subjects += ['Protected DAISY', 'Lending library']
        elif 'inlibrary' in collections:
            subjects += ['Protected DAISY', 'In library']
        elif 'printdisabled' in collections:
            subjects.append('Protected DAISY')

    if not boxid:
        boxid = None
    try:
        post_to_import_api(ia, marc_data, contenttype, subjects, boxid, scanned=scanned)
    except BadImport:
        print >> bad, ia
        bad.flush()
    except BadLang:
        print >> bad_lang, ia
        bad_lang.flush()
Exemplo n.º 10
0
def run_find_item():
    """Worker loop: resolve items from item_queue and dispatch their text.

    For each (num, ia) pulled from item_queue, resolves the item's
    host/path, reads a pre-extracted ABBYY text body when present, and
    either enqueues it for Solr indexing or hands it to a per-host
    reader thread.  Runs forever.
    """
    global find_item_book
    while True:
        (num, ia) = item_queue.get()
        find_item_book = ia  # expose current item for external monitoring
        #print 'find_item:', ia
        t0_find_item = time()
        try:
            (host, path) = find_item(ia)
        except FindItemError:
            t1_find_item = time() - t0_find_item
            #print 'fail find_item:', ia, t1_find_item
            item_queue.task_done()
            done(ia, False)
            continue
        t1_find_item = time() - t0_find_item
        #print 'find_item:', ia, t1_find_item
        # Keep a rolling window of the last 100 lookup timings.
        if len(locator_times) == 100:
            locator_times.pop(0)
        locator_times.append((t1_find_item, host))

        body = None
        if False:  # disabled Solr-based body lookup path
            url = 'http://' + solr_src_host + '/solr/inside/select?wt=json&rows=10&q=ia:' + ia
            response = json.load(urllib2.urlopen(url))['response']
            if response['numFound']:
                doc = response['docs'][0]
                for doc_lang in ['eng', 'fre', 'deu', 'spa', 'other']:
                    if doc.get('body_' + doc_lang):
                        body = doc['body_' + doc_lang]
                        break
                assert body
        # Pre-extracted OCR text, sharded by the first two id characters.
        filename = '/1/abbyy_text/data/' + ia[:2] + '/' + ia
        if os.path.exists(filename):
            body = codecs.open(filename, 'r', 'utf-8').read()
        if body:
            try:
                meta_xml = urlread_keep_trying('http://%s%s/%s_meta.xml' %
                                               (host, path, ia))
            except urllib2.HTTPError as error:
                if error.code != 403:
                    raise
                print('403 on meta XML for:', ia)
                item_queue.task_done()  # skip
                done(ia, False)
                continue
            try:
                root = fromstring(meta_xml)
            except:
                # NOTE(review): bare except only logs; if parsing fails,
                # `root` is unbound and the next line raises NameError.
                print('identifer:', ia)
            collection = [e.text for e in root.findall('collection')]
            elem_noindex = root.find('noindex')
            # Skip noindex items unless they are in a lending collection.
            if elem_noindex is not None and elem_noindex.text == 'true' and (
                    'printdisabled' not in collection
                    and 'lendinglibrary' not in collection):
                item_queue.task_done()  # skip
                done(ia, False)
                continue
            lang_elem = root.find('language')
            if lang_elem is None:
                print(meta_xml)
            if lang_elem is not None:
                lang = tidy_lang(lang_elem.text) or 'other'
            else:
                lang = 'other'

            #print 'solr_queue.put((ia, body, page_count))'
            solr_queue.put((ia, body, lang, page_counts[ia], collection))
            #print 'solr_queue.put() done'
        else:
            # No local text: queue the item for its host's reader thread,
            # spawning that thread on first use.
            host_queues[host].put((num, ia, path))
            if host not in host_threads:
                host_threads[host] = spawn_link_exception(
                    read_text_from_node, host)
        item_queue.task_done()
Exemplo n.º 11
0
    # (Fragment: the enclosing function and loop exit are outside this
    # excerpt.)  Python 2 code.
    while True:

        if args.item_id:
            # Single-item mode: query just the requested identifier.
            db_iter = db.query("select identifier, contributor, updated, noindex, collection, format from metadata where scanner is not null and mediatype='texts' and (not curatestate='dark' or curatestate is null) and scandate is not null and format is not null and identifier=$item_id", {'item_id': args.item_id})
        else:
            # Sweep mode: all texts updated in a two-day window from `start`.
            print 'start:', start
            db_iter = db.query("select identifier, contributor, updated, noindex, collection, format from metadata where scanner is not null and mediatype='texts' and (not curatestate='dark' or curatestate is null) and scandate is not null and format is not null and updated between $start and date_add($start, interval 2 day) order by updated", {'start': start})
        t_start = time()
        for row in db_iter:
            # Flush bad-MARC reports after >10 accumulate, or when any are
            # pending and the last alert is more than four hours old.
            if len(bad_marc) > 10 or (bad_marc and time() - bad_marc_last_sent > (4 * 60 * 60)):
                bad_marc_alert(bad_marc)
                bad_marc = []
                bad_marc_last_sent = time()

            ia = row.identifier
            host, path = find_item(ia)
            if 'pdf' not in row.format.lower():
                continue # scancenter and billing staff often use format like "%pdf%" as a proxy for having derived
            if row.contributor == 'Allen County Public Library Genealogy Center':
                print 'skipping Allen County Public Library Genealogy Center'
                continue
            # Normalise the semicolon-separated collection list.
            if row.collection:
                collections = set(i.lower().strip() for i in row.collection.split(';'))
            else:
                collections = set()
            if row.noindex:
                if not row.collection:
                    continue
                collections = set(i.lower().strip() for i in row.collection.split(';'))
                # Honour noindex unless the item is in an ignorable collection.
                if not ignore_noindex & collections:
                    continue
Exemplo n.º 12
0
def load_book(ia, collections, boxid, scanned=True):
    """Fetch an archive.org item's MARC record and post it to the import API.

    Census material, test items, and records failing MARC sanity checks
    are skipped.  For scanned books, `collections` determines the
    Protected DAISY subject tags.  Failures are appended to the
    module-level `bad` / `bad_lang` log files.
    """
    if ia.startswith('annualreportspri'):
        print('skipping:', ia)
        return
    if 'shenzhentest' in collections:
        return

    if any('census' in c for c in collections):
        print('skipping census')
        return

    if re_census.match(ia) or ia.startswith(
            'populationschedu') or ia.startswith(
                'michigancensus') or 'census00reel' in ia or ia.startswith(
                    'populationsc1880'):
        print('ia:', ia)
        print('collections:', list(collections))
        print('census not marked correctly')
        return
    try:
        host, path = find_item(ia)
    except socket.timeout:
        print('socket timeout:', ia)
        return
    except FindItemError:
        print('find item error:', ia)
        # Bug fix: without this return, host/path are unbound below and
        # marc_formats() would raise NameError.
        return
    try:
        formats = marc_formats(ia, host, path)
    except urllib2.HTTPError as error:
        return

    if formats['bin']:  # binary MARC
        marc_data = get_marc_ia_data(ia, host, path)
        assert isinstance(marc_data, str)
        marc_error = check_marc_data(marc_data)
        if marc_error == 'double encode':
            # Repair doubly UTF-8 encoded records, then accept them.
            marc_data = marc_data.decode('utf-8').encode('raw_unicode_escape')
            marc_error = None
        if marc_error:
            return
        contenttype = 'application/marc'
    elif formats['xml']:  # MARC XML
        return  # deliberately disabled: waiting for the MARC XML loader fix
        marc_data = urllib2.urlopen('http://' + host + path + '/' + ia +
                                    '_meta.xml').read()
        contenttype = 'text/xml'
    else:
        return
    subjects = []
    if scanned:
        # Lending/in-library/print-disabled collections get DAISY subjects.
        if 'lendinglibrary' in collections:
            subjects += ['Protected DAISY', 'Lending library']
        elif 'inlibrary' in collections:
            subjects += ['Protected DAISY', 'In library']
        elif 'printdisabled' in collections:
            subjects.append('Protected DAISY')

    if not boxid:
        boxid = None  # normalise falsy box ids to None
    try:
        post_to_import_api(ia,
                           marc_data,
                           contenttype,
                           subjects,
                           boxid,
                           scanned=scanned)
    except BadImport:
        print(ia, file=bad)
        bad.flush()
    except BadLang:
        print(ia, file=bad_lang)
        bad_lang.flush()
Exemplo n.º 13
0
def run_find_item():
    """Worker loop: resolve items from item_queue and route their text.

    Pulls (num, ia) pairs from item_queue forever; resolves each item's
    host/path, reads a pre-extracted ABBYY text body when one exists,
    and either enqueues it for Solr or hands it to a per-host reader
    thread.  (Python 2 code.)
    """
    global find_item_book
    while True:
        (num, ia) = item_queue.get()
        find_item_book = ia  # expose current item for external monitoring
        #print 'find_item:', ia
        t0_find_item = time()
        try:
            (host, path) = find_item(ia)
        except FindItemError:
            t1_find_item = time() - t0_find_item
            #print 'fail find_item:', ia, t1_find_item
            item_queue.task_done()
            done(ia, False)
            continue
        t1_find_item = time() - t0_find_item
        #print 'find_item:', ia, t1_find_item
        # Rolling window of the last 100 locator timings.
        if len(locator_times) == 100:
            locator_times.pop(0)
        locator_times.append((t1_find_item, host))

        body = None
        if False:  # disabled Solr-based body lookup path
            url = 'http://' + solr_src_host + '/solr/inside/select?wt=json&rows=10&q=ia:' + ia
            response = json.load(urllib2.urlopen(url))['response']
            if response['numFound']:
                doc = response['docs'][0]
                for doc_lang in ['eng', 'fre', 'deu', 'spa', 'other']:
                    if doc.get('body_' + doc_lang):
                        body = doc['body_' + doc_lang]
                        break
                assert body
        # Pre-extracted OCR text, sharded by the first two id characters.
        filename = '/1/abbyy_text/data/' + ia[:2] + '/' + ia
        if os.path.exists(filename):
            body = codecs.open(filename, 'r', 'utf-8').read()
        if body:
            try:
                meta_xml = urlread_keep_trying('http://%s%s/%s_meta.xml' % (host, path, ia))
            except urllib2.HTTPError, error:
                if error.code != 403:
                    raise
                print '403 on meta XML for:', ia
                item_queue.task_done() # skip
                done(ia, False)
                continue
            try:
                root = fromstring(meta_xml)
            except:
                # NOTE(review): bare except only logs; on parse failure
                # `root` is unbound and the next line raises NameError.
                print 'identifer:', ia
            collection = [e.text for e in root.findall('collection')]
            elem_noindex = root.find('noindex')
            # Skip noindex items unless they belong to a lending collection.
            if elem_noindex is not None and elem_noindex.text == 'true' and ('printdisabled' not in collection and 'lendinglibrary' not in collection):
                item_queue.task_done() # skip
                done(ia, False)
                continue
            lang_elem = root.find('language')
            if lang_elem is None:
                print meta_xml
            if lang_elem is not None:
                lang = tidy_lang(lang_elem.text) or 'other'
            else:
                lang = 'other'

            #print 'solr_queue.put((ia, body, page_count))'
            solr_queue.put((ia, body, lang, page_counts[ia], collection))
            #print 'solr_queue.put() done'
        else:
            # No local text: hand off to the host's reader thread,
            # spawning it on first use.
            host_queues[host].put((num, ia, path))
            if host not in host_threads:
                host_threads[host] = spawn_link_exception(read_text_from_node, host)
        item_queue.task_done()
Exemplo n.º 14
0
def head(host, path, ia):
    """HEAD the item's MARC XML file on `host` and return the HTTP response."""
    http_conn = httplib.HTTPConnection(host)
    http_conn.request("HEAD", "%s/%s_marc.xml" % (path, ia))
    response = http_conn.getresponse()
    return response


# Scan each identifier in 'to_load', HEAD its MARC XML on the item's host,
# and sort identifiers into the no_marc / later output files.
# (Python 2 code.)
# NOTE(review): `r1` is unused and `out` is never written in this excerpt;
# the success-path handling presumably follows.
bad_machine = set()  # hosts that raised socket errors; retry their items later
out = open('has_marc', 'w')
no = open('no_marc', 'w')
later = open('later', 'w')
for line in open('to_load'):
    ia = line[:-1]  # strip the trailing newline
    if line.startswith('('):
        print >> no, ia
        continue
    (host, path) = find_item(ia)
    if not host:
        print >> no, ia
        continue
    if host in bad_machine:
        print >> later, ia
        continue
    try:
        r1 = head(host, path, ia)
    except socket.error:
        print 'socket error'
        print "http://" + host + path + "/" + ia + "_marc.xml"
        print 'try later'
        # Bug fix: record the failing *host* -- membership is tested with
        # `host in bad_machine` above; previously the item id was added,
        # so the check could never match.
        bad_machine.add(host)
        print >> later, ia