Example #1
def find_title(item_id):
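    """Return the title parsed from an item's scandata XML, or None if the
    host, the scandata, or the page-image zip cannot be located."""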
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        return
    url = 'http://' + ia_host + ia_path + "/" + item_id + "_scandata.xml"
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except Exception:
        pass
    if not scandata or '<book>' not in scandata[:100]:
        url = "http://" + ia_host + "/zipview.php?zip=" + ia_path + "/scandata.zip&file=scandata.xml"
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return

    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        #print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        #print zip_type, ' not found:', item_id
        return

    (cover, title) = parse_scandata_xml(scandata)
    return title
Example #2
def find_title(item_id):
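    """Return the title parsed from an item's scandata XML, or None if the
    host, the scandata, or the page-image zip cannot be located."""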
    (ia_host, ia_path) = find_item(item_id)

    if not ia_host:
        return
    url = 'http://' + ia_host + ia_path + "/" + item_id + "_scandata.xml"
    scandata = None
    try:
        scandata = urlopen_keep_trying(url).read()
    except Exception:
        pass
    if not scandata or '<book>' not in scandata[:100]:
        url = "http://" + ia_host + "/zipview.php?zip=" + ia_path + "/scandata.zip&file=scandata.xml"
        scandata = urlopen_keep_trying(url).read()
    if not scandata or '<book>' not in scandata:
        return

    zip_type = 'tif' if item_id.endswith('goog') else 'jp2'
    try:
        status = zip_test(ia_host, ia_path, item_id, zip_type)
    except socket.error:
        #print 'socket error:', ia_host
        bad_hosts.add(ia_host)
        return
    if status in (403, 404):
        #print zip_type, ' not found:', item_id
        return

    (cover, title) = parse_scandata_xml(scandata)
    return title
Example #3
def load_xml(ia):
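    """Fetch the item's MARC XML record and return it as MarcXml,
    unwrapping the MARC21 slim <collection> element if present."""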
    url = archive_url + ia + '/' + ia + '_marc.xml'
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
Example #4
def load_xml(ia):
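    """Fetch the item's MARC XML record and return it as MarcXml,
    unwrapping the MARC21 slim <collection> element if present."""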
    url = archive_url + ia + '/' + ia + '_marc.xml'
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
Example #5
def load_binary(ia):
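    """Fetch the item's binary MARC record (_meta.mrc) and return it as
    MarcBinary; the record length in the leader (first 5 bytes) must match
    the data length, re-encoding once if it does not."""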
    url = archive_url + ia + "/" + ia + "_meta.mrc"
    f = urlopen_keep_trying(url)
    data = f.read()
    assert "<title>Internet Archive: Page Not Found</title>" not in data[:200]
    if len(data) != int(data[:5]):
        data = data.decode("utf-8").encode("raw_unicode_escape")
    assert len(data) == int(data[:5])
    return MarcBinary(data)
Example #6
def load_binary(ia):
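    """Fetch the item's binary MARC record (_meta.mrc) and return it as
    MarcBinary; the record length in the leader (first 5 bytes) must match
    the data length, re-encoding once if it does not."""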
    url = archive_url + ia + '/' + ia + '_meta.mrc'
    f = urlopen_keep_trying(url)
    data = f.read()
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == int(data[:5])
    return MarcBinary(data)
Example #7
def load_binary(ia, host, path):
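    """Fetch the item's binary MARC record directly from the given host and
    path and return it as MarcBinary."""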
    url = 'http://' + host + path + '/' + ia + '_meta.mrc'
    print(url)
    f = urlopen_keep_trying(url)
    data = f.read()
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == int(data[:5])
    return MarcBinary(data)
Example #8
def load_xml(ia):
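    """Fetch the item's MARC XML record and return it as MarcXml,
    unwrapping the MARC21 slim <collection> element if present."""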
    url = archive_url + ia + "/" + ia + "_marc.xml"
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == "{http://www.loc.gov/MARC21/slim}collection":
        root = root[0]
    return MarcXml(root)
Example #9
def load_binary(ia):
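    """Fetch the item's binary MARC record and return it as MarcBinary, or
    None if the record length in the leader still disagrees after re-encoding."""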
    url = archive_url + ia + '/' + ia + '_meta.mrc'
    f = urlopen_keep_trying(url)
    data = f.content
    assert '<title>Internet Archive: Page Not Found</title>' not in data[:200]
    if len(data) != int(data[:5]):
        data = data.decode('utf-8').encode('raw_unicode_escape')
    if len(data) != int(data[:5]):
        return
    return MarcBinary(data)
Example #10
def load_xml(ia, host, path):
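    """Fetch the item's MARC XML record directly from the given host and path
    and return it as MarcXml, unwrapping the <collection> element if present."""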
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
Example #11
def load_xml(ia, host, path):
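    """Fetch the item's MARC XML record directly from the given host and path
    and return it as MarcXml, unwrapping the <collection> element if present."""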
    url = 'http://' + host + path + '/' + ia + '_marc.xml'
    print(url)
    f = urlopen_keep_trying(url)
    root = etree.parse(f).getroot()
    if root.tag == '{http://www.loc.gov/MARC21/slim}collection':
        root = root[0]
    return MarcXml(root)
Example #12
def load(loc, ia):
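    """Fetch the MARC record at loc, parse it into an edition, set its ocaid,
    and write it; unparsable or title-less records are skipped."""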
    print "load", loc, ia
    url = archive_url + loc
    f = urlopen_keep_trying(url)
    try:
        edition = parse_xml.parse(f)
    except parse_xml.BadSubtag:
        return
    if 'title' not in edition:
        return
    edition['ocaid'] = ia
    write_edition(ia, edition)
Example #13
def load(loc, ia):
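    """Fetch the MARC record at loc, parse it into an edition, set its ocaid,
    and write it under an "ia:"-prefixed key; unparsable or title-less records
    are skipped."""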
    print("load", loc, ia)
    url = archive_url + loc
    f = urlopen_keep_trying(url)
    try:
        edition = parse_xml.parse(f)
    except AssertionError:
        return
    except parse_xml.BadSubtag:
        return
    except KeyError:
        return
    if 'title' not in edition:
        return
    edition['ocaid'] = ia
    write_edition("ia:" + ia, edition)