예제 #1
0
 # Download "<ia>_marc.xml" from below `base`, parse it with
 # read_xml.read_edition and return the parsed result.  The statements after
 # the parse attempt only run when parsing raised BadXML or ExpatError.
 # if there is a problem with the XML switch to the binary MARC
 xml_file = ia + "_marc.xml"
 loc = ia + "/" + xml_file
 try:
     print base + loc
     f = urlopen_keep_trying(base + loc)
 except urllib2.HTTPError, error:
     # An HTTP 404 is translated into NoMARCXML; every other HTTP error is
     # printed and re-raised unchanged.
     if error.code == 404:
         raise NoMARCXML
     else:
         print 'error:', error.code, error.msg
         raise
 # NOTE(review): with assertions enabled this assert makes the `if f:` test
 # below redundant; under `python -O` the assert is removed and only the
 # `if` remains.
 assert f
 if f:
     try:
         return read_xml.read_edition(f)
     except read_xml.BadXML:
         # Ignored so that execution falls through to the binary MARC code.
         pass
     except xml.parsers.expat.ExpatError:
         #print 'IA:', `ia`
         #print 'XML parse error:', base + loc
         pass
 print base + loc
 # Request the same URL a second time and look for the "Page Not Found"
 # title in at most the first 200 bytes of the body; a match is reported as
 # NoMARCXML.  Presumably this covers a not-found page that was served
 # without a 404 status -- TODO confirm.
 if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(base + loc).read(200):
     raise NoMARCXML
 # Fall back to the "<ia>_meta.mrc" file located under the same item path.
 url = base + ia + "/" + ia + "_meta.mrc"
 print url
 try:
     f = urlopen_keep_trying(url)
 except urllib2.URLError:
     # NOTE(review): the error is swallowed and `f` is not reassigned, so it
     # still refers to the XML response opened earlier.
     pass
예제 #2
0
            # Tail of an "already loaded" branch whose condition lies above
            # this excerpt: report the item, count it and skip to the next one.
            print item, 'already loaded'
            load_count += 1
            continue
        # A non-empty result from querying `ol` for an edition whose
        # source_records contains 'ia:' + ia is also treated as already loaded.
        if ol.query({'type': '/type/edition', 'source_records': 'ia:' + ia}):
            print 'already loaded'
            load_count += 1
            continue
    # Sanity checks on the item; on failure the offending item is printed
    # before the AssertionError is re-raised.
    # NOTE(review): these checks vanish when Python runs with -O, because
    # assert statements are stripped in optimized mode.
    try:
        assert not re_census.match(item)
        assert 'passportapplicat' not in item
        assert len(full_rec.keys()) != 1
    except AssertionError:
        print item
        raise
    # The MARC XML is read from a local directory tree that is keyed by the
    # first two characters of the item name, then by the full item name.
    filename = '/2/edward/20century/scans/' + item[:2] + '/' + item + '/' + item + '_marc.xml'
    # NOTE(review): the file object passed here is not explicitly closed in
    # the visible code.
    rec = read_xml.read_edition(open(filename))
    if 'full_title' not in rec:
        print "full_title missing", item
        continue
    if 'physical_format' in rec:
        # Graphic / cartographic formats are only printed; the record is not
        # skipped by this branch.
        # NOTE(review): the local name `format` shadows the builtin.
        format = rec['physical_format'].lower()
        if format.startswith('[graphic') or format.startswith('[cartograph'):
            print item, format
    # Records for which make_index_fields returns a falsy value are skipped.
    index_fields = make_index_fields(rec)
    if not index_fields:
        print "no index_fields"
        continue
    #print index_fields

    edition_pool = pool.build(index_fields)
    if not edition_pool or not any(v for v in edition_pool.itervalues()):
예제 #3
0
 # Download "<ia>_marc.xml" from below `base`, parse it with
 # read_xml.read_edition and return the parsed result.  The statements after
 # the parse attempt only run when parsing raised BadXML or ExpatError.
 # if there is a problem with the XML switch to the binary MARC
 xml_file = ia + "_marc.xml"
 loc = ia + "/" + xml_file
 try:
     print base + loc
     f = urlopen_keep_trying(base + loc)
 except urllib2.HTTPError, error:
     # An HTTP 404 is translated into NoMARCXML; every other HTTP error is
     # printed and re-raised unchanged.
     if error.code == 404:
         raise NoMARCXML
     else:
         print 'error:', error.code, error.msg
         raise
 # NOTE(review): with assertions enabled this assert makes the `if f:` test
 # below redundant; under `python -O` the assert is removed and only the
 # `if` remains.
 assert f
 if f:
     try:
         return read_xml.read_edition(f)
     except read_xml.BadXML:
         # Reported, then ignored so execution falls through to the binary
         # MARC code below.  (The trailing `pass` is redundant after print.)
         print "read_xml BADXML"
         pass
     except xml.parsers.expat.ExpatError:
         #print 'IA:', `ia`
         #print 'XML parse error:', base + loc
         print "read_xml ExpatError"
         pass
 print base + loc
 # Request the same URL a second time and look for the "Page Not Found"
 # title in at most the first 200 bytes of the body; a match is reported as
 # NoMARCXML.  Presumably this covers a not-found page that was served
 # without a 404 status -- TODO confirm.
 if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(
         base + loc).read(200):
     raise NoMARCXML
 # Fall back to the "<ia>_meta.mrc" file located under the same item path.
 url = base + ia + "/" + ia + "_meta.mrc"
 print url
 try:
예제 #4
0
 # Obtain "<ia>_marc.xml", parse it with read_xml.read_edition and return
 # the pair (loc, parsed result).  The statements after the parse attempt
 # only run when parsing raised BadXML or ExpatError.
 xml_file = ia + "_marc.xml"
 loc = ia + "/" + xml_file
 # A file already present under `xml_path` is used instead of downloading.
 if os.path.exists(xml_path + xml_file):
     f = open(xml_path + xml_file)
 else:
     try:
         f = urlopen_keep_trying(base + loc)
     except urllib2.HTTPError, error:
         # An HTTP 404 is translated into NoMARCXML; every other HTTP error
         # is printed and re-raised unchanged.
         if error.code == 404:
             raise NoMARCXML
         else:
             print 'error:', error.code, error.msg
             raise
 if f:
     try:
         return loc, read_xml.read_edition(f)
     except read_xml.BadXML:
         # Ignored so that execution falls through to the binary MARC code.
         pass
     except xml.parsers.expat.ExpatError:
         #print 'IA:', `ia`
         #print 'XML parse error:', base + loc
         pass
 # Look for the "Page Not Found" title in at most the first 200 bytes served
 # at base + loc; a match is reported as NoMARCXML.
 # NOTE(review): this probe always goes to the network, even when the XML
 # that failed to parse was read from the local file above.
 if '<title>Internet Archive: Page Not Found</title>' in urllib2.urlopen(base + loc).read(200):
     raise NoMARCXML
 # Fall back to the "<ia>_meta.mrc" file located under the same item path.
 url = base + ia + "/" + ia + "_meta.mrc"
 print url
 try:
     f = urlopen_keep_trying(url)
 except urllib2.URLError:
     # NOTE(review): the error is swallowed and `f` is not reassigned, so it
     # still refers to the XML file/response opened earlier.
     pass
 if not f:
예제 #5
0
            # Tail of a branch whose condition lies above this excerpt: count
            # the item and skip to the next one.
            load_count += 1
            continue
        # A non-empty result from querying `ol` for an edition whose
        # source_records contains 'ia:' + ia is treated as already loaded.
        if ol.query({'type': '/type/edition', 'source_records': 'ia:' + ia}):
            print 'already loaded'
            load_count += 1
            continue
    # Sanity checks on the item; on failure the offending item is printed
    # before the AssertionError is re-raised.
    # NOTE(review): these checks vanish when Python runs with -O, because
    # assert statements are stripped in optimized mode.
    try:
        assert not re_census.match(item)
        assert 'passportapplicat' not in item
        assert len(full_rec.keys()) != 1
    except AssertionError:
        print item
        raise
    # The MARC XML is read from a local directory tree that is keyed by the
    # first two characters of the item name, then by the full item name.
    filename = '/2/edward/20century/scans/' + item[:
                                                   2] + '/' + item + '/' + item + '_marc.xml'
    # NOTE(review): the file object passed here is not explicitly closed in
    # the visible code.
    rec = read_xml.read_edition(open(filename))
    if 'full_title' not in rec:
        print "full_title missing", item
        continue
    if 'physical_format' in rec:
        # Graphic / cartographic formats are only printed; the record is not
        # skipped by this branch.
        # NOTE(review): the local name `format` shadows the builtin.
        format = rec['physical_format'].lower()
        if format.startswith('[graphic') or format.startswith('[cartograph'):
            print item, format
    # Records for which make_index_fields returns a falsy value are skipped.
    index_fields = make_index_fields(rec)
    if not index_fields:
        print "no index_fields"
        continue
    #print index_fields

    edition_pool = pool.build(index_fields)
    if not edition_pool or not any(v for v in edition_pool.itervalues()):