예제 #1
0
def try_merge(edition, ekey, thing):
    """Report whether the Amazon record ``edition`` matches the edition ``thing``.

    ``edition`` is a dict-like Amazon record; its ISBN-10 (or, failing that,
    its ``asin`` entry) identifies it.  ``thing`` must be of type
    ``/type/edition``.  If ``thing`` carries ``source_records`` the decision
    is made from those; otherwise the value returned by ``get_mc(ekey)`` is
    used to locate a local MARC record to compare against.

    Returns True when ``thing`` is already tied to this Amazon record,
    False when ``get_mc`` yields nothing, and otherwise whatever
    ``source_records_match`` / ``amazon_merge.attempt_merge`` return.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        # Debug output for records identified by ASIN only.
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    author_names = [author['name'] for author in edition.get('authors', [])]
    amazon_rec = amazon_merge.build_amazon(edition, author_names)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'

    source_id = 'amazon:' + asin
    if 'source_records' in thing:
        if source_id in thing['source_records']:
            return True
        return source_records_match(amazon_rec, thing)

    # No source records on the edition: fall back to get_mc().
    mc = get_mc(ekey)
    if not mc:
        return False
    if mc == source_id:
        return True
    marc_rec = fast_parse.read_edition(get_from_local(mc))
    e1 = build_marc(marc_rec)
    return amazon_merge.attempt_merge(amazon_rec, e1, threshold, debug=False)
예제 #2
0
def test_read_oclc():
    """Check that an 'oclc' entry is produced for the sample MARC record.

    Both ``index_fields`` (restricted to a handful of tags) and
    ``read_edition`` must report an ``'oclc'`` key for each fixture file.
    """
    # DEPRECATED data was 'oregon_27194315', triggers exception
    for f in ('scrapbooksofmoun03tupp_meta.mrc', ):
        # Close the fixture file deterministically instead of leaking the
        # handle until garbage collection.
        with open(test_data + f) as marc_file:
            data = marc_file.read()
        i = index_fields(data, ['001', '003', '010', '020', '035', '245'])
        assert 'oclc' in i
        e = read_edition(data)
        assert 'oclc' in e
예제 #3
0
def marc_match(e1, loc):
    """Compare ``e1`` against the locally stored MARC record at ``loc``.

    The record is fetched with ``get_from_local``, parsed by
    ``fast_parse.read_edition`` and converted with ``build_marc``.  The
    result of ``attempt_merge`` is returned.

    Raises:
        TypeError: re-raised from ``build_marc`` after printing the parsed
            record for debugging.
    """
    rec = fast_parse.read_edition(get_from_local(loc))
    try:
        e2 = build_marc(rec)
    except TypeError:
        # Single-argument print call: same output on Python 2, and valid
        # syntax on Python 3 (the print statement is not).
        print(rec)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
예제 #4
0
def marc_match(e1, loc):
    """Compare ``e1`` against the archived MARC record at ``loc``.

    Prints the location and the parsed record as debug output, then
    returns the result of ``attempt_merge``.  A ``TypeError`` from
    ``build_marc`` is re-raised after the parsed record is printed.
    """
    print('loc:', loc)
    raw = get_from_archive(loc)
    parsed = fast_parse.read_edition(raw)
    print('rec:', parsed)
    try:
        candidate = build_marc(parsed)
    except TypeError:
        print(parsed)
        raise
    else:
        return attempt_merge(e1, candidate, threshold, debug=False)
예제 #5
0
def marc_match(e1, loc):
    """Try to merge ``e1`` with the MARC record stored in the archive at ``loc``.

    Debug output: the location and the parsed record are printed.  The
    return value is whatever ``attempt_merge`` returns; a ``TypeError``
    raised by ``build_marc`` propagates after the record is printed.
    """
    print('loc:', loc)
    marc_bytes = get_from_archive(loc)
    marc_rec = fast_parse.read_edition(marc_bytes)
    print('rec:', marc_rec)
    try:
        other = build_marc(marc_rec)
    except TypeError:
        print(marc_rec)
        raise
    else:
        return attempt_merge(e1, other, threshold, debug=False)
예제 #6
0
def get_marc_ia(ia):
    """Download and return the raw ``<ia>_meta.mrc`` record for item ``ia``.

    The first five characters of the record are read as its declared
    length.  When the downloaded size disagrees, the data is round-tripped
    through UTF-8 decode / ``raw_unicode_escape`` encode before the length
    is checked again (presumably to undo a double encoding -- confirm).

    Returns:
        The raw record data, not a parsed edition.

    Raises:
        AssertionError: if the length still mismatches or the payload
            contains an Internet Archive error page.
    """
    ia = ia.strip()  # identifiers may carry stray whitespace, e.g. 'cyclopdiaofedu00kidd '
    url = base + ia + "/" + ia + "_meta.mrc"
    data = urlopen_keep_trying(url).read()
    length = int(data[0:5])
    if len(data) != length:
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == length

    assert 'Internet Archive: Error' not in data
    # %-formatting keeps the output identical on Python 2 while being
    # valid syntax on Python 3.
    print('leader: %s' % data[:24])
    # The original had a second, unreachable return that parsed the record
    # with fast_parse.read_edition; callers have always received raw data.
    return data
예제 #7
0
def get_record(key, mc):
    """Load the archived MARC record ``mc`` and build an edition from it.

    Returns False (after printing ``mc`` and ``key``) when parsing raises
    ``fast_parse.SoundRecording``, ``IndexError`` or ``AssertionError``.
    Otherwise returns the result of ``marc.build_marc``; a ``TypeError``
    from that call is re-raised after the parsed record is printed.
    """
    raw = get_from_archive(mc)
    try:
        parsed = fast_parse.read_edition(raw)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        for ident in (mc, key):
            print(ident)
        return False

    try:
        built = marc.build_marc(parsed)
    except TypeError:
        print(parsed)
        raise
    return built
예제 #8
0
def get_marc_ia(ia):
    """Fetch the raw MARC binary (``<ia>_meta.mrc``) for archive item ``ia``.

    The leading five characters give the record's declared length.  If the
    downloaded size differs, the data is decoded as UTF-8 and re-encoded
    with ``raw_unicode_escape`` before the length is asserted (looks like a
    repair for doubly encoded records -- confirm).

    Returns:
        The raw record data; no parsing is performed here.

    Raises:
        AssertionError: on a length mismatch after the repair attempt, or
            when the payload contains 'Internet Archive: Error'.
    """
    ia = ia.strip()  # tolerate padded identifiers such as 'cyclopdiaofedu00kidd '
    url = base + ia + "/" + ia + "_meta.mrc"
    data = urlopen_keep_trying(url).read()
    length = int(data[0:5])
    if len(data) != length:
        data = data.decode('utf-8').encode('raw_unicode_escape')
    assert len(data) == length

    assert 'Internet Archive: Error' not in data
    # Single-argument print call: same text on Python 2, valid on Python 3.
    print('leader: %s' % data[:24])
    # A trailing, unreachable `return fast_parse.read_edition(...)` was
    # removed; the function has always returned the raw data.
    return data
예제 #9
0
         try:
             marc_marc_data = marc_data.decode('utf-8').encode(
                 'raw_unicode_escape')
         except:
             bad_binary = "double UTF-8 decode error"
     if not bad_binary and len(marc_data) != length:
         bad_binary = 'MARC length mismatch: %d != %d' % (
             len(marc_data), length)
     if not bad_binary and 'Internet Archive: Error' in marc_data:
         bad_binary = 'Internet Archive: Error'
     if not bad_binary:
         if str(marc_data)[6:8] != 'am':  # only want books
             print('not a book!')
             continue
         try:
             rec = fast_parse.read_edition(marc_data,
                                           accept_electronic=True)
         except:
             bad_binary = "MARC parse error"
 if bad_binary and not formats['xml']:
     load_error_mail(ia, bad_binary, 'bad MARC binary, no MARC XML')
     continue
 if not use_binary and formats['xml']:
     if bad_ia_xml(ia) and bad_binary:
         load_error_mail(ia, bad_binary,
                         'bad MARC binary, bad MARC XML')
         continue
     try:
         rec = get_ia(ia)
     except (KeyboardInterrupt, NameError):
         raise
     except NoMARCXML:
예제 #10
0
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` matches the existing edition ``thing``.

    The comparison record is located in this order:

    1. If ``thing`` has ``source_records``, the decision is delegated to
       ``source_records_match`` (after an optional fix-up and reload).
    2. Otherwise an Internet Archive identifier is taken from
       ``thing['ocaid']`` or derived from the value of
       ``get_mc(edition_key)``, and the record is fetched with ``get_ia``.
    3. With no IA identifier, the ``get_mc`` value is used directly: an
       ``amazon:`` value goes through ``try_amazon`` /
       ``amazon.attempt_merge``, anything else is read with
       ``get_from_archive``.

    Returns False for deleted things and for most lookup failures; returns
    True when an IA identifier exists but no record could be loaded, or
    when ``get_from_archive`` returns nothing.  Otherwise returns the
    result of the relevant ``attempt_merge`` call.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor a delete,
            or if ``get_ia`` fails with an HTTP status other than 404/403.
        TypeError: re-raised from ``build_marc`` after printing the record.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key)  # reload
        return source_records_match(e1, thing)

    ia = thing.get('ocaid', None)
    print(edition_key)
    # mc is treated below as a source locator string (prefixes 'ia:' and
    # 'amazon:', or a path ending in .xml/.mrc) -- presumably the
    # edition's machine comment; verify against get_mc.
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # Take everything before the first '/'.
            # NOTE(review): if mc has no '/', find() returns -1 and this
            # silently drops the last character instead.
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # For *_meta.mrc locators the identifier must come from the
            # edition's own ocaid; this overrides the value derived above.
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            print(error.code)
            # Only "not found" / "forbidden" are tolerated; any other
            # status aborts via AssertionError.
            assert error.code in (404, 403)
        if not rec2:
            # An IA identifier exists but no record could be loaded: this
            # is reported as a match.
            return True
    if not rec2:
        # Only reached when there was no IA identifier at all.
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # Dump the inputs for debugging, then propagate unchanged.
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            # An AssertionError from this check is caught just below and
            # turned into a False result.
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            # Unexpected failure: log the identifiers and re-raise.
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
예제 #11
0
def load_part(archive_id, part, start_pos=0):
    """Yield ``(loc, data)`` for records in a MARC part that match no edition.

    Reads ``rc['marc_path']/archive_id/part`` (optionally starting at byte
    offset ``start_pos``) through ``read_marc_file``.  For each record:

    * records for which ``is_loaded(loc)`` is true, records rejected with
      ``fast_parse.NotBook``, and records without an indexed ``'title'``
      are skipped;
    * if ``pool.build`` returns no candidates the record is yielded;
    * otherwise each candidate edition (following redirects) is tried with
      ``try_merge``; on the first match ``add_source_records`` is called,
      and if nothing matches the record is yielded.

    Increments the module-level ``rec_no`` counter and calls ``progress``
    every ``chunk`` records.

    Raises:
        KeyError, AssertionError: re-raised from
            ``fast_parse.index_fields`` after printing the location.
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # The context manager closes the file when the generator finishes or
    # is closed, instead of leaking the handle.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # Only the candidate lists are needed; values() works on both
            # Python 2 and 3, whereas iteritems() is Python 2 only.
            for candidates in edition_pool.values():
                for edition_key in candidates:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects until a non-redirect thing is found.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' %
                                  (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
예제 #12
0
         if not a:
             return False
         try:
             return amazon.attempt_merge(a, e1, threshold, debug=False)
         except:
             print a
             print e1
             print thing['key']
             raise
     print 'mc:', mc
     try:
         assert not mc.startswith('ia:')
         data = get_from_local(mc)
         if not data:
             return True
         rec2 = fast_parse.read_edition(data)
     except (fast_parse.SoundRecording, IndexError, AssertionError):
         print mc
         print edition_key
         return False
     except:
         print mc
         print edition_key
         raise
 if not rec2:
     return False
 try:
     e2 = build_marc(rec2)
 except TypeError:
     print rec2
     raise
예제 #13
0
    if not f:
        return None
    data = f.read()
    length = data[0:5]
    loc = ia + "/" + ia + "_meta.mrc:0:" + length
    if len(data) == 0:
        print 'zero length MARC for', url
        return None
    if 'Internet Archive: Error' in data:
        print 'internet archive error for', url
        return None
    if data.startswith('<html>\n<head>'):
        print 'internet archive error for', url
        return None
    try:
        return fast_parse.read_edition(data, accept_electronic = True)
    except (ValueError, AssertionError, fast_parse.BadDictionary):
        print `data`
        raise

def files(archive_id):
    url = base + archive_id + "/" + archive_id + "_files.xml"
    for i in range(5):
        try:
            tree = etree.parse(urlopen_keep_trying(url))
            break
        except xml.parsers.expat.ExpatError:
            sleep(2)
    try:
        tree = etree.parse(urlopen_keep_trying(url))
    except:
예제 #14
0
def marc_match(a, loc):
    """Compare Amazon record ``a`` with the local MARC record at ``loc``.

    ``loc`` must be truthy.  The record is read with ``get_from_local``,
    parsed and converted with ``build_marc``; the result of
    ``amazon_merge.attempt_merge`` is returned.
    """
    assert loc
    raw = get_from_local(loc)
    e1 = build_marc(fast_parse.read_edition(raw))
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
예제 #15
0
def try_merge(e1, edition_key, thing):
    """Decide whether record ``e1`` matches the existing edition ``thing``.

    The comparison record is located in this order:

    1. If ``thing`` has ``source_records``, the decision is delegated to
       ``source_records_match`` (after an optional fix-up and reload).
    2. Otherwise an Internet Archive identifier is taken from
       ``thing['ocaid']`` or derived from the value of
       ``get_mc(edition_key)``, and the record is fetched with ``get_ia``.
    3. With no IA identifier, the ``get_mc`` value is used directly: an
       ``amazon:`` value goes through ``try_amazon`` /
       ``amazon.attempt_merge``, anything else is read with
       ``get_from_archive``.

    Returns False for deleted things and for most lookup failures; returns
    True when an IA identifier exists but no record could be loaded, or
    when ``get_from_archive`` returns nothing.  Otherwise returns the
    result of the relevant ``attempt_merge`` call.

    Raises:
        AssertionError: if ``thing`` is neither an edition nor a delete,
            or if ``get_ia`` fails with an HTTP status other than 404/403.
        TypeError: re-raised from ``build_marc`` after printing the record.
    """
    thing_type = thing['type']
    if thing_type != Reference('/type/edition'):
        print(thing['key'], 'is', str(thing['type']))
    if thing_type == Reference('/type/delete'):
        return False
    assert thing_type == Reference('/type/edition')

    if 'source_records' in thing:
        if fix_source_records(edition_key, thing):
            thing = withKey(edition_key) # reload
        return source_records_match(e1, thing)

    ia = thing.get('ocaid', None)
    print(edition_key)
    # mc is treated below as a source locator string (prefixes 'ia:' and
    # 'amazon:', or a path ending in .xml/.mrc) -- presumably the
    # edition's machine comment; verify against get_mc.
    mc = get_mc(edition_key)
    print(mc)
    if mc:
        if mc.startswith('ia:'):
            ia = mc[3:]
        elif mc.endswith('.xml') or mc.endswith('.mrc'):
            # Take everything before the first '/'.
            # NOTE(review): if mc has no '/', find() returns -1 and this
            # silently drops the last character instead.
            ia = mc[:mc.find('/')]
        if '_meta.mrc:' in mc:
            # For *_meta.mrc locators the identifier must come from the
            # edition's own ocaid; this overrides the value derived above.
            print(thing)
            if 'ocaid' not in thing:
                return False
            ia = thing['ocaid']
    rec2 = None
    if ia:
        if is_dark_or_bad(ia):
            return False
        try:
            rec2 = get_ia(ia)
        except xml.parsers.expat.ExpatError:
            return False
        except NoMARCXML:
            print('no MARCXML')
            pass
        except urllib2.HTTPError as error:
            print(error.code)
            # Only "not found" / "forbidden" are tolerated; any other
            # status aborts via AssertionError.
            assert error.code in (404, 403)
        if not rec2:
            # An IA identifier exists but no record could be loaded: this
            # is reported as a match.
            return True
    if not rec2:
        # Only reached when there was no IA identifier at all.
        if not mc:
            mc = get_mc(thing['key'])
        if not mc or mc == 'initial import':
            return False
        if mc.startswith('amazon:'):
            try:
                a = try_amazon(thing)
            except IndexError:
                print(thing['key'])
                raise
            except AttributeError:
                return False
            if not a:
                return False
            try:
                return amazon.attempt_merge(a, e1, threshold, debug=False)
            except:
                # Dump the inputs for debugging, then propagate unchanged.
                print(a)
                print(e1)
                print(thing['key'])
                raise
        print('mc:', mc)
        try:
            # An AssertionError from this check is caught just below and
            # turned into a False result.
            assert not mc.startswith('ia:')
            data = get_from_archive(mc)
            if not data:
                return True
            rec2 = fast_parse.read_edition(data)
        except (fast_parse.SoundRecording, IndexError, AssertionError):
            print(mc)
            print(edition_key)
            return False
        except:
            # Unexpected failure: log the identifiers and re-raise.
            print(mc)
            print(edition_key)
            raise
    if not rec2:
        return False
    try:
        e2 = build_marc(rec2)
    except TypeError:
        print(rec2)
        raise
    return attempt_merge(e1, e2, threshold, debug=False)
예제 #16
0
    if not f:
        return None
    data = f.read()
    length = data[0:5]
    loc = ia + "/" + ia + "_meta.mrc:0:" + length
    if len(data) == 0:
        print 'zero length MARC for', url
        return None
    if 'Internet Archive: Error' in data:
        print 'internet archive error for', url
        return None
    if data.startswith('<html>\n<head>'):
        print 'internet archive error for', url
        return None
    try:
        return fast_parse.read_edition(data, accept_electronic=True)
    except (ValueError, AssertionError, fast_parse.BadDictionary):
        print ` data `
        raise


def files(archive_id):
    url = base + archive_id + "/" + archive_id + "_files.xml"
    for i in range(5):
        try:
            tree = etree.parse(urlopen_keep_trying(url))
            break
        except xml.parsers.expat.ExpatError:
            sleep(2)
    try:
        tree = etree.parse(urlopen_keep_trying(url))
예제 #17
0
def load_part(archive_id, part, start_pos=0):
    """Yield ``(loc, data)`` for records in a MARC part that match no edition.

    Reads ``rc['marc_path']/archive_id/part`` (optionally starting at byte
    offset ``start_pos``) through ``read_marc_file``.  For each record:

    * records for which ``is_loaded(loc)`` is true, records rejected with
      ``fast_parse.NotBook``, and records without an indexed ``'title'``
      are skipped;
    * if ``pool.build`` returns no candidates the record is yielded;
    * otherwise each candidate edition (following redirects) is tried with
      ``try_merge``; on the first match ``add_source_records`` is called,
      and if nothing matches the record is yielded.

    Increments the module-level ``rec_no`` counter and calls ``progress``
    every ``chunk`` records.

    Raises:
        KeyError, AssertionError: re-raised from
            ``fast_parse.index_fields`` after printing the location.
    """
    # All print calls take one argument, so the output is unchanged on
    # Python 2 and the syntax is valid on Python 3.
    print('load_part: %s %s' % (archive_id, part))
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # The context manager closes the file when the generator finishes or
    # is closed, instead of leaking the handle.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # Only the candidate lists are needed; values() works on both
            # Python 2 and 3, whereas iteritems() is Python 2 only.
            for candidates in edition_pool.values():
                for edition_key in candidates:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects until a non-redirect thing is found.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s'
                                  % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
예제 #18
0
             bad_binary = "MARC doesn't start with number"
     if not bad_binary and len(marc_data) != length:
         try:
             marc_marc_data = marc_data.decode('utf-8').encode('raw_unicode_escape')
         except:
             bad_binary = "double UTF-8 decode error"
     if not bad_binary and len(marc_data) != length:
         bad_binary = 'MARC length mismatch: %d != %d' % (len(marc_data), length)
     if not bad_binary and 'Internet Archive: Error' in marc_data:
         bad_binary = 'Internet Archive: Error'
     if not bad_binary:
         if str(marc_data)[6:8] != 'am': # only want books
             print 'not a book!'
             continue
         try:
             rec = fast_parse.read_edition(marc_data, accept_electronic = True)
         except:
             bad_binary = "MARC parse error"
 if bad_binary and not formats['xml']:
     load_error_mail(ia, bad_binary, 'bad MARC binary, no MARC XML')
     continue
 if not use_binary and formats['xml']:
     if bad_ia_xml(ia) and bad_binary:
         load_error_mail(ia, bad_binary, 'bad MARC binary, bad MARC XML')
         continue
     try:
         rec = get_ia(ia)
     except (KeyboardInterrupt, NameError):
         raise
     except NoMARCXML:
         write_log(ia, when, "no MARCXML")
예제 #19
0
         if not a:
             return False
         try:
             return amazon.attempt_merge(a, e1, threshold, debug=False)
         except:
             print a
             print e1
             print thing['key']
             raise
     print 'mc:', mc
     try:
         assert not mc.startswith('ia:')
         data = get_from_archive(mc)
         if not data:
             return True
         rec2 = fast_parse.read_edition(data)
     except (fast_parse.SoundRecording, IndexError, AssertionError):
         print mc
         print edition_key
         return False
     except:
         print mc
         print edition_key
         raise
 if not rec2:
     return False
 try:
     e2 = build_marc(rec2)
 except TypeError:
     print rec2
     raise