Пример #1
0
def import_file(filename):
    """Attempt to merge every edition read from *filename* into pooled records.

    For each ``(asin, edition)`` pair produced by ``read_amazon_file``, the
    index fields are built, ``pool.build`` is asked for candidate edition
    keys, and ``try_merge`` is called once per distinct candidate (redirects
    are resolved through ``follow_redirects``). The value returned by
    ``try_merge`` is not used here.

    Args:
        filename: Path of the file handed to ``read_amazon_file``.

    Raises:
        KeyError: If the pool result has no ``'title'`` entry (diagnostics
            are printed first), or no ``'isbn'`` entry when the title pool
            is empty.
        AssertionError: If ``follow_redirects`` returns a falsy thing.
        Exception: Anything raised by ``try_merge`` is re-raised after the
            offending record has been printed.
    """
    # The context manager guarantees the input file is closed, including
    # when a merge failure propagates out of the loop.
    with open(filename) as source:
        for asin, edition in read_amazon_file(source):
            index_fields = build_index_fields(asin, edition)
            found = pool.build(index_fields)
            if 'title' not in found:
                # Dump the record for debugging; the lookup just below will
                # then fail with KeyError for this same record.
                print(found)
                print(asin)
                print(edition)
                print(index_fields)
                print()

            if not found['title'] and not found['isbn']:
                # TODO load book
                continue

            # 'sims' is dropped from the edition before merging.
            edition.pop('sims', None)

            seen = set()
            for ekeys in found.values():
                for ekey in ekeys:
                    if ekey in seen:
                        continue
                    keys, thing = follow_redirects(ekey)
                    # Remember every key on the redirect chain so the same
                    # record is not merged twice.
                    seen.update(keys)
                    assert thing
                    try:
                        try_merge(edition, ekey, thing)
                    except Exception:
                        # Print the context of the failing merge, then let
                        # the original exception propagate.
                        print(asin)
                        print(edition)
                        print(ekey)
                        print(found)
                        raise
Пример #2
0
                if format.startswith('[graphic') or format.startswith(
                        '[cartograph'):
                    continue
            print(rec)

            if 'full_title' not in rec:
                print("full_title missing")
                write_log(ia, when, "error: full_title missing")
                continue
            index_fields = make_index_fields(rec)
            if not index_fields:
                print("no index_fields")
                write_log(ia, when, "error: no index fields")
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                load(ia, use_binary=use_binary)
                write_log(ia, when, "loaded")
                continue

            e1 = build_marc(rec)

            match = False
            seen = set()
            for k, v in edition_pool.iteritems():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
Пример #3
0
def load_part(archive_id, part, start_pos=0):
    """Yield ``(loc, data)`` for records of one MARC part that were not merged.

    The file ``rc['marc_path'] + "/" + archive_id + "/" + part`` is opened and
    read through ``read_marc_file``. Each record is processed as follows:

    * the global ``rec_no`` counter is incremented and ``progress`` is called
      every ``chunk`` records;
    * records for which ``is_loaded(loc)`` is true are skipped;
    * records that raise ``fast_parse.NotBook``, or that have no index fields
      or no ``'title'`` index field, are skipped;
    * if ``pool.build`` returns an empty pool the record is yielded;
    * otherwise ``try_merge`` is attempted against each candidate edition
      (following ``/type/redirect`` things); on the first success
      ``add_source_records`` is called, and if nothing merges the record is
      yielded.

    Args:
        archive_id: String joined with *part* by ``"/"`` to locate the file.
        part: Name of the part within the archive.
        start_pos: Offset to seek to before reading; also passed to
            ``read_marc_file`` as ``pos``.

    Yields:
        ``(loc, data)`` tuples as produced by ``read_marc_file``.

    Raises:
        KeyError, AssertionError: Re-raised from ``fast_parse.index_fields``
            after printing the record location.
        AssertionError: If ``withKey`` returns a falsy thing.
    """
    global rec_no
    # Single pre-formatted string: prints the same text under Python 2 and 3.
    print('load_part: %s %s' % (archive_id, part))
    full_part = archive_id + "/" + part
    # The context manager closes the file when the generator finishes, is
    # closed early, or an exception propagates.
    with open(rc['marc_path'] + "/" + full_part) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # Only the candidate key lists are needed; .values() also works
            # on Python 3, where dict.iteritems() no longer exists.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Follow redirects until a non-redirect thing is reached,
                    # marking every visited key as seen.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s'
                                  % (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data
Пример #4
0
    filename = '/2/edward/20century/scans/' + item[:2] + '/' + item + '/' + item + '_marc.xml'
    rec = read_xml.read_edition(open(filename))
    if 'full_title' not in rec:
        print "full_title missing", item
        continue
    if 'physical_format' in rec:
        format = rec['physical_format'].lower()
        if format.startswith('[graphic') or format.startswith('[cartograph'):
            print item, format
    index_fields = make_index_fields(rec)
    if not index_fields:
        print "no index_fields"
        continue
    #print index_fields

    edition_pool = pool.build(index_fields)
    if not edition_pool or not any(v for v in edition_pool.itervalues()):
        print >> new_book, rec
        continue

    print item, edition_pool
    e1 = build_marc(rec)
    print e1

    match = False
    seen = set()
    for k, v in edition_pool.iteritems():
        for edition_key in v:
#            edition_key = '/books/' + re_edition_key.match(edition_key).match(1)
            if edition_key in seen:
                continue
Пример #5
0
def load_part(archive_id, part, start_pos=0):
    """Yield ``(loc, data)`` for records of one MARC part that were not merged.

    Reads ``rc['marc_path'] + "/" + archive_id + "/" + part`` through
    ``read_marc_file`` and, per record:

    * increments the global ``rec_no`` and calls ``progress`` every ``chunk``
      records;
    * skips the record when ``is_loaded(loc)`` is true, when
      ``fast_parse.index_fields`` raises ``fast_parse.NotBook``, or when the
      index fields are empty or lack ``'title'``;
    * yields the record when ``pool.build`` returns an empty pool;
    * otherwise calls ``try_merge`` for each distinct candidate edition key,
      following ``/type/redirect`` things; the first successful merge
      triggers ``add_source_records``; with no successful merge the record
      is yielded.

    Args:
        archive_id: String joined with *part* by ``"/"`` to locate the file.
        part: Name of the part within the archive.
        start_pos: Offset to seek to before reading; also passed to
            ``read_marc_file`` as ``pos``.

    Yields:
        ``(loc, data)`` tuples as produced by ``read_marc_file``.

    Raises:
        KeyError, AssertionError: Re-raised from ``fast_parse.index_fields``
            after printing the record location.
        AssertionError: If ``withKey`` returns a falsy thing.
    """
    global rec_no
    print('load_part:', archive_id, part)
    full_part = archive_id + "/" + part
    marc_path = rc['marc_path'] + "/" + full_part
    # Opened in a ``with`` block so the handle is released when the generator
    # is exhausted, closed early, or an exception propagates.
    with open(marc_path) as f:
        if start_pos:
            f.seek(start_pos)
        for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
            rec_no += 1
            if rec_no % chunk == 0:
                progress(archive_id, rec_no, start_pos, pos)

            if is_loaded(loc):
                continue
            want = ['001', '003', '010', '020', '035', '245']
            try:
                index_fields = fast_parse.index_fields(data, want)
            except KeyError:
                print(loc)
                print(fast_parse.get_tag_lines(data, ['245']))
                raise
            except AssertionError:
                print(loc)
                raise
            except fast_parse.NotBook:
                continue
            if not index_fields or 'title' not in index_fields:
                continue

            print(loc)
            edition_pool = pool.build(index_fields)

            if not edition_pool:
                yield loc, data
                continue

            rec = fast_parse.read_edition(data)
            e1 = build_marc(rec)

            match = False
            seen = set()
            # dict.iteritems() does not exist on Python 3 and the key was
            # unused anyway, so iterate over the candidate lists directly.
            for edition_keys in edition_pool.values():
                for edition_key in edition_keys:
                    if edition_key in seen:
                        continue
                    thing = None
                    # Resolve redirects until a non-redirect thing is found,
                    # recording each visited key in ``seen``.
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if thing['type']['key'] == '/type/redirect':
                            print('following redirect %s => %s' %
                                  (edition_key, thing['location']))
                            edition_key = thing['location']
                    if try_merge(e1, edition_key, thing):
                        add_source_records(edition_key, loc, thing, data)
                        match = True
                        break
                if match:
                    break

            if not match:
                yield loc, data