예제 #1
0
    def test_author_contrib(self):
        """An author on one record that appears as a contrib on the other
        still scores an exact author match, and the editions merge."""
        colorado = ('University of Colorado (Boulder campus). '
                    'Dept. of Psychology.')
        rec1 = {
            'authors': [
                {'db_name': 'Bruner, Jerome S.',
                 'name': 'Bruner, Jerome S.'}],
            'full_title': ('Contemporary approaches to cognition '
                           'a symposium held at the University of Colorado.'),
            'number_of_pages': 210,
            'publish_country': 'xxu',
            'publish_date': '1957',
            'publishers': ['Harvard U.P'],
        }
        rec2 = {
            'authors': [{'db_name': colorado, 'name': colorado}],
            'contribs': [
                {'db_name': 'Bruner, Jerome S.',
                 'name': 'Bruner, Jerome S.'}],
            'full_title': ('Contemporary approaches to cognition '
                           'a symposium held at the University of Colorado'),
            'lccn': ['57012963'],
            'number_of_pages': 210,
            'publish_country': 'mau',
            'publish_date': '1957',
            'publishers': ['Harvard University Press'],
        }

        e1, e2 = build_marc(rec1), build_marc(rec2)

        assert compare_authors(e1, e2) == ('authors', 'exact match', 125)
        assert editions_match(e1, e2, 875) is True
예제 #2
0
def test_merge2():
    """An Amazon record and a MARC record for the same book (title spelled
    slightly differently, year off by one) should merge above threshold."""
    amazon = {
        'publishers': [u'Collins'],
        'isbn_10': ['0002167530'],
        'number_of_pages': 287,
        'short_title': u'sea birds britain ireland',
        'normalized_title': u'sea birds britain ireland',
        'full_title': u'Sea Birds Britain Ireland',
        'titles': [u'Sea Birds Britain Ireland', u'sea birds britain ireland'],
        'publish_date': u'1975',
        'authors': [{'name': 'Stanley Cramp', 'db_name': 'Cramp, Stanley'}],
    }
    # Fixed: key was 'publisher' (singular) here, but build_marc() and the
    # comparison code read 'publishers', so the publisher field was being
    # silently ignored in this test.
    marc = {
        'publishers': [u'Collins'],
        'isbn_10': [u'0002167530'],
        'short_title': u'seabirds of britain and i',
        'normalized_title': u'seabirds of britain and ireland',
        'full_title': u'seabirds of Britain and Ireland',
        'titles': [u'seabirds of Britain and Ireland',
                   u'seabirds of britain and ireland'],
        'publish_date': '1974',
        'authors': [{'db_name': u'Cramp, Stanley.', 'entity_type': 'person',
                     'name': u'Cramp, Stanley.',
                     'personal_name': u'Cramp, Stanley.'}],
        'source_record_loc': 'marc_records_scriblio_net/part08.dat:61449973:855',
    }
    threshold = 875
    # build_marc() will place all isbn_ types in the 'isbn' field.
    # compare_author_fields() expects all authors to have a db_name
    assert attempt_merge(build_marc(amazon), build_marc(marc), threshold, debug=True)
예제 #3
0
    def test_match_low_threshold(self):
        """A publish year off by only one still contributes a little, so the
        pair matches a low threshold but fails one point above it."""
        # build_marc() will place all isbn_ types in the 'isbn' field.
        e1 = build_marc({
            'publishers': ['Collins'],
            'isbn_10': ['0002167530'],
            'number_of_pages': 287,
            'short_title': 'sea birds britain ireland',
            'normalized_title': 'sea birds britain ireland',
            'full_title': 'Sea Birds Britain Ireland',
            'titles': ['Sea Birds Britain Ireland',
                       'sea birds britain ireland'],
            'publish_date': '1975',
            'authors': [{'name': 'Stanley Cramp',
                         'db_name': 'Cramp, Stanley'}],
        })

        e2 = build_marc({
            'publishers': ['Collins'],
            'isbn_10': ['0002167530'],
            'short_title': 'seabirds of britain and i',
            'normalized_title': 'seabirds of britain and ireland',
            'full_title': 'seabirds of Britain and Ireland',
            'titles': ['seabirds of Britain and Ireland',
                       'seabirds of britain and ireland'],
            'publish_date': '1974',
            'authors': [{'db_name': 'Cramp, Stanley.',
                         'entity_type': 'person',
                         'name': 'Cramp, Stanley.',
                         'personal_name': 'Cramp, Stanley.'}],
            'source_record_loc':
                'marc_records_scriblio_net/part08.dat:61449973:855',
        })

        threshold = 515
        assert editions_match(e1, e2, threshold, debug=True)
        assert editions_match(e1, e2, threshold + 1) is False
예제 #4
0
def try_merge(e1, edition_key, existing):
    """Compare candidate e1 against an existing edition Thing.

    Builds a comparison dict from the existing edition (resolving any
    deleted or redirected authors first) and runs the thresholded merge.
    Returns False immediately for deleted editions.
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {'full_title': existing.title}
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    for field in ('isbn', 'isbn_10', 'isbn_13', 'lccn',
                  'publish_country', 'publishers', 'publish_date'):
        if existing.get(field):
            rec2[field] = existing[field]

    if existing.authors:
        authors = []
        for a in existing.authors:
            # Resolve chains of deleted/redirected author records.
            while a.type.key in ('/type/delete', '/type/redirect'):
                if a.type.key == '/type/delete':
                    a = undelete_author(a)
                else:
                    a = web.ctx.site.get(a.location)
            assert a.type.key == '/type/author'
            assert a['name']
            authors.append({'name': a['name'], 'db_name': db_name(a)})
        rec2['authors'] = authors

    return attempt_merge(e1, build_marc(rec2), threshold, debug=False)
예제 #5
0
def try_merge(e1, edition_key, existing):
    """Compare candidate e1 against an existing edition Thing.

    Builds a comparison dict from the existing edition and runs the
    thresholded merge with debug output. Returns False for deleted editions.
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [{
        'name': a.name,
        'db_name': db_name(a)
    } for a in existing.authors]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # Fixed: key was 'publisher_date', which build_marc() never reads,
        # so the publish date was silently excluded from the comparison.
        rec2['publish_date'] = existing.publish_date

    e2 = build_marc(rec2)
    # Converted from Python 2 print statements to print() calls.
    print()
    print('e1:', e1)
    print('e2:', e2)
    return attempt_merge(e1, e2, threshold, debug=True)
예제 #6
0
def try_merge(e1, edition_key, existing):
    """
    Build a comparable dict from `existing` and run a thresholded merge
    comparison against e1 to decide whether they are the same edition.

    :param dict e1: output of build_marc() for the incoming record
    :param str edition_key: key of the existing edition
    :param Thing existing: edition object believed to match e1
    :rtype: bool
    :return: True if e1 is sufficiently the same as `existing`
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {'full_title': existing.title}
    if existing.subtitle:
        rec2['full_title'] = existing.title + ' ' + existing.subtitle
    for field in ('isbn', 'isbn_10', 'isbn_13', 'lccn',
                  'publish_country', 'publishers', 'publish_date'):
        if existing.get(field):
            rec2[field] = existing[field]

    if existing.authors:
        rec2['authors'] = []
        for author in existing.authors:
            # Follow author redirects until we reach a real record.
            while author.type.key == '/type/redirect':
                author = web.ctx.site.get(author.location)
            if author.type.key == '/type/author':
                assert author['name']
                rec2['authors'].append(
                    {'name': author['name'], 'db_name': db_name(author)})

    return attempt_merge(e1, build_marc(rec2), threshold)
예제 #7
0
File: merge.py  Project: yzou/openlibrary
def try_merge(e1, edition_key, existing):
    """Compare candidate e1 against an existing edition Thing.

    Converts `existing` into a comparison dict and runs a thresholded
    merge against e1. Deleted editions never match.

    :param dict e1: output of build_marc() for the incoming record
    :param str edition_key: key of the existing edition (unused here)
    :param Thing existing: edition object believed to match e1
    :rtype: bool
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    # Copy over the identifier/publication fields the comparison uses.
    for f in 'isbn', 'isbn_10', 'isbn_13', 'lccn', 'publish_country', 'publishers', 'publish_date':
        if existing.get(f):
            rec2[f] = existing[f]
    if existing.authors:
        rec2['authors'] = []
        for a in existing.authors:
            author_type = a.type.key
            # Resolve chains of deleted and redirected author records
            # until we land on a concrete type.
            while author_type == '/type/delete' or author_type == '/type/redirect':
                if author_type == '/type/delete':
                    a = undelete_author(a)
                    author_type = a.type.key
                    continue
                if author_type == '/type/redirect':
                    a = web.ctx.site.get(a.location)
                    author_type = a.type.key
                    continue
            assert author_type == '/type/author'
            assert a['name']
            rec2['authors'].append({'name': a['name'], 'db_name': db_name(a)})

    e2 = build_marc(rec2)
    return attempt_merge(e1, e2, threshold, debug=False)
예제 #8
0
def try_merge(edition, ekey, thing):
    """Decide whether an Amazon edition dict matches existing edition `thing`.

    Matches first on source records, then on the marc source record
    location (mc) for the edition key, falling back to a full merge
    comparison against the locally stored MARC data.
    """
    thing_type = thing['type']['key']
    if 'isbn_10' not in edition:
        print(edition)
    asin = edition.get('isbn_10', None) or edition['asin']
    authors = [author['name'] for author in edition.get('authors', [])]
    a = amazon_merge.build_amazon(edition, authors)
    assert isinstance(asin, str)
    assert thing_type == '/type/edition'

    if 'source_records' in thing:
        # Already imported from this exact Amazon record?
        if 'amazon:' + asin in thing['source_records']:
            return True
        return source_records_match(a, thing)

    # No source records: fall back to the machine-comment record location.
    mc = get_mc(ekey)
    if mc == 'amazon:' + asin:
        return True
    if not mc:
        return False
    data = get_from_local(mc)
    e1 = build_marc(fast_parse.read_edition(data))
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
예제 #9
0
    def test_compare_authors_by_statement(self):
        """A by_statement naming rec1's author yields an exact 'main' match
        even when the records' author entries differ.

        Both records must carry db_name for the comparison to run.
        """
        rec1 = {
            'full_title': 'Full Title, required',
            'authors': [{'name': 'Alistair Smith',
                         'db_name': 'Alistair Smith'}],
        }
        rec2 = {
            'full_title': 'A different Full Title, only matching authors here.',
            'authors': [{'db_name': 'National Gallery (Great Britain)',
                         'name': 'National Gallery (Great Britain)',
                         'entity_type': 'org'}],
            'by_statement': 'Alistair Smith.',
        }

        outcome = compare_authors(build_marc(rec1), build_marc(rec2))
        assert outcome == ('main', 'exact match', 125)
예제 #10
0
def test_compare_authors_by_statement():
    """A by_statement naming rec1's author yields an exact 'main' match
    even when the records' author entries differ.

    Both records must carry db_name for the comparison to run.
    """
    rec1 = {
        'full_title': 'Full Title, required',
        'authors': [{'name': 'Alistair Smith',
                     'db_name': 'Alistair Smith'}],
    }
    rec2 = {
        'full_title': 'A different Full Title, only matching authors here.',
        'authors': [{'db_name': u'National Gallery (Great Britain)',
                     'name': u'National Gallery (Great Britain)',
                     'entity_type': 'org'}],
        'by_statement': 'Alistair Smith.',
    }

    outcome = compare_authors(build_marc(rec1), build_marc(rec2))
    # Expected result taken from the amazon and merge versions of
    # compare_author; merge_marc.compare_authors() does not currently
    # take by_statement into account.
    assert outcome == ('main', 'exact match', 125)
예제 #11
0
def test_build_marc():
    """build_marc() normalizes an edition dict for matching.

    Used in add_book.load() when trying to find an existing edition match.
    """
    edition = {
        'title': 'A test title (parens)',
        # full_title is required; add_book.load() sets it before calling.
        'full_title': 'A test full title : subtitle (parens).',
        'source_records': ['ia:test-source'],
    }

    result = build_marc(edition)

    assert isinstance(result['titles'], list)
    assert result['isbn'] == []
    assert result['normalized_title'] == 'a test full title subtitle (parens)'
    assert result['short_title'] == 'a test full title subtitl'
예제 #12
0
def get_record(key, mc):
    """Fetch the raw MARC record at archive location *mc* and parse it into
    the comparison dict produced by marc.build_marc().

    :param key: edition key, used only for diagnostics on parse failure
    :param mc: machine-comment record location in the archive
    :return: build_marc() dict, or False if the data is not a parseable book
    """
    data = get_from_archive(mc)
    try:
        rec = fast_parse.read_edition(data)
    except (fast_parse.SoundRecording, IndexError, AssertionError):
        # Not a usable book record (e.g. a sound recording) or malformed:
        # log the location and key for later inspection, then give up.
        print(mc)
        print(key)
        return False
    try:
        return marc.build_marc(rec)
    except TypeError:
        # Unexpected record shape -- dump it for debugging, then re-raise.
        print(rec)
        raise
예제 #13
0
def ia_match(a, ia):
    """Check whether Amazon record *a* matches Internet Archive item *ia*.

    :param a: output of amazon_merge.build_amazon()
    :param ia: IA item identifier
    :return: False if the IA record is unavailable or lacks a full_title,
        otherwise the boolean result of the thresholded merge comparison
    """
    try:
        loc, rec = get_ia(ia)
    except urllib.error.HTTPError:
        # Item not fetchable from archive.org -- treat as no match.
        return False
    if rec is None or 'full_title' not in rec:
        return False
    try:
        e1 = build_marc(rec)
    except TypeError:
        # Unexpected record shape -- dump it for debugging, then re-raise.
        print(rec)
        raise
    return amazon_merge.attempt_merge(a, e1, threshold, debug=False)
예제 #14
0
def test_try_merge(mock_site):
    """A freshly loaded record should merge-match against itself."""
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
        'source_records': ['ia:test_item'],
    }
    reply = load(rec)
    ekey = reply['edition']['key']
    existing = mock_site.get(ekey)

    # Mirror what add_book.load() does before matching.
    rec['full_title'] = rec['title']
    candidate = build_marc(rec)
    add_db_name(candidate)

    assert try_merge(candidate, ekey, existing) is True
예제 #15
0
def test_try_merge(mock_site):
    """A freshly loaded record should merge-match against itself."""
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    ekey = load(rec)['edition']['key']
    existing = mock_site.get(ekey)

    # Mirror what add_book.load() does before matching.
    rec['full_title'] = rec['title']
    if rec.get('subtitle'):
        rec['full_title'] = rec['full_title'] + ' ' + rec['subtitle']
    candidate = build_marc(rec)
    add_db_name(candidate)

    assert try_merge(candidate, ekey, existing)
예제 #16
0
def editions_match(candidate, existing):
    """
    Converts the existing edition into a comparable dict and performs a
    thresholded comparison to decide whether they are the same.
    Used by add_book.load() -> add_book.find_match() to check whether two
    editions match.

    :param dict candidate: Output of build_marc(import record candidate)
    :param Thing existing: Edition object to be tested against candidate
    :rtype: bool
    :return: Whether candidate is sufficiently the same as the 'existing' edition
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    # FIXME: will fail if existing is a redirect.
    assert thing_type == '/type/edition'
    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    # Copy over the identifier/publication fields used by the comparison.
    for f in (
        'isbn',
        'isbn_10',
        'isbn_13',
        'lccn',
        'publish_country',
        'publishers',
        'publish_date',
    ):
        if existing.get(f):
            rec2[f] = existing[f]
    if existing.authors:
        rec2['authors'] = []
        for a in existing.authors:
            # Follow author redirects until we reach a real record.
            # Authors of any other non-author type are silently skipped.
            while a.type.key == '/type/redirect':
                a = web.ctx.site.get(a.location)
            if a.type.key == '/type/author':
                assert a['name']
                rec2['authors'].append({'name': a['name'], 'db_name': db_name(a)})
    e2 = build_marc(rec2)
    return threshold_match(candidate, e2, threshold)
예제 #17
0
def test_try_merge(mock_site):
    """A freshly loaded record should merge-match against itself."""
    rec = {
        'title': 'Test item',
        'lccn': ['123'],
        'authors': [{'name': 'Smith, John', 'birth_date': '1980'}],
    }
    reply = load(rec)
    ekey = reply['edition']['key']
    existing = mock_site.get(ekey)

    # Mirror what add_book.load() does before matching.
    rec['full_title'] = rec['title']
    if rec.get('subtitle'):
        rec['full_title'] = rec['full_title'] + ' ' + rec['subtitle']
    candidate = build_marc(rec)
    add_db_name(candidate)

    assert try_merge(candidate, ekey, existing)
예제 #18
0
def try_merge(e1, edition_key, existing):
    """Compare candidate e1 against an existing edition Thing.

    Builds a comparison dict from the existing edition and runs the
    thresholded merge with debug output. Returns False for deleted editions.
    """
    thing_type = existing.type.key
    if thing_type == '/type/delete':
        return False
    assert thing_type == '/type/edition'

    rec2 = {}
    rec2['full_title'] = existing.title
    if existing.subtitle:
        rec2['full_title'] += ' ' + existing.subtitle
    if existing.lccn:
        rec2['lccn'] = existing.lccn
    rec2['authors'] = [{'name': a.name, 'db_name': db_name(a)}
        for a in existing.authors]
    if existing.publishers:
        rec2['publishers'] = existing.publishers
    if existing.publish_date:
        # Fixed: key was 'publisher_date', which build_marc() never reads,
        # so the publish date was silently excluded from the comparison.
        rec2['publish_date'] = existing.publish_date

    e2 = build_marc(rec2)
    # Converted from Python 2 print statements to print() calls.
    print()
    print('e1:', e1)
    print('e2:', e2)
    return attempt_merge(e1, e2, threshold, debug=True)
예제 #19
0
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title
        * source_records

    :raises RequiredField: if title or source_records is missing
    :return: dict describing the matched/created edition and work,
        suitable for a JSON HTTP response
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # NOTE(review): basestring is Python 2 only -- this snippet predates
    # the Python 3 port.
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]

    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)

    #matches = set(item for sublist in edition_pool.values() for item in sublist)
    #if len(matches) == 1:
    #    return {'success': True, 'edition': {'key': list(matches)[0]}}

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Fall back to the fuzzy, thresholded comparison.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)

        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = False
    need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    if e.works:
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = True
        need_work_save = True
        need_edition_save = True
        w = {
            'type': {
                'key': '/type/work'
            },
            'title': get_title(rec),
            'key': web.ctx.site.new_key('/type/work'),
        }
        #TODO: add edition covers and author to new work
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition, and work, if needed
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True
            if not w.get('covers'):
                w['covers'] = [cover_id]
                need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    # add values to edition lists
    for f in 'source_records', 'local_id', 'ia_box_id', 'ia_loaded_id':
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {
            'key': match,
            'status': 'matched'
        },
        'work': {
            'key': w['key'],
            'status': 'matched'
        },
    }
    # Persist only what actually changed, and report it in the reply.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    return reply
예제 #20
0
def marc_match(a, loc):
    """Merge-compare Amazon record *a* against the MARC record at *loc*."""
    assert loc
    candidate = build_marc(fast_parse.read_edition(get_from_local(loc)))
    return amazon_merge.attempt_merge(a, candidate, threshold, debug=False)
예제 #21
0
def load(rec, account_key=None):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title: str
        * source_records: list

    :param dict rec: Edition record to add
    :rtype: dict
    :return: a dict to be converted into a JSON HTTP response, same as load_data()
    """
    required_fields = ['title', 'source_records'
                       ]  # ['authors', 'publishers', 'publish_date']
    for field in required_fields:
        if not rec.get(field):
            raise RequiredField(field)
    if not isinstance(rec['source_records'], list):
        rec['source_records'] = [rec['source_records']]

    # Split subtitle if required and not already present
    if ':' in rec.get('title') and not rec.get('subtitle'):
        title, subtitle = split_subtitle(rec.get('title'))
        if subtitle:
            rec['title'] = title
            rec['subtitle'] = subtitle

    rec = normalize_record_isbns(rec)

    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec, account_key=account_key)

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Fall back to the fuzzy, thresholded comparison.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec, account_key=account_key)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # check for, and resolve, author redirects.
    # Fixed: the previous version removed from / appended to e.authors while
    # iterating it, which can skip entries; rebuild the list instead.
    resolved_authors = []
    for a in e.authors:
        while is_redirect(a):
            a = web.ctx.site.get(a.location)
        resolved_authors.append(a)
    e.authors = resolved_authors

    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.get_covers():
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key, account_key=account_key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work, if needed
    if not w.get('covers') and e.get_covers():
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add description to work, if needed
    if not w.get('description') and e.get('description'):
        w['description'] = e['description']
        need_work_save = True

    # Add authors to work, if needed
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        w['authors'] = [{
            'type': {
                'key': '/type/author_role'
            },
            'author': a.key
        } for a in authors if a.get('key')]
        if w.get('authors'):
            need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    # Add list fields to edition as needed
    edition_fields = [
        'local_id',
        'lccn',
        'lc_classifications',
        'source_records',
    ]
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {
            'key': match,
            'status': 'matched'
        },
        'work': {
            'key': w['key'],
            'status': 'matched'
        },
    }
    # Persist only what actually changed, and report it in the reply.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits,
                               comment='import existing book',
                               action='edit-book')
    if 'ocaid' in rec:
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
예제 #22
0
                write_log(ia, when, "error: full_title missing")
                continue
            index_fields = make_index_fields(rec)
            if not index_fields:
                print("no index_fields")
                write_log(ia, when, "error: no index fields")
                continue

            edition_pool = pool.build(index_fields)

            if not edition_pool:
                load(ia, use_binary=use_binary)
                write_log(ia, when, "loaded")
                continue

            e1 = build_marc(rec)

            match = False
            seen = set()
            for k, v in edition_pool.iteritems():
                for edition_key in v:
                    if edition_key in seen:
                        continue
                    thing = None
                    found = True
                    while not thing or thing['type']['key'] == '/type/redirect':
                        seen.add(edition_key)
                        thing = withKey(edition_key)
                        assert thing
                        if 'type' not in thing:
                            print(thing)
예제 #23
0
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title
        * source_records

    :raises RequiredField: if title or source_records is missing
    :return: dict describing the matched/created edition and work,
        suitable for a JSON HTTP response
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # NOTE(review): basestring is Python 2 only -- this snippet predates
    # the Python 3 port.
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]

    edition_pool = build_pool(rec)
    if not edition_pool:
        return load_data(rec)  # 'no books in pool, loading'

    #matches = set(item for sublist in edition_pool.values() for item in sublist)
    #if len(matches) == 1:
    #    return {'success': True, 'edition': {'key': list(matches)[0]}}

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Fall back to the fuzzy, thresholded comparison.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)

        match = find_match(e1, edition_pool)

    if not match:  # 'match found:', match, rec['ia']
        return load_data(rec)

    # We have an edition match at this point.
    need_work_save = False
    need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    if e.works:
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work: create one.
        work_created = True
        need_work_save = True
        need_edition_save = True
        w = {
            'type': {
                'key': '/type/work'
            },
            'title': get_title(rec),
            'key': web.ctx.site.new_key('/type/work'),
        }
        e.works = [{'key': w['key']}]

    reply = {
        'success': True,
        'edition': {
            'key': match,
            'status': 'matched'
        },
        'work': {
            'key': w['key'],
            'status': 'matched'
        },
    }

    # Append any new source records to the edition.
    if not e.get('source_records'):
        e['source_records'] = []
    existing_source_records = set(e['source_records'])
    for i in rec['source_records']:
        if i not in existing_source_records:
            e['source_records'].append(i)
            need_edition_save = True
    assert e['source_records']

    edits = []
    # NOTE: deliberately disabled author-merging branch (dead code kept
    # for reference).
    if False and rec.get('authors'):
        reply['authors'] = []
        east = east_in_by_statement(rec)
        work_authors = list(w.get('authors', []))
        edition_authors = list(e.authors)
        author_in = [import_author(a, eastern=east) for a in rec['authors']]
        for a in author_in:
            new_author = 'key' not in a
            add_to_work = False
            add_to_edition = False
            if new_author:
                a['key'] = web.ctx.site.new_key('/type/author')
                assert isinstance(a, dict)
                edits.append(a)
                add_to_work = True
                add_to_edition = True
            else:
                if not any(i['author'] == a for i in work_authors):
                    add_to_work = True
                if all(i['key'] != a['key'] for i in edition_authors):
                    add_to_edition = True
            if add_to_work:
                need_work_save = True
                work_authors.append({
                    'type': {
                        'key': '/type/author_role'
                    },
                    'author': {
                        'key': a['key']
                    },
                })
            if add_to_edition:
                need_edition_save = True
                edition_authors.append({'key': a['key']})

            reply['authors'].append({
                'key':
                a['key'],
                'name':
                a['name'],
                'status': ('created' if new_author else 'modified'),
            })
        w['authors'] = work_authors
        e['authors'] = edition_authors
    # Add subjects to the work, if not already present.
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects
    # Add ocaid to the edition, if needed.
    if 'ocaid' in rec:
        new = 'ia:' + rec['ocaid']
        if not e.ocaid:
            e['ocaid'] = rec['ocaid']
            need_edition_save = True
    # Add cover to edition, and work if the work has none.
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True
            if not w.get('covers'):
                w['covers'] = [cover_id]
                need_work_save = True
    # Merge IA identifier list fields onto the edition.
    for f in 'ia_box_id', 'ia_loaded_id':
        if f not in rec:
            continue
        if e.get(f):
            assert not isinstance(e[f], basestring)
            assert isinstance(e[f], list)
            if isinstance(rec[f], basestring):
                if rec[f] not in e[f]:
                    e[f].append(rec[f])
                    need_edition_save = True
            else:
                assert isinstance(rec[f], list)
                for x in rec[f]:
                    if x not in e[f]:
                        e[f].append(x)
                        need_edition_save = True
        # NOTE(review): this block runs even when e[f] already existed
        # above, overwriting the just-merged list with rec[f] alone --
        # looks like a missing `else`/`continue`; confirm intended behavior.
        if isinstance(rec[f], basestring):
            e[f] = [rec[f]]
            need_edition_save = True
        else:
            assert isinstance(rec[f], list)
            e[f] = rec[f]
            need_edition_save = True
        assert not isinstance(e[f], basestring)
        assert isinstance(e[f], list)
    # Persist only what actually changed, and report it in the reply.
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        e_dict = e.dict()
        assert e_dict and isinstance(e_dict, dict)
        edits.append(e_dict)
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        for i in edits:
            assert i
            assert isinstance(i, dict)

        web.ctx.site.save_many(edits, 'import new book')

    # update_ia_metadata_for_ol_edition(reply['edition']['key'].split('/')[2])

    return reply
예제 #24
0
def load_part(archive_id, part, start_pos=0):
    """Scan one MARC file of an archive item and try to merge each record.

    Yields ``(loc, data)`` tuples for records that could not be matched to
    an existing edition, so the caller can load them as new editions.

    :param archive_id: archive item identifier (directory under marc_path)
    :param part: file name of the MARC part within the archive item
    :param start_pos: byte offset within the file to resume from
    """
    print('load_part:', archive_id, part)
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # NOTE(review): this file handle is never closed — confirm whether the
    # caller expects it to stay open for the generator's lifetime.
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            # Periodic progress report every `chunk` records.
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            # Record was already imported on a previous run.
            continue
        # Control fields and identifiers needed to build the match pool.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print(loc)
            print(fast_parse.get_tag_lines(data, ['245']))
            raise
        except AssertionError:
            print(loc)
            raise
        except fast_parse.NotBook:
            # Non-book material is skipped entirely.
            continue
        if not index_fields or 'title' not in index_fields:
            continue

        print(loc)
        edition_pool = pool.build(index_fields)

        if not edition_pool:
            # No candidate editions: hand the record back for loading.
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow redirects until a real edition record is reached.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print('following redirect %s => %s' %
                              (edition_key, thing['location']))
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    # Merged into an existing edition; record provenance.
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data
예제 #25
0
        format = rec['physical_format'].lower()
        if format.startswith('[graphic') or format.startswith('[cartograph'):
            print item, format
    index_fields = make_index_fields(rec)
    if not index_fields:
        print "no index_fields"
        continue
    #print index_fields

    edition_pool = pool.build(index_fields)
    if not edition_pool or not any(v for v in edition_pool.itervalues()):
        print >> new_book, rec
        continue

    print item, edition_pool
    e1 = build_marc(rec)
    print e1

    match = False
    seen = set()
    for k, v in edition_pool.iteritems():
        for edition_key in v:
#            edition_key = '/books/' + re_edition_key.match(edition_key).match(1)
            if edition_key in seen:
                continue
            thing = None
            while not thing or thing['type']['key'] == '/type/redirect':
                seen.add(edition_key)
                thing = withKey(edition_key)
                assert thing
                if thing['type']['key'] == '/type/redirect':
예제 #26
0
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' entries whose
    'status' is one of 'matched', 'modified' or 'created'.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalise source_records to a list.
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]

    edition_pool = build_pool(rec)
    if not edition_pool:
        # No candidate editions to match against: load as a new edition.
        return load_data(rec)

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Build a MARC-style dict for fuzzy matching against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)

        match = find_match(e1, edition_pool)

    if not match:
        # No match found: load as a new edition.
        return load_data(rec)

    # An existing edition matched; merge the incoming record into it.
    need_work_save = False
    need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    if e.works:
        w = e.works[0].dict()
        work_created = False
    else:
        # Matched an edition with no work: create one for it.
        work_created = True
        need_work_save = True
        need_edition_save = True
        w = {
            'type': {'key': '/type/work'},
            'title': get_title(rec),
            'key': web.ctx.site.new_key('/type/work'),
        }
        e.works = [{'key': w['key']}]

    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }

    # Record provenance of this import on the edition.
    if not e.get('source_records'):
        e['source_records'] = []
    existing_source_records = set(e['source_records'])
    for i in rec['source_records']:
        if i not in existing_source_records:
            e['source_records'].append(i)
            need_edition_save = True
    assert e['source_records']

    edits = []
    # NOTE: a disabled (`if False and ...`) author-merging block previously
    # lived here; it was unreachable dead code and has been removed.
    if 'subjects' in rec:
        # Merge any new subjects into the work.
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects
    if 'ocaid' in rec:
        if not e.ocaid:
            e['ocaid'] = rec['ocaid']
            need_edition_save = True
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True
            if not w.get('covers'):
                w['covers'] = [cover_id]
                need_work_save = True
    for f in 'ia_box_id', 'ia_loaded_id':
        if f not in rec:
            continue
        if e.get(f):
            # Field already present: append only the values we don't have.
            assert not isinstance(e[f], basestring)
            assert isinstance(e[f], list)
            if isinstance(rec[f], basestring):
                if rec[f] not in e[f]:
                    e[f].append(rec[f])
                    need_edition_save = True
            else:
                assert isinstance(rec[f], list)
                for x in rec[f]:
                    if x not in e[f]:
                        e[f].append(x)
                        need_edition_save = True
        else:
            # Field absent: take the incoming value(s) as the new list.
            # (Bug fix: this branch used to run unconditionally, clobbering
            # the merge performed in the branch above.)
            if isinstance(rec[f], basestring):
                e[f] = [rec[f]]
            else:
                assert isinstance(rec[f], list)
                e[f] = rec[f]
            need_edition_save = True
        assert not isinstance(e[f], basestring)
        assert isinstance(e[f], list)
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        e_dict = e.dict()
        assert e_dict and isinstance(e_dict, dict)
        edits.append(e_dict)
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        for i in edits:
            assert i
            assert isinstance(i, dict)

        web.ctx.site.save_many(edits, 'import new book')

    # update_ia_metadata_for_ol_edition(reply['edition']['key'].split('/')[2])

    return reply
예제 #27
0
def load(rec):
    """Add or match an edition described by ``rec``.

    ``rec`` is a dict of edition metadata; ``title`` and ``source_records``
    are mandatory. Returns a reply dict with 'success', 'edition' and 'work'
    entries whose 'status' is 'matched', 'modified' or 'created'.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]

    candidates = build_pool(rec)
    if not candidates:
        # Nothing to match against, so create a brand-new edition.
        return load_data(rec)

    # Cheap checks first, then an exact match against the candidate pool.
    matched_key = early_exit(rec) or find_exact_match(rec, candidates)

    if not matched_key:
        # Fall back to fuzzy matching on a MARC-style representation.
        full_title = rec['title']
        if rec.get('subtitle'):
            full_title += ' ' + rec['subtitle']
        rec['full_title'] = full_title
        marc_rec = build_marc(rec)
        add_db_name(marc_rec)

        matched_key = find_match(marc_rec, candidates)

    if not matched_key:
        # Still nothing: create a brand-new edition.
        return load_data(rec)

    # An existing edition matched; fold the incoming data into it.
    save_work = False
    save_edition = False
    edition = web.ctx.site.get(matched_key)
    if edition.works:
        work = edition.works[0].dict()
        created_work = False
    else:
        # Matched an orphaned edition: give it a freshly keyed work.
        created_work = True
        save_work = True
        save_edition = True
        work = {
            'type': {'key': '/type/work'},
            'title': get_title(rec),
            'key': web.ctx.site.new_key('/type/work'),
        }
        #TODO: add edition covers and author to new work
        edition.works = [{'key': work['key']}]

    # Merge any subjects the work does not already carry.
    if 'subjects' in rec:
        merged_subjects = list(work.get('subjects', []))
        for subj in rec['subjects']:
            if subj not in merged_subjects:
                merged_subjects.append(subj)
                save_work = True
        if save_work and merged_subjects:
            work['subjects'] = merged_subjects

    # Attach a cover to the edition (and to the work if it has none).
    if 'cover' in rec and not edition.covers:
        new_cover_id = add_cover(rec['cover'], edition.key)
        if new_cover_id:
            edition['covers'] = [new_cover_id]
            save_edition = True
            if not work.get('covers'):
                work['covers'] = [new_cover_id]
                save_work = True

    # Attach the archive.org identifier if the edition lacks one.
    if 'ocaid' in rec and not edition.ocaid:
        edition['ocaid'] = rec['ocaid']
        save_edition = True

    # Append incoming values to the edition's list-valued fields.
    for field in ('source_records', 'local_id', 'ia_box_id', 'ia_loaded_id'):
        if field not in rec:
            continue
        # Normalise the incoming value(s) to a list.
        incoming = [rec[field]] if not isinstance(rec[field], list) else rec[field]
        if field in edition:
            # Only values not already on the edition are appended.
            fresh = [v for v in incoming if v not in edition[field]]
            edition[field] += fresh
        else:
            edition[field] = fresh = incoming
        if fresh:
            save_edition = True

    reply = {
        'success': True,
        'edition': {'key': matched_key, 'status': 'matched'},
        'work': {'key': work['key'], 'status': 'matched'},
    }
    changes = []
    if save_edition:
        reply['edition']['status'] = 'modified'
        changes.append(edition.dict())
    if save_work:
        reply['work']['status'] = 'created' if created_work else 'modified'
        changes.append(work)
    if changes:
        web.ctx.site.save_many(changes, 'import existing book')
    return reply
예제 #28
0
def load(rec):
    """Match ``rec`` against an existing edition; update the matched edition
    and its work on success, otherwise load the record as a new edition.

    Returns a reply dict with 'success', 'edition' and 'work' status entries.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No candidate editions: load as a new book.
        return load_data(rec)  # 'no books in pool, loading'

    match = find_exact_match(rec, edition_pool)

    if not match:
        # Build a MARC-style dict for fuzzy matching against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)

        match = find_match(e1, edition_pool)

    if match:  # 'match found:', match, rec['ia']
        e = web.ctx.site.get(match)
        w = e['works'][0]
        reply = {
            'success': True,
            'edition': {
                'key': match,
                'status': 'matched'
            },
            'work': {
                'key': w.key,
                'status': 'matched'
            },
        }

        edits = []
        need_work_save = False
        need_edition_save = False
        if rec.get('authors'):
            reply['authors'] = []
            east = east_in_by_statement(rec)
            work_authors = list(w.authors)
            edition_authors = list(e.authors)
            author_in = [
                import_author(a, eastern=east) for a in rec['authors']
            ]
            for a in author_in:
                # An author without a key was not found and must be created.
                new_author = 'key' not in a
                add_to_work = False
                add_to_edition = False
                if new_author:
                    a['key'] = web.ctx.site.new_key('/type/author')
                    edits.append(a)
                    add_to_work = True
                    add_to_edition = True
                else:
                    # Existing author: attach only where not already listed.
                    if not any(i.author.key == a['key'] for i in work_authors):
                        add_to_work = True
                    if not any(i.key == a['key'] for i in edition_authors):
                        add_to_edition = True
                if add_to_work:
                    need_work_save = True
                    work_authors.append({
                        'type': {
                            'key': '/type/author_role'
                        },
                        'author': {
                            'key': a['key']
                        },
                    })
                if add_to_edition:
                    need_edition_save = True
                    edition_authors.append({'key': a['key']})

                reply['authors'].append({
                    'key':
                    a['key'],
                    'name':
                    a['name'],
                    'status': ('created' if new_author else 'modified'),
                })
            w.authors = work_authors
            e.authors = edition_authors
        if 'subjects' in rec:
            # Merge any new subjects into the work.
            work_subjects = list(w.subjects)
            for s in rec['subjects']:
                if s not in w.subjects:
                    work_subjects.append(s)
                    need_work_save = True
            if need_work_save:
                w.subjects = work_subjects
        if need_edition_save:
            reply['edition']['status'] = 'modified'
            # NOTE(review): the edition is both appended to `edits` (saved by
            # save_many below) and saved individually here — it looks like it
            # is saved twice; confirm this is intended.
            edits.append(e)
            web.ctx.site.save(e, match, 'update edition')
        if need_work_save:
            reply['work']['status'] = 'modified'
            edits.append(w)
        if edits:
            web.ctx.site.save_many(edits, 'import new book')

        return reply
        #add_source_records(match, ia)
    else:  # 'no match found', rec['ia']
        # No match: load the record as a new edition.
        return load_data(rec)
예제 #29
0
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' status entries.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalise source_records to a list.
    if isinstance(rec['source_records'], six.string_types):
        rec['source_records'] = [rec['source_records']]

    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Build a MARC-style dict for fuzzy matching against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    if e.get('works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        w = new_work(e.dict(), rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work if needed
    if not w.get('covers') and e.get('covers'):
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add authors to work if needed
    if not w.get('authors'):
        authors = [import_author(a) for a in rec.get('authors', [])]
        # NOTE(review): both `a.key` (attribute) and `a.get('key')` are used
        # here, which assumes import_author returns a Storage-like object —
        # confirm it never returns a plain dict.
        w['authors'] = [{'type':{'key': '/type/author_role'}, 'author': a.key} for a in authors if a.get('key')]
        if w.get('authors'):
            need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    edition_fields = [
        'local_id', 'ia_box_id', 'ia_loaded_id', 'source_records']
    # XXX Todos:
    # only consider `source_records` for newly created work
    # or if field originally missing:
    #if work_created and not e.get('source_records'):
    #    edition_fields.append('source_records')
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {'key': match, 'status': 'matched'},
        'work': {'key': w['key'], 'status': 'matched'},
    }
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    if 'ocaid' in rec:
        # Sync the archive.org item's metadata with this OL edition key.
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
예제 #30
0
def load(rec):
    """Given a record, tries to add/match that edition in the system.

    Record is a dictionary containing all the metadata of the edition.
    The following fields are mandatory:

        * title
        * source_records

    Returns a reply dict with 'success', 'edition' and 'work' status entries.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    if not rec.get('source_records'):
        raise RequiredField('source_records')
    # Normalise source_records to a list.
    if isinstance(rec['source_records'], basestring):
        rec['source_records'] = [rec['source_records']]

    edition_pool = build_pool(rec)
    if not edition_pool:
        # No match candidates found, add edition
        return load_data(rec)

    match = early_exit(rec)
    if not match:
        match = find_exact_match(rec, edition_pool)

    if not match:
        # Build a MARC-style dict for fuzzy matching against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)
        match = find_match(e1, edition_pool)

    if not match:
        # No match found, add edition
        return load_data(rec)

    # We have an edition match at this point
    need_work_save = need_edition_save = False
    w = None
    e = web.ctx.site.get(match)
    # NOTE(review): hasattr() may be truthy even when the works list is empty;
    # other variants of this function use e.get('works') — confirm intended.
    if hasattr(e, 'works'):
        w = e.works[0].dict()
        work_created = False
    else:
        # Found an edition without a work
        work_created = need_work_save = need_edition_save = True
        # NOTE(review): new_work() receives the Thing object `e` here, while
        # a sibling variant passes e.dict() — confirm new_work accepts both.
        w = new_work(e, rec)
        e.works = [{'key': w['key']}]

    # Add subjects to work, if not already present
    if 'subjects' in rec:
        work_subjects = list(w.get('subjects', []))
        for s in rec['subjects']:
            if s not in work_subjects:
                work_subjects.append(s)
                need_work_save = True
        if need_work_save and work_subjects:
            w['subjects'] = work_subjects

    # Add cover to edition
    if 'cover' in rec and not e.covers:
        cover_url = rec['cover']
        cover_id = add_cover(cover_url, e.key)
        if cover_id:
            e['covers'] = [cover_id]
            need_edition_save = True

    # Add cover to work if needed
    if not w.get('covers') and e.get('covers'):
        w['covers'] = [e['covers'][0]]
        need_work_save = True

    # Add ocaid to edition (str), if needed
    if 'ocaid' in rec and not e.ocaid:
        e['ocaid'] = rec['ocaid']
        need_edition_save = True

    edition_fields = [
        'local_id', 'ia_box_id', 'ia_loaded_id', 'source_records'
    ]
    # XXX Todos:
    # only consider `source_records` for newly created work
    # or if field originally missing:
    #if work_created and not e.get('source_records'):
    #    edition_fields.append('source_records')
    for f in edition_fields:
        if f not in rec:
            continue
        # ensure values is a list
        values = rec[f] if isinstance(rec[f], list) else [rec[f]]
        if f in e:
            # get values from rec that are not currently on the edition
            to_add = [v for v in values if v not in e[f]]
            e[f] += to_add
        else:
            e[f] = to_add = values
        if to_add:
            need_edition_save = True

    edits = []
    reply = {
        'success': True,
        'edition': {
            'key': match,
            'status': 'matched'
        },
        'work': {
            'key': w['key'],
            'status': 'matched'
        },
    }
    if need_edition_save:
        reply['edition']['status'] = 'modified'
        edits.append(e.dict())
    if need_work_save:
        reply['work']['status'] = 'created' if work_created else 'modified'
        edits.append(w)
    if edits:
        web.ctx.site.save_many(edits, 'import existing book')
    if 'ocaid' in rec:
        # Sync the archive.org item's metadata with this OL edition key.
        update_ia_metadata_for_ol_edition(match.split('/')[-1])
    return reply
예제 #31
0
def load(rec):
    """Match ``rec`` against an existing edition; update the matched edition
    and its work on success, otherwise load the record as a new edition.

    Returns a reply dict with 'success', 'edition' and 'work' status entries.
    """
    if not rec.get('title'):
        raise RequiredField('title')
    edition_pool = build_pool(rec)
    if not edition_pool:
        # No candidate editions: load as a new book.
        return load_data(rec) # 'no books in pool, loading'

    match = find_exact_match(rec, edition_pool)

    if not match:
        # Build a MARC-style dict for fuzzy matching against the pool.
        rec['full_title'] = rec['title']
        if rec.get('subtitle'):
            rec['full_title'] += ' ' + rec['subtitle']
        e1 = build_marc(rec)
        add_db_name(e1)

        match = find_match(e1, edition_pool)

    if match: # 'match found:', match, rec['ia']
        e = web.ctx.site.get(match)
        w = e['works'][0]
        reply = {
            'success': True,
            'edition': {'key': match, 'status': 'matched'},
            'work': {'key': w.key, 'status': 'matched'},
        }

        need_work_save = False
        need_edition_save = False
        if rec.get('authors'):
            reply['authors'] = []
            east = east_in_by_statement(rec)
            work_authors = list(w.authors)
            edition_authors = list(e.authors)
            author_in = [import_author(a, eastern=east) for a in rec['authors']]
            for a in author_in:
                # An author without a key was not found and must be created.
                new_author = 'key' not in a
                add_to_work = False
                add_to_edition = False
                if new_author:
                    a['key'] = web.ctx.site.new_key('/type/author')
                    # NOTE(review): the save result `aobj` is never used —
                    # confirm the returned object is intentionally discarded.
                    aobj = web.ctx.site.save(a, comment='new author')
                    add_to_work = True
                    add_to_edition = True
                else:
                    # Existing author: attach only where not already listed.
                    if not any(i.author.key == a['key'] for i in work_authors):
                        add_to_work = True
                    if not any(i.key == a['key'] for i in edition_authors):
                        add_to_edition = True
                if add_to_work:
                    need_work_save = True
                    work_authors.append({
                        'type': {'key': '/type/author_role'},
                        'author': {'key': a['key'] },
                    })
                if add_to_edition:
                    need_edition_save = True
                    edition_authors.append({'key': a['key'] })

                reply['authors'].append({
                    'key': a['key'],
                    'name': a['name'],
                    'status': ('created' if new_author else 'modified'),
                })
            w.authors = work_authors
            e.authors = edition_authors
        if 'subjects' in rec:
            # Merge any new subjects into the work.
            work_subjects = list(w.subjects)
            for s in rec['subjects']:
                if s not in w.subjects:
                    work_subjects.append(s)
                    need_work_save = True
            if need_work_save:
                w.subjects = work_subjects
        if need_edition_save:
            reply['edition']['status'] = 'modified'
            web.ctx.site.save(e, match, 'update edition')
        if need_work_save:
            reply['work']['status'] = 'modified'
            web.ctx.site.save(w, w.key, 'update work')
        return reply
        #add_source_records(match, ia)
    else: # 'no match found', rec['ia']
        # No match: load the record as a new edition.
        return load_data(rec)
예제 #32
0
def load_part(archive_id, part, start_pos=0):
    """Scan one MARC file of an archive item and try to merge each record.

    Yields ``(loc, data)`` tuples for records that could not be matched to
    an existing edition, so the caller can load them as new editions.
    Python 2 variant (print statements, dict.iteritems).

    :param archive_id: archive item identifier (directory under marc_path)
    :param part: file name of the MARC part within the archive item
    :param start_pos: byte offset within the file to resume from
    """
    print 'load_part:', archive_id, part
    global rec_no, t_prev, load_count
    full_part = archive_id + "/" + part
    # NOTE(review): this file handle is never closed — confirm whether the
    # caller expects it to stay open for the generator's lifetime.
    f = open(rc['marc_path'] + "/" + full_part)
    if start_pos:
        f.seek(start_pos)
    for pos, loc, data in read_marc_file(full_part, f, pos=start_pos):
        rec_no += 1
        if rec_no % chunk == 0:
            # Periodic progress report every `chunk` records.
            progress(archive_id, rec_no, start_pos, pos)

        if is_loaded(loc):
            # Record was already imported on a previous run.
            continue
        # Control fields and identifiers needed to build the match pool.
        want = ['001', '003', '010', '020', '035', '245']
        try:
            index_fields = fast_parse.index_fields(data, want)
        except KeyError:
            print loc
            print fast_parse.get_tag_lines(data, ['245'])
            raise
        except AssertionError:
            print loc
            raise
        except fast_parse.NotBook:
            # Non-book material is skipped entirely.
            continue
        if not index_fields or 'title' not in index_fields:
            continue

        print loc
        edition_pool = pool.build(index_fields)

        if not edition_pool:
            # No candidate editions: hand the record back for loading.
            yield loc, data
            continue

        rec = fast_parse.read_edition(data)
        e1 = build_marc(rec)

        match = False
        seen = set()
        for k, v in edition_pool.iteritems():
            for edition_key in v:
                if edition_key in seen:
                    continue
                thing = None
                # Follow redirects until a real edition record is reached.
                while not thing or thing['type']['key'] == '/type/redirect':
                    seen.add(edition_key)
                    thing = withKey(edition_key)
                    assert thing
                    if thing['type']['key'] == '/type/redirect':
                        print 'following redirect %s => %s' % (edition_key, thing['location'])
                        edition_key = thing['location']
                if try_merge(e1, edition_key, thing):
                    # Merged into an existing edition; record provenance.
                    add_source_records(edition_key, loc, thing, data)
                    match = True
                    break
            if match:
                break

        if not match:
            yield loc, data