Example #1
def work_and_marc():
    i = 0
    skip = True
    for w in query_iter({'type': '/type/work', 'title': None}):
        if skip:
            if w['key'] == '/w/OL56814W':
                skip = False
            else:
                continue
        marc = set()
        q = {
            'type': '/type/edition',
            'works': w['key'],
            'title': None,
            'source_records': None
        }
        for e in query_iter(q):
            if e.get('source_records', []):
                marc.update(i[5:] for i in e['source_records']
                            if i.startswith('marc:'))
            mc = get_mc(e['key'])
            if mc and not mc.startswith('ia:') and not mc.startswith(
                    'amazon:'):
                marc.add(mc)
        if marc:
            yield w, marc
        else:
            print('no marc:', w)
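Note: every example in this listing relies on a shared query_iter helper from Open Library's catalog scripts, whose definition is not shown. A minimal sketch, assuming a query(q) function that returns one page of results and honors 'limit' and 'offset' keys in the query dict:

def query_iter(q, limit=500, offset=0):
    # Hedged sketch, not the verbatim implementation: page through the
    # results until the backend returns an empty page. `query` is assumed.
    q = dict(q)
    while True:
        q['limit'] = limit
        q['offset'] = offset
        page = query(q)
        if not page:
            return
        yield from page
        offset += limit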
Example #2
def by_authors():
    find_new_work_key()

    skipping = True
    q = { 'type':'/type/author', 'name': None, 'works': None }
    for a in query_iter(q, offset=215000):
        akey = a['key']
        if skipping:
            print('skipping:', akey, a['name'])
            if akey == '/a/OL218496A':
                skipping = False
            continue

        q = {
            'type':'/type/work',
            'authors': akey,
        }
        if query(q):
            print((akey, repr(a['name']), 'has works'))
            continue

    #    print akey, a['name']
        found = find_works(akey)
        works = [i for i in found if len(i['editions']) > 2]
        if works:
            #open('found/' + akey[3:], 'w').write(repr(works))
            print((akey, repr(a['name'])))
            #pprint(works)
            #print_works(works)
            add_works(akey, works)
            print()
Example #3
def update_work(w, obj_cache={}, debug=False, resolve_redirects=False):
    wkey = w['key']
    assert wkey.startswith('/works')
    assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys
                        if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:],
                                                               redirects)
    requests = [delete_xml]

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            doc = build_doc(w, obj_cache, resolve_redirects=resolve_redirects)
        except:
            print(w)
            raise
        if doc is not None:
            add = Element("add")
            add.append(doc)
            add_xml = tostring(add).encode('utf-8')
            requests.append(add_xml)

    return requests
Example #4
def books_query(akey): # live version
    q = {
        'type':'/type/edition',
        'authors': akey,
        '*': None
    }
    return query_iter(q)
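In these query dictionaries, a field mapped to None asks the backend to include that field in each result, and '*': None appears to request every stored field. A hedged usage sketch (the author key is illustrative, not taken from the examples):

# Print the key and title of every edition credited to one author.
for e in books_query('/a/OL18319A'):  # illustrative key
    print(e['key'], e.get('title'))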
Example #5
def by_authors():
    find_new_work_key()

    skipping = True
    q = {'type': '/type/author', 'name': None, 'works': None}
    for a in query_iter(q, offset=215000):
        akey = a['key']
        if skipping:
            print('skipping:', akey, a['name'])
            if akey == '/a/OL218496A':
                skipping = False
            continue

        q = {
            'type': '/type/work',
            'authors': akey,
        }
        if query(q):
            print((akey, repr(a['name']), 'has works'))
            continue

    #    print akey, a['name']
        found = find_works(akey)
        works = [i for i in found if len(i['editions']) > 2]
        if works:
            #open('found/' + akey[3:], 'w').write(repr(works))
            print((akey, repr(a['name'])))
            #print_works(works)
            add_works(akey, works)
            print()
Example #6
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []

    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]

    deletes += redirect_keys
    deletes += [wkey[7:]] # strip /works/ from /works/OL1234W

    # handle edition records as well
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            # Use key as /works/OL1M. 
            # In case of single-core-solr, we are using full path as key. So it is required
            # to be unique across all types of documents.
            # The website takes care of redirecting /works/OL1M to /books/OL1M.
            'key': edition['key'].replace("/books/", "/works/"),
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition]
        }
        # Hack to add subjects when indexing /books/ia:xxx
        if edition.get("subjects"):
            w['subjects'] = edition['subjects']

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w, obj_cache=obj_cache, resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                # XXX-Anand: The works in in_library subject were getting wiped off for unknown reasons.
                # I suspect that this might be a cause. Disabling temporarily.
                #if d.get('ia'):
                #    deletes += ["ia:" + iaid for iaid in d['ia']]

                # In single core solr, we use full path as key, not just the last part
                if is_single_core():
                    deletes = ["/works/" + k for k in deletes]

                requests.append(make_delete_query(deletes))

                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)

    return requests
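make_delete_query is not defined in any of these snippets. A minimal sketch consistent with the hand-built delete XML in the other update_work versions (one <query>key:...</query> per key inside a single <delete> message):

from xml.etree.ElementTree import Element, SubElement, tostring

def make_delete_query(keys):
    # Hedged sketch: mirrors the string-built delete_xml in examples #3 and #25.
    delete = Element('delete')
    for key in keys:
        q = SubElement(delete, 'query')
        q.text = 'key:%s' % key
    return tostring(delete)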
Example #7
def update_edition(e):
    if not is_single_core():
        return []

    ekey = e['key']
    logger.info("updating edition %s", ekey)

    wkey = e.get('works') and e['works'][0]['key']
    w = wkey and withKey(wkey)
    authors = []

    if w:
        authors = [withKey(a['author']['key']) for a in w.get("authors", []) if 'author' in a]

    request_set = SolrRequestSet()
    request_set.delete(ekey)

    q = {'type': '/type/redirect', 'location': ekey}
    redirect_keys = [r['key'] for r in query_iter(q)]
    for k in redirect_keys:
        request_set.delete(k)

    doc = EditionBuilder(e, w, authors).build()
    request_set.add(doc)
    return request_set.get_requests()
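SolrRequestSet is likewise assumed rather than shown. A rough sketch of the interface this example exercises (delete, add, get_requests), batching deletes and added documents into Solr update messages; the details are guesses:

from xml.etree.ElementTree import Element, tostring

class SolrRequestSet:
    # Hedged sketch of the request collector used above.
    def __init__(self):
        self.deletes = []
        self.docs = []

    def delete(self, key):
        self.deletes.append(key)

    def add(self, doc):
        self.docs.append(doc)

    def get_requests(self):
        requests = []
        if self.deletes:
            requests.append(make_delete_query(self.deletes))  # sketch above
        if self.docs:
            add = Element('add')
            for doc in self.docs:
                add.append(doc)  # docs are assumed to be <doc> Elements
            requests.append(tostring(add))
        return requests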
Example #8
def is_loaded(loc):
    assert loc.startswith('marc:')
    vars = {'loc': loc[5:]}
    db_iter = marc_index.query('select * from machine_comment where v=$loc', vars)
    if list(db_iter):
        return True
    edition_iter = query_iter({'type': '/type/edition', 'source_records': loc})
    return bool(list(edition_iter))
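A hedged usage sketch for is_loaded: skip MARC parts that were already imported. The locator is illustrative only ('marc:' matches the assert above), and load_part is a hypothetical import step, not part of the example:

loc = 'marc:marc_records_scriblio_net/part01.dat:0:2024'  # illustrative
if not is_loaded(loc):
    load_part(loc)  # hypothetical loader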
Example #9
def switch_author(ol, old, new, other, debug=False):
    q = { 'authors': old, 'type': '/type/edition', }
    for e in query_iter(q):
        if debug:
            print('switch author:', e['key'])
        print(e)
        e = ol.get(e['key'])
        update_edition(ol, e, other, new, debug)
Example #10
def switch_author(ol, old, new, other, debug=False):
    q = { 'authors': old, 'type': '/type/edition', }
    for e in query_iter(q):
        if debug:
            print('switch author:', e['key'])
        print(e)
        e = ol.get(e['key'])
        update_edition(ol, e, other, new, debug)
Example #11
 def title_search(self, v):
     q = {'type': '/type/edition', 'isbn_10': None, 'title': v}
     editions = []
     for e in query_iter(q):
         e['title'] = v
         editions.append(e)
     yield 'searching for title "' + web.htmlquote(v) + '": '
     yield from self.search(editions)
Example #12
def add_cover_to_work(w):
    global ol  # assigned below when unset, so it must be declared global
    if 'cover_edition' in w:
        return
    q = {'type':'/type/edition', 'works':w['key'], 'publish_date': None, 'languages': '/l/eng'}
    cover_edition = pick_cover(query_iter(q))
    if not cover_edition:
        q = {'type':'/type/edition', 'works':w['key'], 'publish_date': None}
        cover_edition = pick_cover(query_iter(q))
        if not cover_edition:
            return
    w['cover_edition'] = Reference(cover_edition)
    if ol is None:
        rc = read_rc()
        ol = OpenLibrary("http://openlibrary.org")
        ol.login('WorkBot', rc['WorkBot']) 

    print(ol.save(w['key'], w, 'added cover to work'))
Example #13
 def isbn_search(self, v):
     q = {'type': '/type/edition', 'isbn_10': v, 'title': None, 'subtitle': None}
     editions = []
     for e in query_iter(q):
         e['isbn_10'] = v
         editions.append(e)
     yield 'searching for ISBN ' + web.htmlquote(v) + ': '
     for i in self.search(editions):
         yield i
Example #14
def is_loaded(loc):
    assert loc.startswith('marc:')
    vars = {'loc': loc[5:]}
    db_iter = marc_index.query('select * from machine_comment where v=$loc',
                               vars)
    if list(db_iter):
        return True
    edition_iter = query_iter({'type': '/type/edition', 'source_records': loc})
    return bool(list(edition_iter))
Example #15
 def oclc_search(self, v):
     q = {'type': '/type/edition', 'oclc_numbers': v, 'title': None, 'subtitle': None, 'isbn_10': None}
     editions = []
     print(q)
     for e in query_iter(q):
         e['oclc_numbers'] = v
         editions.append(e)
     yield 'searching for OCLC ' + web.htmlquote(v) + ': '
     for i in self.search(editions):
         yield i
Example #16
def work_and_marc():
    i = 0
    skip = True
    for w in query_iter({'type': '/type/work', 'title': None}):
        if skip:
            if w['key'] == '/w/OL56814W':
                skip = False
            else:
                continue
        marc = set()
        q = {'type': '/type/edition', 'works': w['key'], 'title': None, 'source_records': None}
        for e in query_iter(q):
            if e.get('source_records', []):
                marc.update(i[5:] for i in e['source_records'] if i.startswith('marc:'))
            mc = get_mc(e['key'])
            if mc and not mc.startswith('ia:') and not mc.startswith('amazon:'):
                marc.add(mc)
        if marc:
            yield w, marc
        else:
            print('no marc:', w)
Example #17
def books_query(akey):  # live version
    q = {
        'type': '/type/edition',
        'authors': akey,
        'source_records': None,
        'title': None,
        'work_title': None,
        'languages': None,
        'title_prefix': None,
        'subtitle': None,
    }
    return query_iter(q)
Example #18
def books_query(akey): # live version
    q = {
        'type':'/type/edition',
        'authors': akey,
        'source_records': None,
        'title': None,
        'work_title': None,
        'languages': None,
        'title_prefix': None,
        'subtitle': None,
    }
    return query_iter(q)
Example #19
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []

    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]

    deletes += redirect_keys
    deletes += [wkey[7:]] # strip /works/ from /works/OL1234W

    # handle edition records as well
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            'key': edition['key'],
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition]
        }

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w, obj_cache=obj_cache, resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                # 
                if d.get('ia'):
                    deletes += ["ia:" + iaid for iaid in d['ia']]

                requests.append(make_delete_query(deletes))

                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)

    return requests
Example #20
def build_data(w, obj_cache=None, resolve_redirects=False):
    wkey = w['key']

    # Anand - Oct 2013
    # For /works/ia:xxx, editions are already supplied. Querying would return an empty response.
    if "editions" in w:
        editions = w['editions']
    else:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        editions = list(query_iter(q))
    authors = SolrProcessor().extract_authors(w)

    iaids = [e["ocaid"] for e in editions if "ocaid" in e]
    ia = dict((iaid, get_ia_collection_and_box_id(iaid)) for iaid in iaids)

    duplicates = {}

    return build_data2(w, editions, authors, ia, duplicates)
Example #21
def merge_authors(keys):
#    print 'merge author %s:"%s" and %s:"%s"' % (author['key'], author['name'], merge_with['key'], merge_with['name'])
#    print 'becomes: "%s"' % `new_name`
    authors = [a for a in (withKey(k) for k in keys) if a['type']['key'] != '/type/redirect']
    not_redirect = set(a['key'] for a in authors)
    for a in authors:
        print(a)

    assert all(a['type']['key'] == '/type/author' for a in authors)
    name1 = authors[0]['name']
    assert all(match_with_bad_chars(a['name'], name1) for a in authors[1:])

    best_key = pick_best_author(authors)['key']

    imgs = [a['key'] for a in authors if has_image(a['key'])]
    if len(imgs) == 1:
        new_key = imgs[0]
    else:
        new_key = "/a/OL%dA" % min(key_int(a) for a in authors)
        # Molière and O. J. O. Ferreira
        if len(imgs) != 0:
            print('imgs:', imgs)
            return # skip
        if not (imgs == [u'/a/OL21848A', u'/a/OL4280680A'] \
                or imgs == [u'/a/OL325189A', u'/a/OL266422A'] \
                or imgs == [u'/a/OL5160945A', u'/a/OL5776228A']):
            print(imgs)
            assert len(imgs) == 0

    do_normalize(new_key, best_key, authors)
    old_keys = set(k for k in keys if k != new_key) 
    print('old keys:', old_keys)
    for old in old_keys:
        # /b/OL21291659M
        switch_author(old, new_key, old_keys)
        if old in not_redirect:
            make_redirect(old, new_key)
        q = { 'authors': old, 'type': '/type/edition', }
        if list(get_things(q)) != []:
            switch_author(old, new_key, old_keys)
        remaining = list(query_iter(q))
        print(old, remaining)
        assert remaining == []
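key_int is undefined here; given that its result feeds min() to pick the lowest-numbered key, a plausible sketch:

import re

def key_int(author):
    # Hedged sketch: extract the number from a key such as '/a/OL9184A',
    # so min() above selects the oldest (lowest-numbered) author key.
    return int(re.search(r'\d+', author['key']).group())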
Example #22
def by_authors():
    skip = '/a/OL25755A'
    q = {'type': '/type/author', 'name': None}
    for a in query_iter(q):
        akey = a['key']
        if skip:
            if akey == skip:
                skip = None
            else:
                continue
        write_log('author', akey, a.get('name', 'name missing'))

        works = find_works(akey, get_books(akey, books_query(akey)))
        print((akey, repr(a['name'])))

        for w in works:
            w['author'] = akey
            wkey = get_work_key(w['title'], akey)
            if wkey:
                w['key'] = wkey
            yield w
Example #23
def by_authors():
    work_queue = []
    q = { 'type':'/type/author', 'name': None, 'works': None }
    for a in query_iter(q):
        akey = a['key']
        write_log('author', akey, a.get('name', 'name missing'))
        q = {
            'type':'/type/work',
            'authors': akey,
        }

        works = find_works(akey)
        print(akey, repr(a['name']))
        for w in works:
            w['author'] = akey
            work_queue.append(w)
            if len(work_queue) > 1000:
                for e in run_queue(work_queue):
                    yield e
                work_queue = []
    for e in run_queue(work_queue):
        yield e
Example #24
def by_authors():
    skip = '/a/OL25755A'
    q = { 'type':'/type/author', 'name': None }
    for a in query_iter(q):
        akey = a['key']
        if skip:
            if akey == skip:
                skip = None
            else:
                continue
        write_log('author', akey, a.get('name', 'name missing'))

        works = find_works(akey, get_books(akey, books_query(akey)))
        print((akey, repr(a['name'])))

        for w in works:
            w['author'] = akey
            wkey = get_work_key(w['title'], akey)
            if wkey:
                w['key'] = wkey
            yield w
Example #25
def update_work(w):
    wkey = w['key']
    assert wkey.startswith('/works')
    assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:], redirects)
    requests = [delete_xml]

    if w['type']['key'] == '/type/work' and w.get('title', None):
        try:
            doc = build_doc(w)
        except:
        print(w)
            raise
        if doc is not None:
            add = Element("add")
            add.append(doc)
            add_xml = tostring(add).encode('utf-8')
            requests.append(add_xml)

    return requests
Example #26
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]
    redirects = ''.join('<query>key:%s</query>' % r for r in redirect_keys if '/' not in r)
    delete_xml = '<delete><query>key:%s</query>%s</delete>' % (wkey[7:].replace(":", r"\:"), redirects)
    requests = [delete_xml]

    # handle edition records as well
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            'key': edition['key'],
            'type': {'key': '/type/work'},
            'title': edition['title'],
            'editions': [edition]
        }

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            doc = build_doc(w, obj_cache, resolve_redirects=resolve_redirects)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if doc is not None:
                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)

    return requests
Example #27
    akey = sys.argv[1]
#    out = open('book_cache', 'w')
#    for b in books_query(akey):
#        print >> out, b
#    out.close()
#    sys.exit(0)
    works = find_works(akey, get_books(akey, books_query(akey)))
    #works = find_works(akey, get_books(akey, books_from_cache()))

    do_updates = False

    while True: # until redirects repaired
        q = {'type':'/type/edition', 'authors':akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_title = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                print('redirect found')
                wkey = w['location']
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = withKey(ekey)
Example #28
def build_data(w, obj_cache=None, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    p = SolrProcessor(obj_cache, resolve_redirects)
    get_pub_year = p.get_pub_year

    # Load stuff if not already provided
    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)
    editions = p.process_editions(w, identifiers)
    authors = p.extract_authors(w)

    has_fulltext = any(e.get('ocaid', None) for e in editions)

    subjects = p.get_subject_counts(w, editions, has_fulltext)

    def add_field(doc, name, value):
        doc[name] = value

    def add_field_list(doc, name, field_list):
        doc[name] = list(field_list)

    doc = p.build_data(w, editions, subjects, has_fulltext)

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key',
                  re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
    #    add_field_list(doc, 'publisher_facet', publishers)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    last_modified_i = datetimestr_to_int(w.get('last_modified'))

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], str):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(
                        e['ia_loaded_id'][0], str)
                except AssertionError:
                    print(e.get('ia'))
                    print(e['ia_loaded_id'])
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], str):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(
                        e['ia_box_id'][0], str)
                except AssertionError:
                    print(e['key'])
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    #if lending_edition or in_library_edition:
    #    add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)

    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)

    return doc
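datetimestr_to_int, which feeds last_modified_i above (the value is computed but unused in this version), is also assumed. A sketch that parses an ISO-format timestamp and falls back to the current time:

import calendar
from datetime import datetime

def datetimestr_to_int(datestr):
    # Hedged sketch: '2010-01-02T03:04:05.678900' -> Unix timestamp.
    if isinstance(datestr, dict):
        datestr = datestr.get('value', '')
    try:
        t = datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%S.%f')
    except (TypeError, ValueError):
        t = datetime.utcnow()
    return int(calendar.timegm(t.timetuple()))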
Example #29
set_staging(True)
rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.buffer)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []

for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'],
          w['title'])  # , ol.get(w['authors'][0]['key'])['name']
    full = ol.get(w['key'])
    authors = full['authors']
    assert all(isinstance(a, Reference) for a in authors)
    full['authors'] = [{'author': a} for a in authors]
    queue.append(full)
    if len(queue) > 1000:
        print('saving')
        print(
Example #30
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        ia = None
        if 'ocaid' in e:
            ia = e['ocaid']
        elif 'ia_loaded_id' in e:
            loaded = e['ia_loaded_id']
            ia = loaded if isinstance(loaded, str) else loaded[0]
        if ia:
            ia_meta_fields = get_ia_collection_and_box_id(ia)
            collection = ia_meta_fields['collection']
            if 'ia_box_id' in e and isinstance(e['ia_box_id'], str):
                e['ia_box_id'] = [e['ia_box_id']]
            if ia_meta_fields.get('boxid'):
                box_id = list(ia_meta_fields['boxid'])[0]
                e.setdefault('ia_box_id', [])
                if box_id.lower() not in [x.lower() for x in e['ia_box_id']]:
                    e['ia_box_id'].append(box_id)
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        if 'identifiers' in e:
            for k, id_list in e['identifiers'].items():
                k_orig = k
                k = k.replace('.', '_').replace(',', '_').replace('(', '').replace(')', '').replace(':', '_').replace('/', '').replace('#', '').lower()
                m = re_solr_field.match(k)
                if not m:
                    print (k_orig, k)
                assert m
                for v in id_list:
                    v = v.strip()
                    if v not in identifiers[k]:
                        identifiers[k].append(v)
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', ''))  # '' default: None and str don't compare in Python 3

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a: # OL Web UI bug
            continue # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print('invalid author key:', akey)
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:
            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a
            authors = [resolve(a) for a in authors]
        else:
            print()
            for a in authors:
                print('author:', a)
            print(w['key'])
            print()
            raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print('bad work: ', w['key'])
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.items():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print('v:', v)
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                v = v.replace('-', '')
                isbn.add(v)
                alt = opposite_isbn(v)
                if alt:
                    isbn.add(alt)
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], str):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], str)
                except AssertionError:
                    print(e.get('ia'))
                    print(e['ia_loaded_id'])
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], str):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], str)
                except AssertionError:
                    print(e['key'])
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e.get('ia_collection', []):
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e.get('ia_collection', []):
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e.get('ia_collection', []):
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e.get('ia_collection', []))
        assert isinstance(e['ocaid'], str)
        i = e['ocaid'].strip()
        if e.get('public_scan'):
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))
        
    if lending_edition or in_library_edition:
        add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)

    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)
        
    return doc
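opposite_isbn, used in the isbn loop above, evidently maps an ISBN-10 to its ISBN-13 form and vice versa so both spellings get indexed. A self-contained sketch using the standard check-digit rules (assumption: only 978-prefixed ISBN-13s convert back, and input is already stripped of hyphens):

def opposite_isbn(isbn):
    # Hedged sketch of the helper used above: ISBN-10 <-> ISBN-13.
    if len(isbn) == 10:
        core = '978' + isbn[:9]
        total = sum(int(d) * (1, 3)[i % 2] for i, d in enumerate(core))
        return core + str((10 - total % 10) % 10)
    if len(isbn) == 13 and isbn.startswith('978'):
        core = isbn[3:12]
        total = sum((10 - i) * int(d) for i, d in enumerate(core))
        check = (11 - total % 11) % 11
        return core + ('X' if check == 10 else str(check))
    return None  # other formats are left alone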
Example #31
def build_data(w, obj_cache=None, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return
        
    p = SolrProcessor(obj_cache, resolve_redirects)
    get_pub_year = p.get_pub_year

    # Load stuff if not already provided
    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        #print 'editions:', [e['key'] for e in w['editions']]

    identifiers = defaultdict(list)
    editions = p.process_editions(w, identifiers)
    authors = p.extract_authors(w)

    has_fulltext = any(e.get('ocaid', None) for e in editions)
    
    subjects = p.get_subject_counts(w, editions, has_fulltext)
            
    def add_field(doc, name, value):
        doc[name] = value

    def add_field_list(doc, name, field_list):
        doc[name] = list(field_list)
    
    doc = p.build_data(w, editions, subjects, has_fulltext)
    

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))
    if w.get('covers'):
        cover = w['covers'][0]
        assert isinstance(cover, int)
        add_field(doc, 'cover_i', cover)

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    lang = set()
    ia_loaded_id = set()
    ia_box_id = set()

    last_modified_i = datetimestr_to_int(w.get('last_modified'))

    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
        if e.get('ia_loaded_id'):
            if isinstance(e['ia_loaded_id'], str):
                ia_loaded_id.add(e['ia_loaded_id'])
            else:
                try:
                    assert isinstance(e['ia_loaded_id'], list) and isinstance(e['ia_loaded_id'][0], str)
                except AssertionError:
                    print(e.get('ia'))
                    print(e['ia_loaded_id'])
                    raise
                ia_loaded_id.update(e['ia_loaded_id'])
        if e.get('ia_box_id'):
            if isinstance(e['ia_box_id'], str):
                ia_box_id.add(e['ia_box_id'])
            else:
                try:
                    assert isinstance(e['ia_box_id'], list) and isinstance(e['ia_box_id'][0], str)
                except AssertionError:
                    print(e['key'])
                    raise
                ia_box_id.update(e['ia_box_id'])
    if lang:
        add_field_list(doc, 'language', lang)

        
    #if lending_edition or in_library_edition:
    #    add_field(doc, "borrowed_b", is_borrowed(lending_edition or in_library_edition))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    for k in sorted(identifiers.keys()):
        add_field_list(doc, 'id_' + k, identifiers[k])

    if ia_loaded_id:
        add_field_list(doc, 'ia_loaded_id', ia_loaded_id)

    if ia_box_id:
        add_field_list(doc, 'ia_box_id', ia_box_id)
        
    return doc
Example #32
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        print('bad key:', akey)
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect',
                            '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id]
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        print(a['type']['key'])
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    url = 'http://' + get_solr('works') + \
        '/solr/works/select?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)
    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    add_field(doc, 'key', author_id)
    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' %
                                re_author_key.match(r['key']).group(1)
                                for r in query_iter(q))
        except AttributeError:
            print('redirects:', [r['key'] for r in query_iter(q)])
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
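The XML-building examples (#30, #32, #33, #36) call add_field and add_field_list without defining them, while #28 and #31 use local dict-based versions. A minimal sketch consistent with the Solr <doc> markup the XML versions construct:

from xml.etree.ElementTree import SubElement

def add_field(doc, name, value):
    # Hedged sketch: one <field name="...">value</field> per call.
    field = SubElement(doc, 'field')
    field.set('name', name)
    field.text = str(value)

def add_field_list(doc, name, field_list):
    for value in field_list:
        add_field(doc, name, value)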
Example #33
def build_doc(w, obj_cache={}, resolve_redirects=False):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_iso_date.match(pub_date)
            if m:
                return m.group(1)
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = {'type': '/type/edition', 'works': wkey, '*': None}
        w['editions'] = list(query_iter(q))
        print('editions:', [e['key'] for e in w['editions']])

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            #print 'collection:', collection
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary'
                                not in collection) and ('printdisabled'
                                                        not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            #print 'overdrive:', overdrive_id
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', ''))  # '' default: None and str don't compare in Python 3

    #print len(w['editions']), 'editions found'

    #print w['key']
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:  # OL Web UI bug
            continue  # http://openlibrary.org/works/OL15365167W.yml?m=edit&v=1
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print('invalid author key:', akey)
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        if akey in obj_cache and obj_cache[akey]['type']['key'] != '/type/redirect':
            authors.append(obj_cache[akey])
        else:
            authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        if resolve_redirects:

            def resolve(a):
                if a['type']['key'] == '/type/redirect':
                    a = withKey(a['location'])
                return a

            authors = [resolve(a) for a in authors]
        else:
            print()
            for a in authors:
                print('author:', a)
            print(w['key'])
            print()
            raise AuthorRedirect
    for a in authors:
        print('author:', a)
    assert all(a['type']['key'] == '/type/author' for a in authors)

    try:
        subjects = four_types(get_work_subjects(w))
    except:
        print('bad work: ', w['key'])
        raise

    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(
        e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    #print 'has_fulltext:', has_fulltext

    for db_field, solr_field in field_map.items():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print('v:', v)
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get(
            'Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get(
                'Protected DAISY', 0) + 1
        #print w['key'], subjects['subject']

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set(e['subtitle'] for e in editions
                        if e.get('subtitle', None)
                        and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key',
                  re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set(e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(
        m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set(e[k]['value'] if isinstance(e[k], dict) else e[k]
             for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i
                          for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
    #    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set()  # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    in_library_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if not in_library_edition and 'inlibrary' in e['ia_collection']:
            in_library_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], str)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    #print 'lending_edition:', lending_edition
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(
        nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    elif in_library_edition:
        add_field(doc, 'lending_edition_s', in_library_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet',
                   (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
Example #34
def find_author(name):
    q = {'type': '/type/author', 'name': name}
    return [a['key'] for a in query_iter(q)]
Example #35
def books_query(akey):  # live version
    q = {'type': '/type/edition', 'authors': akey, '*': None}
    return query_iter(q)
Example #36
def build_doc(w):
    wkey = w['key']
    assert w['type']['key'] == '/type/work'
    title = w.get('title', None)
    if not title:
        return

    def get_pub_year(e):
        pub_date = e.get('publish_date', None)
        if pub_date:
            m = re_year.search(pub_date)
            if m:
                return m.group(1)

    if 'editions' not in w:
        q = { 'type':'/type/edition', 'works': wkey, '*': None }
        w['editions'] = list(query_iter(q))
        print('editions:', [e['key'] for e in w['editions']])

    editions = []
    for e in w['editions']:
        pub_year = get_pub_year(e)
        if pub_year:
            e['pub_year'] = pub_year
        if 'ocaid' in e:
            collection = get_ia_collection(e['ocaid'])
            print('collection:', collection)
            e['ia_collection'] = collection
            e['public_scan'] = ('lendinglibrary' not in collection) and ('printdisabled' not in collection)
        overdrive_id = e.get('identifiers', {}).get('overdrive', None)
        if overdrive_id:
            print('overdrive:', overdrive_id)
            e['overdrive'] = overdrive_id
        editions.append(e)

    editions.sort(key=lambda e: e.get('pub_year', ''))  # '' default: None and str don't compare in Python 3

    print(len(w['editions']), 'editions found')

    print(w['key'])
    work_authors = []
    authors = []
    author_keys = []
    for a in w.get('authors', []):
        if 'author' not in a:
            continue
        akey = a['author']['key']
        m = re_author_key.match(akey)
        if not m:
            print('invalid author key:', akey)
            continue
        work_authors.append(akey)
        author_keys.append(m.group(1))
        authors.append(withKey(akey))
    if any(a['type']['key'] == '/type/redirect' for a in authors):
        raise AuthorRedirect
    assert all(a['type']['key'] == '/type/author' for a in authors)

    #subjects = four_types(find_subjects(get_marc_subjects(w)))
    subjects = {}
    field_map = {
        'subjects': 'subject',
        'subject_places': 'place',
        'subject_times': 'time',
        'subject_people': 'person',
    }

    has_fulltext = any(e.get('ocaid', None) or e.get('overdrive', None) for e in editions)

    print('has_fulltext:', has_fulltext)

    for db_field, solr_field in field_map.items():
        if not w.get(db_field, None):
            continue
        cur = subjects.setdefault(solr_field, {})
        for v in w[db_field]:
            try:
                if isinstance(v, dict):
                    if 'value' not in v:
                        continue
                    v = v['value']
                cur[v] = cur.get(v, 0) + 1
            except:
                print('v:', v)
                raise

    if any(e.get('ocaid', None) for e in editions):
        subjects.setdefault('subject', {})
        subjects['subject']['Accessible book'] = subjects['subject'].get('Accessible book', 0) + 1
        if not has_fulltext:
            subjects['subject']['Protected DAISY'] = subjects['subject'].get('Protected DAISY', 0) + 1
        print(w['key'], subjects['subject'])

    doc = Element("doc")

    add_field(doc, 'key', w['key'][7:])
    title = w.get('title', None)
    if title:
        add_field(doc, 'title', title)
#        add_field(doc, 'title_suggest', title)

    add_field(doc, 'has_fulltext', has_fulltext)
    if w.get('subtitle', None):
        add_field(doc, 'subtitle', w['subtitle'])

    alt_titles = set()
    for e in editions:
        if 'title' in e and e['title'] != title:
            alt_titles.add(e['title'])
        for f in 'work_titles', 'other_titles':
            for t in e.get(f, []):
                if t != title:
                    alt_titles.add(t)
    add_field_list(doc, 'alternative_title', alt_titles)

    alt_subtitles = set( e['subtitle'] for e in editions if e.get('subtitle', None) and e['subtitle'] != w.get('subtitle', None))
    add_field_list(doc, 'alternative_subtitle', alt_subtitles)

    add_field(doc, 'edition_count', len(editions))
    for e in editions:
        add_field(doc, 'edition_key', re_edition_key.match(e['key']).group(1))

    cover_edition = pick_cover(w, editions)
    if cover_edition:
        add_field(doc, 'cover_edition_key', re_edition_key.match(cover_edition).group(1))

    k = 'by_statement'
    add_field_list(doc, k, set( e[k] for e in editions if e.get(k, None)))

    k = 'publish_date'
    pub_dates = set(e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, pub_dates)
    pub_years = set(m.group(1) for m in (re_year.search(i) for i in pub_dates) if m)
    if pub_years:
        add_field_list(doc, 'publish_year', pub_years)
        add_field(doc, 'first_publish_year', min(int(i) for i in pub_years))

    k = 'first_sentence'
    fs = set( e[k]['value'] if isinstance(e[k], dict) else e[k] for e in editions if e.get(k, None))
    add_field_list(doc, k, fs)

    publishers = set()
    for e in editions:
        publishers.update('Sine nomine' if is_sine_nomine(i) else i for i in e.get('publishers', []))
    add_field_list(doc, 'publisher', publishers)
#    add_field_list(doc, 'publisher_facet', publishers)

    field_map = [
        ('lccn', 'lccn'),
        ('publish_places', 'publish_place'),
        ('oclc_numbers', 'oclc'),
        ('contributions', 'contributor'),
    ]

    for db_key, search_key in field_map:
        v = set()
        for e in editions:
            if db_key not in e:
                continue
            v.update(e[db_key])
        add_field_list(doc, search_key, v)

    isbn = set()
    for e in editions:
        for f in 'isbn_10', 'isbn_13':
            for v in e.get(f, []):
                isbn.add(v.replace('-', ''))
    add_field_list(doc, 'isbn', isbn)

    lang = set()
    for e in editions:
        for l in e.get('languages', []):
            m = re_lang_key.match(l['key'] if isinstance(l, dict) else l)
            lang.add(m.group(1))
    if lang:
        add_field_list(doc, 'language', lang)

    pub_goog = set() # google
    pub_nongoog = set()
    nonpub_goog = set()
    nonpub_nongoog = set()

    public_scan = False
    all_collection = set()
    all_overdrive = set()
    lending_edition = None
    printdisabled = set()
    for e in editions:
        if 'overdrive' in e:
            all_overdrive.update(e['overdrive'])
        if 'ocaid' not in e:
            continue
        if not lending_edition and 'lendinglibrary' in e['ia_collection']:
            lending_edition = re_edition_key.match(e['key']).group(1)
        if 'printdisabled' in e['ia_collection']:
            printdisabled.add(re_edition_key.match(e['key']).group(1))
        all_collection.update(e['ia_collection'])
        assert isinstance(e['ocaid'], str)
        i = e['ocaid'].strip()
        if e['public_scan']:
            public_scan = True
            if i.endswith('goog'):
                pub_goog.add(i)
            else:
                pub_nongoog.add(i)
        else:
            if i.endswith('goog'):
                nonpub_goog.add(i)
            else:
                nonpub_nongoog.add(i)
    print('lending_edition:', lending_edition)
    ia_list = list(pub_nongoog) + list(pub_goog) + list(nonpub_nongoog) + list(nonpub_goog)
    add_field_list(doc, 'ia', ia_list)
    if has_fulltext:
        add_field(doc, 'public_scan_b', public_scan)
    if all_collection:
        add_field(doc, 'ia_collection_s', ';'.join(all_collection))
    if all_overdrive:
        add_field(doc, 'overdrive_s', ';'.join(all_overdrive))
    if lending_edition:
        add_field(doc, 'lending_edition_s', lending_edition)
    if printdisabled:
        add_field(doc, 'printdisabled_s', ';'.join(list(printdisabled)))

    author_keys = [re_author_key.match(a['key']).group(1) for a in authors]
    author_names = [a.get('name', '') for a in authors]
    add_field_list(doc, 'author_key', author_keys)
    add_field_list(doc, 'author_name', author_names)

    alt_names = set()
    for a in authors:
        if 'alternate_names' in a:
            alt_names.update(a['alternate_names'])

    add_field_list(doc, 'author_alternative_name', alt_names)
    add_field_list(doc, 'author_facet', (' '.join(v) for v in zip(author_keys, author_names)))
    #if subjects:
    #    add_field(doc, 'fiction', subjects['fiction'])

    for k in 'person', 'place', 'subject', 'time':
        if k not in subjects:
            continue
        add_field_list(doc, k, subjects[k].keys())
#        add_field_list(doc, k + '_facet', subjects[k].keys())
        subject_keys = [str_to_key(s) for s in subjects[k].keys()]
        add_field_list(doc, k + '_key', subject_keys)

    return doc
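
str_to_key, used above to derive the subject *_key fields, is defined elsewhere; a plausible minimal version, assuming the usual lowercase-and-underscore normalisation:

import re

re_not_az09 = re.compile('[^a-z0-9]+')

def str_to_key(s):
    # e.g. 'Science fiction' -> 'science_fiction'
    return re_not_az09.sub('_', s.lower()).strip('_')
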
Example #37
0
def switch_author(old, new, other):
    q = { 'authors': old, 'type': '/type/edition', }
    for e in query_iter(q):
        print('switch author:', e['key'])
        update_edition(e['key'], other, new)
Example #38
0
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'), 'w')
    works = list(works)
    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True: # until redirects repaired
        q = {'type':'/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions, file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

#    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
#    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
#    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works: # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works: # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works: # 3rd pass
        for wkey, v in w['existing_works'].items():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.items():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [title for title in wkey_to_new_title[existing_wkey] if title != w['title']]

    works_updated_this_session = set()

    for w in works: # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = next(iter(w['existing_works']))
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
                works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
                works_updated_this_session.update(updated)
            continue

        assert not any(other_matches(w, wkey) for wkey in w['existing_works'])
        best_match = max(w['existing_works'].items(), key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition, do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
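
get_with_retry, called when repairing redirects above, is not shown in this example; a sketch under the assumption that it simply retries ol.get() on transient API errors:

import time

def get_with_retry(key, attempts=5, delay=2):
    for attempt in range(attempts):
        try:
            return ol.get(key)
        except Exception:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)
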
Example #39
0
def update_author(akey, a=None, handle_redirects=True):
    # http://ia331507.us.archive.org:8984/solr/works/select?indent=on&q=author_key:OL22098A&facet=true&rows=1&sort=edition_count%20desc&fl=title&facet.field=subject_facet&facet.mincount=1
    if akey == '/authors/':
        return
    m = re_author_key.match(akey)
    if not m:
        logger.error('bad key: %s', akey)
    assert m
    author_id = m.group(1)
    if not a:
        a = withKey(akey)
    if a['type']['key'] in ('/type/redirect', '/type/delete') or not a.get('name', None):
        return ['<delete><query>key:%s</query></delete>' % author_id] 
    try:
        assert a['type']['key'] == '/type/author'
    except AssertionError:
        logger.error("AssertionError: %s", a['type']['key'])
        raise

    facet_fields = ['subject', 'time', 'person', 'place']

    if is_single_core():
        base_url = 'http://' + get_solr('works') + '/solr/select'
    else:
        base_url = 'http://' + get_solr('works') + '/solr/works/select'

    url = base_url + '?wt=json&json.nl=arrarr&q=author_key:%s&sort=edition_count+desc&rows=1&fl=title,subtitle&facet=true&facet.mincount=1' % author_id
    url += ''.join('&facet.field=%s_facet' % f for f in facet_fields)

    logger.info("urlopen %s", url)

    reply = json.load(urlopen(url))
    work_count = reply['response']['numFound']
    docs = reply['response'].get('docs', [])
    top_work = None
    if docs:
        top_work = docs[0]['title']
        if docs[0].get('subtitle', None):
            top_work += ': ' + docs[0]['subtitle']
    all_subjects = []
    for f in facet_fields:
        for s, num in reply['facet_counts']['facet_fields'][f + '_facet']:
            all_subjects.append((num, s))
    all_subjects.sort(reverse=True)
    top_subjects = [s for num, s in all_subjects[:10]]

    add = Element("add")
    doc = SubElement(add, "doc")
    
    if is_single_core():
        add_field(doc, 'key', "/authors/" + author_id)
        add_field(doc, 'type', "author")
    else:
        add_field(doc, 'key', author_id)

    if a.get('name', None):
        add_field(doc, 'name', a['name'])
    for f in 'birth_date', 'death_date', 'date':
        if a.get(f, None):
            add_field(doc, f, a[f])
    if top_work:
        add_field(doc, 'top_work', top_work)
    add_field(doc, 'work_count', work_count)
    add_field_list(doc, 'top_subjects', top_subjects)

    requests = []
    if handle_redirects:
        q = {'type': '/type/redirect', 'location': akey}
        try:
            redirects = ''.join('<id>%s</id>' % re_author_key.match(r['key']).group(1) for r in query_iter(q))
        except AttributeError:
            logger.error('AssertionError: redirects: %r', [r['key'] for r in query_iter(q)])
            raise
        if redirects:
            requests.append('<delete>' + redirects + '</delete>')

    requests.append(tostring(add).encode('utf-8'))
    return requests
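
The XML bodies returned by update_author still need to be posted to Solr; a hedged sketch of how that might look, with a placeholder update URL:

from urllib.request import Request, urlopen

def post_to_solr(update_requests, url='http://localhost:8983/solr/update'):
    for body in update_requests:
        if isinstance(body, str):
            body = body.encode('utf-8')  # delete queries are built as str
        req = Request(url, data=body, headers={'Content-Type': 'text/xml'})
        urlopen(req).read()

post_to_solr(update_author('/authors/OL22098A'))
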
Example #40
0
set_staging(True)
rc = read_rc()

ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

work_q = {
    'type': '/type/work',
    'authors': None,
    'title': None,
}

queue = []

for w in query_iter(work_q):
    if not w.get('authors'):
        print('no authors')
        continue
    if any(isinstance(a, dict) and 'author' in a for a in w['authors']):
        continue
    print(len(queue), w['key'], w['title']) # , ol.get(w['authors'][0]['key'])['name']
    full = ol.get(w['key'])
    authors = full['authors']
    assert all(isinstance(a, Reference) for a in authors)
    full['authors'] = [{'author':a} for a in authors]
    queue.append(full)
    if len(queue) > 1000:
        print('saving')
        print(ol.save_many(queue, 'update format of authors in works to provide roles'))
        queue = []

# save any final partial batch the loop above leaves behind
if queue:
    print('saving')
    print(ol.save_many(queue, 'update format of authors in works to provide roles'))
Example #41
0
def update_works(akey, works, do_updates=False):
    # we can now look up all works by an author
    if do_updates:
        rc = read_rc()
        ol.login('WorkBot', rc['WorkBot'])
    assert do_updates

    fh_log = open('/1/var/log/openlibrary/work_finder/' + strftime('%F_%T'),
                  'w')
    works = list(works)
    print(akey, file=fh_log)
    print('works:', file=fh_log)
    pprint(works, fh_log)

    while True:  # until redirects repaired
        q = {'type': '/type/edition', 'authors': akey, 'works': None}
        work_to_edition = defaultdict(set)
        edition_to_work = defaultdict(set)
        for e in query_iter(q):
            if not isinstance(e, dict):
                continue
            if e.get('works', None):
                for w in e['works']:
                    work_to_edition[w['key']].add(e['key'])
                    edition_to_work[e['key']].add(w['key'])

        work_by_key = {}
        fix_redirects = []
        for k, editions in work_to_edition.items():
            w = withKey(k)
            if w['type']['key'] == '/type/redirect':
                wkey = w['location']
                print('redirect found', w['key'], '->', wkey, editions,
                      file=fh_log)
                assert re_work_key.match(wkey)
                for ekey in editions:
                    e = get_with_retry(ekey)
                    e['works'] = [{'key': wkey}]
                    fix_redirects.append(e)
                continue
            work_by_key[k] = w
        if not fix_redirects:
            print('no redirects left', file=fh_log)
            break
        print('save redirects', file=fh_log)
        try:
            ol.save_many(fix_redirects, "merge works")
        except:
            for r in fix_redirects:
                print(r)
            raise

    all_existing = set()
    work_keys = []
    print('edition_to_work:', file=fh_log)
    print(repr(dict(edition_to_work)), file=fh_log)
    print(file=fh_log)
    print('work_to_edition', file=fh_log)
    print(repr(dict(work_to_edition)), file=fh_log)
    print(file=fh_log)

    #    open('edition_to_work', 'w').write(repr(dict(edition_to_work)))
    #    open('work_to_edition', 'w').write(repr(dict(work_to_edition)))
    #    open('work_by_key', 'w').write(repr(dict(work_by_key)))

    work_title_match = {}
    works_by_title = {}
    for w in works:  # 1st pass
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                try:
                    wtitle = work_by_key[wkey]['title']
                except:
                    print('bad work:', wkey)
                    raise
                if wtitle == w['title']:
                    work_title_match[wkey] = w['title']

    wkey_to_new_title = defaultdict(set)

    for w in works:  # 2nd pass
        works_by_title[w['title']] = w
        w['existing_works'] = defaultdict(int)
        for e in w['editions']:
            ekey = e['key'] if isinstance(e, dict) else e
            for wkey in edition_to_work.get(ekey, []):
                if wkey in work_title_match and work_title_match[wkey] != w['title']:
                    continue
                wtitle = work_by_key[wkey]['title']
                w['existing_works'][wkey] += 1
                wkey_to_new_title[wkey].add(w['title'])

    existing_work_with_conflict = defaultdict(set)

    for w in works:  # 3rd pass
        for wkey, v in w['existing_works'].items():
            if any(title != w['title'] for title in wkey_to_new_title[wkey]):
                w['has_conflict'] = True
                existing_work_with_conflict[wkey].add(w['title'])
                break

    for wkey, v in existing_work_with_conflict.items():
        cur_work = work_by_key[wkey]
        existing_titles = defaultdict(int)
        for ekey in work_to_edition[wkey]:
            e = withKey(ekey)
            title = e['title']
            if e.get('title_prefix', None):
                title = e['title_prefix'].strip() + ' ' + e['title']
            existing_titles[title] += 1
        best_match = max(v, key=lambda wt: existing_titles[wt])
        works_by_title[best_match]['best_match'] = work_by_key[wkey]
        for wtitle in v:
            del works_by_title[wtitle]['has_conflict']
            if wtitle != best_match:
                works_by_title[wtitle]['existing_works'] = {}

    def other_matches(w, existing_wkey):
        return [
            title for title in wkey_to_new_title[existing_wkey]
            if title != w['title']
        ]

    works_updated_this_session = set()

    for w in works:  # 4th pass
        if 'has_conflict' in w:
            pprint(w)
        assert 'has_conflict' not in w
        if len(w['existing_works']) == 1:
            existing_wkey = next(iter(w['existing_works']))
            if not other_matches(w, existing_wkey):
                w['best_match'] = work_by_key[existing_wkey]
        if 'best_match' in w:
            updated = update_work_with_best_match(akey, w, work_to_edition,
                                                  do_updates, fh_log)
            for wkey in updated:
                if wkey in works_updated_this_session:
                    print(wkey, 'already updated!', file=fh_log)
                    print(wkey, 'already updated!')
                works_updated_this_session.update(updated)
            continue
        if not w['existing_works']:
            updated = new_work(akey, w, do_updates, fh_log)
            for wkey in updated:
                assert wkey not in works_updated_this_session
                works_updated_this_session.update(updated)
            continue

        assert not any(
            other_matches(w, wkey) for wkey in w['existing_works'])
        best_match = max(w['existing_works'].items(),
                         key=lambda i: i[1])[0]
        w['best_match'] = work_by_key[best_match]
        updated = update_work_with_best_match(akey, w, work_to_edition,
                                              do_updates, fh_log)
        for wkey in updated:
            if wkey in works_updated_this_session:
                print(wkey, 'already updated!', file=fh_log)
                print(wkey, 'already updated!')
        works_updated_this_session.update(updated)

    #if not do_updates:
    #    return []

    return [withKey(key) for key in works_updated_this_session]
Example #42
0
def update_work(w, obj_cache=None, debug=False, resolve_redirects=False):
    if obj_cache is None:
        obj_cache = {}

    wkey = w['key']
    #assert wkey.startswith('/works')
    #assert '/' not in wkey[7:]
    deletes = []
    requests = []

    q = {'type': '/type/redirect', 'location': wkey}
    redirect_keys = [r['key'][7:] for r in query_iter(q)]

    deletes += redirect_keys
    deletes += [wkey[7:]]  # strip /works/ from /works/OL1234W

    # handle edition records as well
    # When an edition does not belong to a work, create a fake work and index it.
    if w['type']['key'] == '/type/edition' and w.get('title'):
        edition = w
        w = {
            # Use key as /works/OL1M.
            # With single-core Solr the full path is used as the key, so it
            # must be unique across all types of documents.
            # The website takes care of redirecting /works/OL1M to /books/OL1M.
            'key': edition['key'].replace("/books/", "/works/"),
            'type': {
                'key': '/type/work'
            },
            'title': edition['title'],
            'editions': [edition]
        }
        # Hack to add subjects when indexing /books/ia:xxx
        if edition.get("subjects"):
            w['subjects'] = edition['subjects']

    if w['type']['key'] == '/type/work' and w.get('title'):
        try:
            d = build_data(w,
                           obj_cache=obj_cache,
                           resolve_redirects=resolve_redirects)
            doc = dict2element(d)
        except:
            logger.error("failed to update work %s", w['key'], exc_info=True)
        else:
            if d is not None:
                # Delete all ia:foobar keys
                # XXX-Anand: The works in in_library subject were getting wiped off for unknown reasons.
                # I suspect that this might be a cause. Disabling temporarily.
                #if d.get('ia'):
                #    deletes += ["ia:" + iaid for iaid in d['ia']]

                # In single core solr, we use full path as key, not just the last part
                if is_single_core():
                    deletes = ["/works/" + k for k in deletes]

                requests.append(make_delete_query(deletes))

                add = Element("add")
                add.append(doc)
                add_xml = tostring(add).encode('utf-8')
                requests.append(add_xml)
    elif w['type']['key'] == '/type/delete':
        # In single core solr, we use full path as key, not just the last part
        if is_single_core():
            deletes = ["/works/" + k for k in deletes]
        requests.append(make_delete_query(deletes))

    return requests
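
make_delete_query is called above but not defined in this example; a minimal sketch consistent with how the other examples build Solr XML (an assumption, not the actual implementation):

from xml.etree.ElementTree import Element, SubElement, tostring

def make_delete_query(keys):
    # builds <delete><query>key:OL123W</query>...</delete>
    delete = Element('delete')
    for k in keys:
        query = SubElement(delete, 'query')
        query.text = 'key:%s' % k
    return tostring(delete)  # bytes under Python 3's ElementTree
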