Exemplo n.º 1
0
def by_authors():
    """Create works for authors that do not have any yet.

    Authors are read with ``query_iter`` starting at offset 215000.  Every
    author up to and including ``/a/OL218496A`` is skipped (resume point of
    an earlier run).  For each remaining author without an existing work,
    ``find_works`` is consulted and candidate works with more than two
    editions are handed to ``add_works``.
    """
    find_new_work_key()

    # Resume support: stay in skip mode until the last author handled by the
    # previous run has been seen.  Set to False to start from the offset.
    skipping = True
    author_q = {'type': '/type/author', 'name': None, 'works': None}
    for a in query_iter(author_q, offset=215000):
        akey = a['key']
        if skipping:
            print('skipping: %s %s' % (akey, a['name']))
            if akey == '/a/OL218496A':
                skipping = False
            continue

        # Authors that already have at least one work are left alone.
        if query({'type': '/type/work', 'authors': akey}):
            print('%s %s has works' % (akey, repr(a['name'])))
            continue

        found = find_works(akey)
        works = [i for i in found if len(i['editions']) > 2]
        if works:
            print('%s %s' % (akey, repr(a['name'])))
            add_works(akey, works)
            print('')
Exemplo n.º 2
0
def dates():
    """Set ``first_publish_date`` on works that do not carry the field.

    For each such work the ``publish_date`` of its editions is matched
    against ``re_year``; the smallest year found is queued as an update.
    Updates go through ``ol.write`` in batches of 200, followed by one final
    write for the remainder.
    """
    f = 'first_publish_date'
    # The batch buffer must be bound before the loop: the function rebinds
    # ``queue`` after every flush, which makes the name local, so without
    # this line the first append raises UnboundLocalError.
    queue = []
    for w in iter_works([f, 'title']):
        # NOTE(review): sibling code tests ``f in w and w[f]``; if the query
        # returns the key with an empty value this skips every work -- confirm.
        if f in w:
            continue
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        years = defaultdict(list)
        for e in query_iter(q):
            date = e.get('publish_date', None)
            if not date or date == '0000':
                continue
            m = re_year.match(date)
            if not m:
                continue
            years[int(m.group(1))].append(e['key'])
        if not years:
            continue
        first = min(years)
        assert first != 0
        print('%s %s %s' % (w['key'], repr(w['title']), first))
        queue.append({
            'key': w['key'],
            f: {'connect': 'update', 'value': str(first)},
        })
        if len(queue) == 200:
            print(ol.write(queue, comment='add first publish date'))
            queue = []
    print(ol.write(queue, comment='add first publish date'))
Exemplo n.º 3
0
def dates():
    """Set ``first_publish_date`` on works that do not carry the field.

    For each such work the ``publish_date`` of its editions is matched
    against ``re_year``; the smallest year found is queued as an update.
    Updates are appended to the module-level ``queue`` and written through
    ``ol.write`` in batches of 200, followed by one final write.
    """
    # The batch buffer is the module-level list; it is rebound after a flush.
    global queue
    f = 'first_publish_date'
    for w in iter_works([f, 'title']):
        # NOTE(review): sibling code tests ``f in w and w[f]``; if the query
        # returns the key with an empty value this skips every work -- confirm.
        if f in w:
            continue
        q = {'type': '/type/edition', 'works': w['key'], 'publish_date': None}
        years = defaultdict(list)
        for e in query_iter(q):
            date = e.get('publish_date', None)
            if not date or date == '0000':
                continue
            m = re_year.match(date)
            if not m:
                continue
            years[int(m.group(1))].append(e['key'])
        if not years:
            continue
        first = min(years)
        assert first != 0
        # One space-separated log line; the doubled parentheses left by the
        # automated conversion printed a tuple repr instead.
        print('%s %s %s' % (w['key'], repr(w['title']), first))
        queue.append({'key': w['key'], f: {'connect': 'update', 'value': str(first)}})
        if len(queue) == 200:
            print(ol.write(queue, comment='add first publish date'))
            queue = []
    print(ol.write(queue, comment='add first publish date'))
Exemplo n.º 4
0
def lang():
    """Set ``original_languages`` on works from their same-titled editions.

    A work is updated only when every edition that lists languages lists
    exactly one, at least two editions share the work's normalised title,
    and all of those editions agree on the language.  Updates are written
    through ``ol.write`` in batches of 200, with a final write at the end.
    """
    f = 'original_languages'
    queue = []
    for w in iter_works([f, 'title']):
        if f in w and w[f]:
            continue
        q = {
            'type': '/type/edition',
            'works': w['key'],
            'languages': None,
            'title': None,
            'title_prefix': None,
        }
        editions = [e for e in query_iter(q) if e['languages']]
        title = mk_norm(w['title'])
        if not editions or any(len(e['languages']) != 1 for e in editions):
            continue
        # Language keys of the editions whose normalised title equals the
        # work's.  (Named ``langs`` so the function name is not shadowed.)
        langs = [
            e['languages'][0]['key'] for e in editions
            if mk_norm(get_title(e)) == title
        ]
        if len(langs) < 2:
            continue
        first = langs[0]
        if any(key != first for key in langs):
            continue
        # One space-separated log line; the doubled parentheses left by the
        # automated conversion printed a tuple repr instead.
        print('%s %s %s %s' % (w['key'], repr(w['title']), first, len(langs)))
        queue.append({'key': w['key'], f: {'connect': 'update_list', 'value': [first]}})
        if len(queue) == 200:
            print(ol.write(queue, comment='add original language'))
            queue = []
    print(ol.write(queue, comment='add original language'))
Exemplo n.º 5
0
def lang():
    """Set ``original_languages`` on works from their same-titled editions.

    A work is updated only when every edition that lists languages lists
    exactly one, at least two editions share the work's normalised title,
    and all of those editions agree on the language.  Updates are written
    through ``ol.write`` in batches of 200, with a final write at the end.
    """
    f = 'original_languages'
    queue = []
    for w in iter_works([f, 'title']):
        if f in w and w[f]:
            continue
        q = {
            'type': '/type/edition',
            'works': w['key'],
            'languages': None,
            'title': None,
            'title_prefix': None,
        }
        editions = [e for e in query_iter(q) if e['languages']]
        title = mk_norm(w['title'])
        if not editions or any(len(e['languages']) != 1 for e in editions):
            continue
        # Language keys of the editions whose normalised title equals the
        # work's.  (Named ``langs`` so the function name is not shadowed.)
        langs = [e['languages'][0]['key'] for e in editions
                 if mk_norm(get_title(e)) == title]
        if len(langs) < 2:
            continue
        first = langs[0]
        if any(key != first for key in langs):
            continue
        # Single-argument print() behaves the same as a statement and as a
        # function, so the block no longer depends on Python 2 syntax.
        print('%s %s %s %s' % (w['key'], repr(w['title']), first, len(langs)))
        queue.append({
            'key': w['key'],
            f: {'connect': 'update_list', 'value': [first]},
        })
        if len(queue) == 200:
            print(ol.write(queue, comment='add original language'))
            queue = []
    print(ol.write(queue, comment='add original language'))
Exemplo n.º 6
0
def get_books(akey):
    """Yield one summary dict per usable edition credited to *akey*.

    Editions without a title, or with anything other than exactly one
    author, are ignored, as are editions whose full title (prefix included,
    dots and spaces stripped) is one of a few generic collection names.

    Each yielded dict has ``title``, ``norm_title`` and ``key``; ``lang``,
    ``table_of_contents``, ``work_title`` and ``norm_wt`` are added when the
    edition provides the corresponding data.
    """
    generic_titles = ['Publications', 'Works', 'Report', 'Letters', 'Calendar',
                      'Bulletin', 'Plays', 'Sermons', 'Correspondence']
    edition_query = {'type': '/type/edition', 'authors': akey, '*': None}

    for edition in query_iter(edition_query):
        if not edition.get('title', None) or len(edition.get('authors', [])) != 1:
            continue

        # Full title = optional prefix (space-terminated) + title.
        title = edition['title']
        if 'title_prefix' in edition and edition['title_prefix']:
            lead = edition['title_prefix']
            if not lead.endswith(' '):
                lead += ' '
            title = lead + title

        if title.strip('. ') in generic_titles:
            continue

        # When re_parens matches, only its first group is kept as the title.
        paren = re_parens.match(title)
        if paren:
            title = paren.group(1)

        book = {
            'title': title,
            'norm_title': mk_norm(title),
            'key': edition['key'],
        }

        if 'languages' in edition:
            # Drop the first three characters of each language key.
            book['lang'] = [ref['key'][3:] for ref in edition['languages']]

        toc = edition.get('table_of_contents', None)
        if toc:
            head = toc[0]
            if isinstance(head, basestring):
                book['table_of_contents'] = toc
            else:
                assert isinstance(head, dict)
                # Only /type/text entries are unwrapped; other dict-typed
                # tables of contents are not copied.
                if head['type'] == '/type/text':
                    book['table_of_contents'] = [item['value'] for item in toc]

        work_titles = edition.get('work_titles', None)
        if work_titles:
            wt = work_titles[0].strip('. ')
            # 'Works' / 'Selections' carry no information about the work.
            if wt not in ('Works', 'Selections'):
                norm_wt = mk_norm(wt)
                book['work_title'] = wt
                book['norm_wt'] = norm_wt
        yield book
Exemplo n.º 7
0
def is_loaded(loc):
    """Return True when the record *loc* (``'marc:...'``) is already known.

    The ``machine_comment`` table of ``marc_index`` is checked first, using
    *loc* without its ``marc:`` prefix; if it has no row, editions whose
    ``source_records`` contain the full *loc* are looked up instead.
    """
    assert loc.startswith('marc:')
    params = {'loc': loc[5:]}
    rows = list(marc_index.query('select * from machine_comment where v=$loc', params))
    if rows:
        return True
    editions = list(query_iter({'type': '/type/edition', 'source_records': loc}))
    return len(editions) > 0
def is_loaded(loc):
    """Return True when the record *loc* (``'marc:...'``) is already known.

    The ``machine_comment`` table of ``marc_index`` is checked first, using
    *loc* without its ``marc:`` prefix; if it has no row, editions whose
    ``source_records`` contain the full *loc* are looked up instead.
    """
    assert loc.startswith('marc:')
    params = {'loc': loc[5:]}
    rows = list(marc_index.query('select * from machine_comment where v=$loc', params))
    if rows:
        return True
    editions = list(query_iter({'type': '/type/edition', 'source_records': loc}))
    return len(editions) > 0
Exemplo n.º 9
0
 def isbn_search(self, v):
     """Yield a status line for ISBN *v*, then whatever ``self.search`` yields.

     Editions whose ``isbn_10`` equals *v* are fetched (title and subtitle
     requested) and each one has ``isbn_10`` set to *v* before the list is
     handed to ``self.search``.
     """
     edition_query = {'type': '/type/edition', 'isbn_10': v, 'title': None, 'subtitle': None}
     matches = []
     for edition in query_iter(edition_query):
         edition['isbn_10'] = v
         matches.append(edition)
     quoted = web.htmlquote(v)
     yield 'searching for ISBN ' + quoted + ': '
     for fragment in self.search(matches):
         yield fragment
Exemplo n.º 10
0
def author_search(name):
    """Return the authors named *name* that carry some date information.

    An author is kept when at least one of ``birth_date``, ``death_date``
    or ``dates`` is present and truthy.
    """
    date_fields = ('birth_date', 'death_date', 'dates')
    author_query = {'type': '/type/author', 'name': name}
    for field in date_fields:
        author_query[field] = None

    dated = []
    for author in query_iter(author_query):
        if any(author.get(field, None) for field in date_fields):
            dated.append(author)
    return dated
Exemplo n.º 11
0
def author_search(name):
    """Return the authors named *name* that carry some date information.

    An author is kept when at least one of ``birth_date``, ``death_date``
    or ``dates`` is present and truthy.
    """
    date_fields = ('birth_date', 'death_date', 'dates')
    author_query = {'type': '/type/author', 'name': name}
    for field in date_fields:
        author_query[field] = None

    dated = []
    for author in query_iter(author_query):
        if any(author.get(field, None) for field in date_fields):
            dated.append(author)
    return dated
Exemplo n.º 12
0
 def title_search(self, v):
     """Yield a status line for title *v*, then whatever ``self.search`` yields.

     Editions whose ``title`` equals *v* are fetched (``isbn_10`` requested)
     and each one has ``title`` set to *v* before the list is handed to
     ``self.search``.
     """
     q = {"type": "/type/edition", "isbn_10": None, "title": v}
     editions = []
     for e in query_iter(q):
         e["title"] = v
         editions.append(e)
     # Status text shown to the user; the original spelled it "searcing".
     yield 'searching for title "' + web.htmlquote(v) + '": '
     for i in self.search(editions):
         yield i
Exemplo n.º 13
0
 def title_search(self, v):
     """Yield a status line for title *v*, then whatever ``self.search`` yields.

     Editions whose ``title`` equals *v* are fetched (``isbn_10`` requested)
     and each one has ``title`` set to *v* before the list is handed to
     ``self.search``.
     """
     q = {'type': '/type/edition', 'isbn_10': None, 'title': v}
     editions = []
     for e in query_iter(q):
         e['title'] = v
         editions.append(e)
     # Status text shown to the user; the original spelled it "searcing".
     yield 'searching for title "' + web.htmlquote(v) + '": '
     for i in self.search(editions):
         yield i
Exemplo n.º 14
0
 def isbn_search(self, v):
     """Yield a status line for ISBN *v*, then whatever ``self.search`` yields.

     Editions whose ``isbn_10`` equals *v* are fetched (title and subtitle
     requested) and each one has ``isbn_10`` set to *v* before the list is
     handed to ``self.search``.
     """
     edition_query = {"type": "/type/edition", "isbn_10": v, "title": None, "subtitle": None}
     matches = []
     for edition in query_iter(edition_query):
         edition["isbn_10"] = v
         matches.append(edition)
     quoted = web.htmlquote(v)
     yield "searching for ISBN " + quoted + ": "
     for fragment in self.search(matches):
         yield fragment
Exemplo n.º 15
0
 def isbn_search(self, v):
     """Yield a status line for ISBN *v*, then whatever ``self.search`` yields.

     Editions whose ``isbn_10`` equals *v* are fetched (title and subtitle
     requested) and each one has ``isbn_10`` set to *v* before the list is
     handed to ``self.search``.
     """
     edition_query = {'type': '/type/edition', 'isbn_10': v, 'title': None, 'subtitle': None}
     matches = []
     for edition in query_iter(edition_query):
         edition['isbn_10'] = v
         matches.append(edition)
     quoted = web.htmlquote(v)
     yield 'searching for ISBN ' + quoted + ': '
     for fragment in self.search(matches):
         yield fragment
Exemplo n.º 16
0
def get_keys(loc):
    """Return the keys recorded for the record *loc* (``'marc:...'``).

    Keys come from the ``k`` column of ``machine_comment`` in ``marc_index``
    (looked up with the ``marc:`` prefix removed).  When that table has no
    row, the keys of editions whose ``source_records`` contain the full
    *loc* are returned instead.
    """
    assert loc.startswith('marc:')
    params = {'loc': loc[5:]}
    rows = list(marc_index.query('select k from machine_comment where v=$loc', params))
    if rows:
        return [row.k for row in rows]
    edition_keys = []
    for edition in query_iter({'type': '/type/edition', 'source_records': loc}):
        edition_keys.append(edition['key'])
    return edition_keys
Exemplo n.º 17
0
 def title_search(self, v):
     """Yield a status line for title *v*, then whatever ``self.search`` yields.

     Editions whose ``title`` equals *v* are fetched (``isbn_10`` requested)
     and each one has ``title`` set to *v* before the list is handed to
     ``self.search``.
     """
     q = {'type': '/type/edition', 'isbn_10': None, 'title': v}
     editions = []
     for e in query_iter(q):
         e['title'] = v
         editions.append(e)
     # Status text shown to the user; the original spelled it "searcing".
     yield 'searching for title "' + web.htmlquote(v) + '": '
     for i in self.search(editions):
         yield i
Exemplo n.º 18
0
def get_keys(loc):
    """Return the keys recorded for the record *loc* (``'marc:...'``).

    Keys come from the ``k`` column of ``machine_comment`` in ``marc_index``
    (looked up with the ``marc:`` prefix removed).  When that table has no
    row, the keys of editions whose ``source_records`` contain the full
    *loc* are returned instead.
    """
    assert loc.startswith('marc:')
    params = {'loc': loc[5:]}
    rows = list(marc_index.query('select k from machine_comment where v=$loc', params))
    if rows:
        return [row.k for row in rows]
    edition_keys = []
    for edition in query_iter({'type': '/type/edition', 'source_records': loc}):
        edition_keys.append(edition['key'])
    return edition_keys
Exemplo n.º 19
0
 def oclc_search(self, v):
     """Yield a status line for OCLC number *v*, then what ``self.search`` yields.

     Editions whose ``oclc_numbers`` match *v* are fetched (title, subtitle
     and ``isbn_10`` requested) and each one has ``oclc_numbers`` set to *v*
     before the list is handed to ``self.search``.
     """
     q = {'type': '/type/edition', 'oclc_numbers': v, 'title': None, 'subtitle': None, 'isbn_10': None}
     editions = []
     # Debug trace of the query.  Written as a single-argument call so it
     # works both as a print statement and as the print function.
     print(q)
     for e in query_iter(q):
         e['oclc_numbers'] = v
         editions.append(e)
     yield 'searching for OCLC ' + web.htmlquote(v) + ': '
     for i in self.search(editions):
         yield i
Exemplo n.º 20
0
 def oclc_search(self, v):
     """Yield a status line for OCLC number *v*, then what ``self.search`` yields.

     Editions whose ``oclc_numbers`` match *v* are fetched (title, subtitle
     and ``isbn_10`` requested) and each one has ``oclc_numbers`` set to *v*
     before the list is handed to ``self.search``.
     """
     q = {'type': '/type/edition', 'oclc_numbers': v, 'title': None, 'subtitle': None, 'isbn_10': None}
     editions = []
     # Debug trace of the query.  Written as a single-argument call so it
     # works both as a print statement and as the print function.
     print(q)
     for e in query_iter(q):
         e['oclc_numbers'] = v
         editions.append(e)
     yield 'searching for OCLC ' + web.htmlquote(v) + ': '
     for i in self.search(editions):
         yield i
Exemplo n.º 21
0
def other_editions(title, wkey, work_author):
    """Yield ``(edition, author)`` pairs for look-alike editions of a work.

    Looks at editions titled exactly *title* that are not already linked to
    the work *wkey* and are not credited to *work_author* itself.  For each
    author of such an edition whose name ``match_name`` accepts against the
    work author's name (last-name-only matches allowed), the edition and the
    loaded author record are yielded.
    """
    wakey = work_author['key']
    q = {'type': '/type/edition', 'title': title}
    for k in 'works', 'title_prefix', 'key', 'authors':
        q[k] = None
    for e in query_iter(q):
        if not e.get('authors', None):
            continue
        # Already attached to this work.
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        # Already credited to the work's own author record.
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for author_ref in e['authors']:
            a = withKey(author_ref['key'])
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)
Exemplo n.º 22
0
def other_editions(title, wkey, work_author):
    """Yield ``(edition, author)`` pairs for look-alike editions of a work.

    Looks at editions titled exactly *title* that are not already linked to
    the work *wkey* and are not credited to *work_author* itself.  For each
    author of such an edition whose name ``match_name`` accepts against the
    work author's name (last-name-only matches allowed), the edition and the
    loaded author record are yielded.
    """
    wakey = work_author['key']
    q = {'type': '/type/edition', 'title': title}
    for k in 'works', 'title_prefix', 'key', 'authors':
        q[k] = None
    for e in query_iter(q):
        if not e.get('authors', None):
            continue
        # Already attached to this work.
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        # Already credited to the work's own author record.
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for author_ref in e['authors']:
            a = withKey(author_ref['key'])
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)
Exemplo n.º 23
0
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for akey in (a['key'] for a in e.get('authors', [])):
            a = withKey(akey)
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)


# Walk every work, requesting its key, title and authors.
q = {'type': '/type/work'}
for k in 'key', 'title', 'authors':
    q[k] = None

for w in query_iter(q):
    wkey = w['key']
    # Start the title set with the work's own title.
    titles = set([w['title']])
    # Note: rebinds ``q``; the outer iteration above was already started
    # with the work query.
    q = {'type': '/type/edition', 'works': wkey}
    for k in 'title', 'title_prefix', 'key', 'authors':
        q[k] = None

    # Only the first listed author is used as the work's author.
    # NOTE(review): raises if a work has no authors -- confirm every work has one.
    wakey = w['authors'][0]['key']
    work_author = withKey(wakey)

    # Add both get_title(e) and the bare title of each titled edition.
    for e in query_iter(q):
        if not e.get('title', None):
            continue
        titles.update([get_title(e), e['title']])

    found = []
Exemplo n.º 24
0
def add_fields():
    """Copy edition-level metadata up to works that are missing it.

    For every work lacking one of ``fields``, the work's editions are
    fetched once.  ``publish_date`` becomes the work's
    ``first_publish_date`` (smallest year matched by ``re_year``).  Any other
    field is copied only when every edition that has a value agrees on it.
    Updates are written through ``ol.write`` in batches of 200, with a final
    write at the end.
    """
    comment = 'add fields to works'
    queue = []
    seen = set()
    fields = ['genres', 'first_sentence', 'dewey_number',
              'lc_classifications', 'publish_date']  # , 'table_of_contents']
    for w in iter_works(fields + ['title']):
        if w['key'] in seen or all(w.get(f, None) for f in fields):
            continue
        seen.add(w['key'])
        q = {'type': '/type/edition', 'works': w['key']}
        for f in fields:
            q[f] = None
        # Fetched once and reused for every field below.
        editions = list(query_iter(q))

        found = {}

        for f in fields:
            if w.get(f, None):
                continue
            if f == 'publish_date':
                years = []
                for e in editions:
                    date = e.get(f, None)
                    if not date or date == '0000':
                        continue
                    m = re_year.match(date)
                    if m:
                        years.append(int(m.group(1)))
                if years:
                    found[f] = str(min(years))
                continue
            # The three cases are mutually exclusive.  They must form one
            # if/elif/else chain: with two separate ``if`` statements the
            # genres result was always overwritten by the generic branch.
            if f == 'genres':
                # Ignore editions with a translation genre (any case) and
                # strip dots from the rest.
                found_list = [
                    [g.strip('.') for g in e[f]]
                    for e in editions
                    if e.get(f, None)
                    and not any('translation' in i.lower() for i in e[f])
                ]
            elif f == 'table_of_contents':
                found_list = []
                for e in editions:
                    if not e.get(f, None):
                        continue
                    toc = e[f]
                    print(e['key'], toc)
                    print(e)
                    print()
                    if isinstance(toc[0], six.string_types):
                        found_list.append(toc_items(toc))
                    else:
                        assert isinstance(toc[0], dict)
                        if toc[0]['type'] == '/type/text':
                            found_list.append(
                                toc_items([i['value'] for i in toc]))
                        else:
                            assert toc[0]['type']['key'] == '/type/toc_item'
                            found_list.append(toc)
            else:
                found_list = [e[f] for e in editions if e.get(f, None)]
            # Copy a value only when all editions that have one agree.
            if found_list:
                first = found_list[0]
                if all(i == first for i in found_list):
                    found[f] = first

        if not found:
            continue

        print(len(queue) + 1, w['key'], len(editions), w['title'])
        print(found)

        q = {'key': w['key']}
        for f in fields:
            if f not in found:
                continue
            if f == 'publish_date':
                q['first_publish_date'] = {'connect': 'update', 'value': found[f]}
            elif f == 'first_sentence':
                q[f] = {'connect': 'update', 'value': found[f]}
            else:
                q[f] = {'connect': 'update_list', 'value': found[f]}
        queue.append(q)
        if len(queue) == 200:
            print(ol.write(queue, comment=comment))
            queue = []
    print(ol.write(queue, comment=comment))
Exemplo n.º 25
0
def iter_works(fields):
    """Return ``query_iter`` over all works, requesting ``key`` plus *fields*."""
    work_query = {'type': '/type/work', 'key': None}
    # Each requested field is added with a None value.
    work_query.update((name, None) for name in fields)
    return query_iter(work_query)
# Route printed output through a UTF-8 encoding writer.
# NOTE(review): this wraps sys.stdout as a byte stream; confirm the target
# interpreter, since a text-mode stdout may not accept the encoded output.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

# API client for dev.openlibrary.org, logged in as EdwardBot with the
# credential stored under that name in ``rc``.
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# Trailing "X." tokens that are initials or abbreviations rather than a
# closing full stop.  The pattern has to be a raw string: in a plain literal
# ``\b`` is the backspace character, so the word-boundary test never matched
# and abbreviations such as "Jr." were treated as having a removable dot.
re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')

def has_dot(s):
    """Return True when *s* ends in a dot that is not part of an abbreviation.

    A final dot preceded by a single capital letter or by one of the
    abbreviations listed in ``re_skip`` does not count.
    """
    return s.endswith('.') and not re_skip.search(s)

# Scan editions (requesting table_of_contents and subjects) and build an
# update for those whose subjects end in a removable dot.
q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None }
queue = []
count = 0
for e in query_iter(q):
    # Skip editions without subjects or without any subject flagged by has_dot().
    if not e.get('subjects', None) or not any(has_dot(s) for s in e['subjects']):
        continue
    # Drop the last character of each flagged subject; others are kept as is.
    subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
    # Note: rebinds ``q``; the iteration above was already started with the
    # edition query.
    q = {
        'key': e['key'],
        'subjects': {'connect': 'update_list', 'value': subjects },
    }
    # need to fix table_of_contents to pass validation
    toc = e['table_of_contents']
    if toc and (isinstance(toc[0], six.string_types) or toc[0]['type'] == '/type/text'):
        if isinstance(toc[0], six.string_types):
            assert all(isinstance(i, six.string_types) for i in toc)
            # Plain-string entries are wrapped as /type/toc_item dicts.
            new_toc = [{'title': i, 'type': '/type/toc_item'} for i in toc]
        else:
            assert all(i['type'] == '/type/text' for i in toc)
Exemplo n.º 27
0
from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields
rc = read_rc()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

# API client for dev.openlibrary.org, logged in as EdwardBot with the
# credential stored under that name in ``rc``.
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# For editions returned by the query (limit=100 is passed to query_iter),
# locate the source MARC record and print the indicators and subfields of
# its first 041 field.
q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None}
queue = []
count = 0
for e in query_iter(q, limit=100):
    key = e['key']
    mc = get_mc(key)
    if not mc:
        # No machine comment, so there is no record to fetch.
        continue
    data = get_from_local(mc)
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Single-argument print() works both as a statement and as a function.
    print('%s %s %s' % (key, line[0:2], list(get_all_subfields(line))))
Exemplo n.º 28
0
def find_author(name):
    """Return the keys of all author records whose name equals *name*."""
    author_keys = []
    for author in query_iter({'type': '/type/author', 'name': name}):
        author_keys.append(author['key'])
    return author_keys
Exemplo n.º 29
0
from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields
rc = read_rc()

sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)

# API client for dev.openlibrary.org, logged in as EdwardBot with the
# credential stored under that name in ``rc``.
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# For editions returned by the query (limit=100 is passed to query_iter),
# locate the source MARC record and print the indicators and subfields of
# its first 041 field.
q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None}
queue = []
count = 0
for e in query_iter(q, limit=100):
    key = e['key']
    mc = get_mc(key)
    if not mc:
        # No machine comment, so there is no record to fetch.
        continue
    data = get_from_local(mc)
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Single-argument print() works both as a statement and as a function.
    print('%s %s %s' % (key, line[0:2], list(get_all_subfields(line))))

Exemplo n.º 30
0
def iter_works(fields):
    """Return ``query_iter`` over all works, requesting ``key`` plus *fields*."""
    work_query = {'type': '/type/work', 'key': None}
    # Each requested field is added with a None value.
    work_query.update((name, None) for name in fields)
    return query_iter(work_query)
Exemplo n.º 31
0
def search(author, name):
    """Return an HTML report matching an author's editions to dated authors.

    *author* is the author key whose editions are listed; *name* is the
    author's name.  Each edition's ISBNs are looked up with ``search_query``
    and the resulting records are grouped by ``data_from_marc``.  Name
    variants collected on the way are searched with ``author_search``, and
    every edition row shows the author record whose dates match, if exactly
    one does.

    Returns a string: an optional ``<ul>`` of candidate authors followed by
    a ``<table>`` with one row per edition.
    """
    q = {'type': '/type/edition', 'authors': author, 'title_prefix': None, 'title': None, 'isbn_10': None}
    editions = list(query_iter(q))
    names = set([name])
    books = []
    for e in editions:
        locs = set()
        for i in e['isbn_10'] or []:
            locs.update(search_query('isbn', i))
        title = (e['title_prefix'] or '') + e['title']
        if not locs:
            books.append((e['key'], title, None, []))
            continue
        by_author = data_from_marc(locs, name)
        if len(by_author) != 1:
            # NOTE(review): this keeps one entry per group rather than a flat
            # list of locations -- confirm that is what the row should show.
            books.append((e['key'], title, None, list(by_author.values())))
            continue
        # Exactly one group: take its key and locations (works on both
        # list-returning and view-returning dict methods).
        marc_author, locs = next(iter(by_author.items()))
        names.update(marc_author[0:2])
        books.append((e['key'], title, marc_author, locs))

    # Add the "Last, First" form of every name that is not already inverted.
    inverted = set()
    for n in names:
        if ', ' in n:
            continue
        first, sep, last = n.rpartition(' ')
        if not sep:
            # Single-word name: nothing to invert (slicing around a failed
            # rfind produced a truncated duplicate here before).
            continue
        inverted.add("%s, %s" % (last, first))
    names.update(inverted)

    authors = []
    for n in names:
        authors.extend(author_search(n))

    for a in authors:
        q = {
            'type': '/type/edition',
            'authors': a['key'],
            'title_prefix': None,
            'title': None,
            'isbn_10': None
        }
        a['editions'] = list(query_iter(q))

    # Cache of date tuple -> the single matching author record, or None.
    author_map = {}
    rows = []
    for key, title, a, locs in books:
        row = '<tr><td><a href="http://openlibrary.org' + key + '">' + web.htmlquote(title) + '</a>'
        row += '<br>' + ', '.join('<a href="http://openlibrary.org/show-marc/%s">%s</a>' % (i, i) for i in locs) + '</td>'
        if a:
            # Built for every row so the "no match" cell below never shows
            # the dates of a different, previously processed author.
            dates = {'birth_date': a[2][0], 'death_date': a[2][1], 'dates': a[2][2]}
            if a[2] not in author_map:
                db_match = [db for db in authors if author_dates_match(dates, db)]
                author_map[a[2]] = db_match[0] if len(db_match) == 1 else None

            match = author_map[a[2]]
            if match:
                row += '<td><a href="http://openlibrary.org%s">%s-%s</a></td>' % (match['key'], match['birth_date'] or '', match['death_date'] or '')
            else:
                row += '<td>%s-%s (no match)</td>' % (dates['birth_date'] or '', dates['death_date'] or '')
        row += '</tr>\n'
        rows.append(row)
    t = ''.join(rows)

    ret = ''
    if authors:
        ret += '<ul>'
        for a in authors:
            # NOTE(review): the link text is the searched ``name``, not the
            # matched author's own name -- confirm this is intended.
            ret += '<li><a href="http://openlibrary.org%s">%s</a> (%s-%s) %d editions' % (a['key'], web.htmlquote(name), a['birth_date'] or '', a['death_date'] or '', len(a['editions']))
        ret += '</ul>'

    return ret + '<table>' + t + '</table>'
Exemplo n.º 32
0
set_staging(True)

# API client for dev.openlibrary.org, logged in as EdwardBot with the
# credential stored under that name in ``rc``.
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

# Trailing "X." tokens that are initials or abbreviations rather than a
# closing full stop.  The pattern has to be a raw string: in a plain literal
# ``\b`` is the backspace character, so the word-boundary test never matched
# and abbreviations such as "Jr." were treated as having a removable dot.
re_skip = re.compile(r'\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$')


def has_dot(s):
    """Return True when *s* ends in a dot that is not part of an abbreviation.

    A final dot preceded by a single capital letter or by one of the
    abbreviations listed in ``re_skip`` does not count.
    """
    return s.endswith('.') and not re_skip.search(s)


# Scan editions (requesting table_of_contents and subjects) and build an
# update for those whose subjects end in a removable dot.
q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None}
queue = []
count = 0
for e in query_iter(q):
    # Skip editions without subjects or without any subject flagged by has_dot().
    if not e.get('subjects', None) or not any(
            has_dot(s) for s in e['subjects']):
        continue
    # Drop the last character of each flagged subject; others are kept as is.
    subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']]
    # Note: rebinds ``q``; the iteration above was already started with the
    # edition query.
    q = {
        'key': e['key'],
        'subjects': {
            'connect': 'update_list',
            'value': subjects
        },
    }
    # need to fix table_of_contents to pass validation
    toc = e['table_of_contents']
    if toc and (isinstance(toc[0], six.string_types)
                or toc[0]['type'] == '/type/text'):
Exemplo n.º 33
0
def search(author, name):
    """Return an HTML report matching an author's editions to dated authors.

    *author* is the author key whose editions are listed; *name* is the
    author's name.  Each edition's ISBNs are looked up with ``search_query``
    and the resulting records are grouped by ``data_from_marc``.  Name
    variants collected on the way are searched with ``author_search``, and
    every edition row shows the author record whose dates match, if exactly
    one does.

    Returns a string: an optional ``<ul>`` of candidate authors followed by
    a ``<table>`` with one row per edition.
    """
    q = {'type': '/type/edition', 'authors': author, 'title_prefix': None, 'title': None, 'isbn_10': None}
    editions = list(query_iter(q))
    names = set([name])
    books = []
    for e in editions:
        locs = set()
        for i in e['isbn_10'] or []:
            locs.update(search_query('isbn', i))
        title = (e['title_prefix'] or '') + e['title']
        if not locs:
            books.append((e['key'], title, None, []))
            continue
        by_author = data_from_marc(locs, name)
        if len(by_author) != 1:
            # NOTE(review): this keeps one entry per group rather than a flat
            # list of locations -- confirm that is what the row should show.
            books.append((e['key'], title, None, list(by_author.values())))
            continue
        # Exactly one group: take its key and locations (works on both
        # list-returning and view-returning dict methods).
        marc_author, locs = next(iter(by_author.items()))
        names.update(marc_author[0:2])
        books.append((e['key'], title, marc_author, locs))

    # Add the "Last, First" form of every name that is not already inverted.
    inverted = set()
    for n in names:
        if ', ' in n:
            continue
        first, sep, last = n.rpartition(' ')
        if not sep:
            # Single-word name: nothing to invert (slicing around a failed
            # rfind produced a truncated duplicate here before).
            continue
        inverted.add("%s, %s" % (last, first))
    names.update(inverted)

    authors = []
    for n in names:
        authors.extend(author_search(n))

    for a in authors:
        q = {
            'type': '/type/edition',
            'authors': a['key'],
            'title_prefix': None,
            'title': None,
            'isbn_10': None
        }
        a['editions'] = list(query_iter(q))

    # Cache of date tuple -> the single matching author record, or None.
    author_map = {}
    rows = []
    for key, title, a, locs in books:
        row = '<tr><td><a href="http://openlibrary.org' + key + '">' + web.htmlquote(title) + '</a>'
        row += '<br>' + ', '.join('<a href="http://openlibrary.org/show-marc/%s">%s</a>' % (i, i) for i in locs) + '</td>'
        if a:
            # Built for every row so the "no match" cell below never shows
            # the dates of a different, previously processed author.
            dates = {'birth_date': a[2][0], 'death_date': a[2][1], 'dates': a[2][2]}
            if a[2] not in author_map:
                db_match = [db for db in authors if author_dates_match(dates, db)]
                author_map[a[2]] = db_match[0] if len(db_match) == 1 else None

            match = author_map[a[2]]
            if match:
                row += '<td><a href="http://openlibrary.org%s">%s-%s</a></td>' % (match['key'], match['birth_date'] or '', match['death_date'] or '')
            else:
                row += '<td>%s-%s (no match)</td>' % (dates['birth_date'] or '', dates['death_date'] or '')
        row += '</tr>\n'
        rows.append(row)
    t = ''.join(rows)

    ret = ''
    if authors:
        ret += '<ul>'
        for a in authors:
            # NOTE(review): the link text is the searched ``name``, not the
            # matched author's own name -- confirm this is intended.
            ret += '<li><a href="http://openlibrary.org%s">%s</a> (%s-%s) %d editions' % (a['key'], web.htmlquote(name), a['birth_date'] or '', a['death_date'] or '', len(a['editions']))
        ret += '</ul>'

    return ret + '<table>' + t + '</table>'
Exemplo n.º 34
0
            continue
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for akey in (a['key'] for a in e.get('authors', [])):
            a = withKey(akey)
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)

# Walk every work, requesting its key, title and authors.
q = { 'type':'/type/work' }
for k in 'key', 'title', 'authors':
    q[k] = None

for w in query_iter(q):
    wkey = w['key']
    # Start the title set with the work's own title.
    titles = set([w['title']])
    # Note: rebinds ``q``; the outer iteration above was already started
    # with the work query.
    q = { 'type': '/type/edition', 'works': wkey }
    for k in 'title', 'title_prefix', 'key', 'authors':
        q[k] = None

    # Only the first listed author is used as the work's author.
    # NOTE(review): raises if a work has no authors -- confirm every work has one.
    wakey = w['authors'][0]['key']
    work_author = withKey(wakey)

    # Add both get_title(e) and the bare title of each titled edition.
    for e in query_iter(q):
        if not e.get('title', None): 
            continue
        titles.update([get_title(e), e['title']])

    found = []
Exemplo n.º 35
0
def add_fields():
    """Copy edition-level metadata up to works that are missing it.

    For every work lacking one of ``fields``, the work's editions are
    fetched once.  ``publish_date`` becomes the work's
    ``first_publish_date`` (smallest year matched by ``re_year``).  Any other
    field is copied only when every edition that has a value agrees on it.
    Updates are written through ``ol.write`` in batches of 200, with a final
    write at the end.
    """
    comment = 'add fields to works'
    queue = []
    seen = set()
    fields = ['genres', 'first_sentence', 'dewey_number',
              'lc_classifications', 'publish_date']  # , 'table_of_contents']
    for w in iter_works(fields + ['title']):
        if w['key'] in seen or all(w.get(f, None) for f in fields):
            continue
        seen.add(w['key'])
        q = {'type': '/type/edition', 'works': w['key']}
        for f in fields:
            q[f] = None
        # Fetched once and reused for every field below.
        editions = list(query_iter(q))

        found = {}

        for f in fields:
            if w.get(f, None):
                continue
            if f == 'publish_date':
                years = []
                for e in editions:
                    date = e.get(f, None)
                    if not date or date == '0000':
                        continue
                    m = re_year.match(date)
                    if m:
                        years.append(int(m.group(1)))
                if years:
                    found[f] = str(min(years))
                continue
            # The three cases are mutually exclusive.  They must form one
            # if/elif/else chain: with two separate ``if`` statements the
            # genres result was always overwritten by the generic branch.
            if f == 'genres':
                # 'ranslation' matches both "Translation" and "translation";
                # such editions are ignored, and dots are stripped from the rest.
                found_list = [[g.strip('.') for g in e[f]] for e in editions
                              if e.get(f, None)
                              and not any('ranslation' in i for i in e[f])]
            elif f == 'table_of_contents':
                found_list = []
                for e in editions:
                    if not e.get(f, None):
                        continue
                    toc = e[f]
                    print('%s %s' % (e['key'], toc))
                    print(e)
                    print('')
                    # NOTE(review): ``basestring`` exists only on Python 2;
                    # this branch is unreachable while 'table_of_contents'
                    # stays out of ``fields``.
                    if isinstance(toc[0], basestring):
                        found_list.append(toc_items(toc))
                    else:
                        assert isinstance(toc[0], dict)
                        if toc[0]['type'] == '/type/text':
                            found_list.append(toc_items([i['value'] for i in toc]))
                        else:
                            assert toc[0]['type']['key'] == '/type/toc_item'
                            found_list.append(toc)
            else:
                found_list = [e[f] for e in editions if e.get(f, None)]
            # Copy a value only when all editions that have one agree.
            if found_list:
                first = found_list[0]
                if all(i == first for i in found_list):
                    found[f] = first

        if not found:
            continue

        # Single-argument print() works both as a statement and as a function.
        print('%d %s %d %s' % (len(queue) + 1, w['key'], len(editions), w['title']))
        print(found)

        q = {'key': w['key']}
        for f in fields:
            if f not in found:
                continue
            if f == 'publish_date':
                q['first_publish_date'] = {'connect': 'update', 'value': found[f]}
            elif f == 'first_sentence':
                q[f] = {'connect': 'update', 'value': found[f]}
            else:
                q[f] = {'connect': 'update_list', 'value': found[f]}
        queue.append(q)
        if len(queue) == 200:
            print(ol.write(queue, comment=comment))
            queue = []
    print(ol.write(queue, comment=comment))