def by_authors(): find_new_work_key() skipping = False skipping = True q = { 'type':'/type/author', 'name': None, 'works': None } for a in query_iter(q, offset=215000): akey = a['key'] if skipping: print 'skipping:', akey, a['name'] if akey == '/a/OL218496A': skipping = False continue q = { 'type':'/type/work', 'authors': akey, } if query(q): print akey, `a['name']`, 'has works' continue # print akey, a['name'] found = find_works(akey) works = [i for i in found if len(i['editions']) > 2] if works: #open('found/' + akey[3:], 'w').write(`works`) print akey, `a['name']` #pprint(works) #print_works(works) add_works(akey, works) print
def dates(): f = 'first_publish_date' for w in iter_works([f, 'title']): if f in w: continue q = { 'type':'/type/edition', 'works': w['key'], 'publish_date': None } years = defaultdict(list) for e in query_iter(q): date = e.get('publish_date', None) if not date or date == '0000': continue m = re_year.match(date) if not m: continue year = int(m.group(1)) years[year].append(e['key']) if not years: continue first = min(years.keys()) assert first != 0 print w['key'], `w['title']`, first q = { 'key': w['key'], f: { 'connect': 'update', 'value': str(first)} } queue.append(q) if len(queue) == 200: print ol.write(queue, comment='add first publish date') queue = [] print ol.write(queue, comment='add first publish date')
def dates():
    """Fill in first_publish_date for works that lack it.

    The earliest four-digit year among a work's editions becomes the
    work's first publish date; updates are batched via the module-level
    ``queue`` and written 200 at a time.
    """
    global queue
    field = 'first_publish_date'
    for work in iter_works([field, 'title']):
        if field in work:
            continue  # already dated
        edition_query = {
            'type': '/type/edition',
            'works': work['key'],
            'publish_date': None,
        }
        # year -> list of edition keys published that year
        years = defaultdict(list)
        for edition in query_iter(edition_query):
            date = edition.get('publish_date', None)
            if not date or date == '0000':
                continue
            m = re_year.match(date)
            if m is None:
                continue
            years[int(m.group(1))].append(edition['key'])
        if not years:
            continue
        first = min(years)
        assert first != 0
        print((work['key'], repr(work['title']), first))
        update = {
            'key': work['key'],
            field: {'connect': 'update', 'value': str(first)},
        }
        queue.append(update)
        if len(queue) == 200:
            print(ol.write(queue, comment='add first publish date'))
            queue = []
    # Flush the remainder of the batch.
    print(ol.write(queue, comment='add first publish date'))
def lang():
    """Set original_languages on works where every title-matching
    edition agrees on exactly one language.

    A work qualifies only when at least two editions whose normalized
    title matches the work's title all carry the same single language.
    Updates are batched and written 200 at a time.
    """
    field = 'original_languages'
    queue = []
    for work in iter_works([field, 'title']):
        if field in work and work[field]:
            continue  # already has original languages
        q = {
            'type': '/type/edition',
            'works': work['key'],
            'languages': None,
            'title': None,
            'title_prefix': None,
        }
        editions = [e for e in query_iter(q) if e['languages']]
        title = mk_norm(work['title'])
        # Require editions, each with exactly one language.
        if not editions:
            continue
        if any(len(e['languages']) != 1 for e in editions):
            continue
        langs = [
            e['languages'][0]['key']
            for e in editions
            if mk_norm(get_title(e)) == title
        ]
        if len(langs) < 2:
            continue
        first = langs[0]
        if any(l != first for l in langs):
            continue  # editions disagree
        print((work['key'], repr(work['title']), first, len(langs)))
        queue.append({
            'key': work['key'],
            field: {'connect': 'update_list', 'value': [first]},
        })
        if len(queue) == 200:
            print(ol.write(queue, comment='add original language'))
            queue = []
    # Flush the remainder of the batch.
    print(ol.write(queue, comment='add original language'))
def lang():
    # Set original_languages on works where every title-matching edition
    # agrees on exactly one language (Python 2 variant of lang() above).
    f = 'original_languages'
    queue = []
    for w in iter_works([f, 'title']):
        if f in w and w[f]:
            continue  # already has original languages
        q = {
            'type':'/type/edition',
            'works': w['key'],
            'languages': None,
            'title': None,
            'title_prefix': None }
        editions = [e for e in query_iter(q) if e['languages']]
        title = mk_norm(w['title'])
        # Require editions, each carrying exactly one language.
        if not editions or any(len(e['languages']) != 1 for e in editions):
            continue
        # Languages of editions whose normalized title matches the work.
        lang = [e['languages'][0]['key'] for e in editions if mk_norm(get_title(e)) == title]
        if len(lang) < 2:
            continue
        first = lang[0]
        if any(l != first for l in lang):
            continue  # editions disagree on language
        print w['key'], `w['title']`, first, len(lang)
        q = { 'key': w['key'], f: { 'connect': 'update_list', 'value': [first]} }
        queue.append(q)
        # Write in batches of 200.
        if len(queue) == 200:
            print ol.write(queue, comment='add original language')
            queue = []
    # Flush the remainder of the batch.
    print ol.write(queue, comment='add original language')
def get_books(akey):
    # Yield cleaned-up book dicts for every single-author edition of the
    # author *akey*: title (prefix merged, parenthetical stripped),
    # normalized title, key, optional languages, table of contents and
    # work title.
    q = {
        'type':'/type/edition',
        'authors': akey,
        '*': None }
    for e in query_iter(q):
        if not e.get('title', None):
            continue
        # Only consider editions with exactly one author.
        if len(e.get('authors', [])) != 1:
            continue
        # Merge title_prefix into the title, ensuring a separating space.
        if 'title_prefix' in e and e['title_prefix']:
            prefix = e['title_prefix']
            if prefix[-1] != ' ':
                prefix += ' '
            title = prefix + e['title']
        else:
            title = e['title']
        # Skip generic collection titles that are useless for matching.
        if title.strip('. ') in ['Publications', 'Works', 'Report', \
                'Letters', 'Calendar', 'Bulletin', 'Plays', 'Sermons', 'Correspondence']:
            continue
        # Drop a trailing parenthetical, keeping only the leading part.
        m = re_parens.match(title)
        if m:
            title = m.group(1)
        n = mk_norm(title)
        book = {
            'title': title,
            'norm_title': n,
            'key': e['key'],
        }
        if 'languages' in e:
            # Strip the '/l/' prefix from each language key.
            book['lang'] = [l['key'][3:] for l in e['languages']]
        if e.get('table_of_contents', None):
            # TOC entries are either plain strings or /type/text dicts.
            if isinstance(e['table_of_contents'][0], basestring):
                book['table_of_contents'] = e['table_of_contents']
            else:
                assert isinstance(e['table_of_contents'][0], dict)
                if e['table_of_contents'][0]['type'] == '/type/text':
                    book['table_of_contents'] = [i['value'] for i in e['table_of_contents']]
        if not e.get('work_titles', None):
            yield book
            continue
        wt = e['work_titles'][0].strip('. ')
        # Generic work titles carry no matching signal; yield without them.
        if wt in ('Works', 'Selections'):
            yield book
            continue
        n_wt = mk_norm(wt)
        book['work_title'] = wt
        book['norm_wt'] = n_wt
        yield book
def is_loaded(loc):
    """Return True if the MARC record at *loc* (a 'marc:...' locator)
    has already been loaded.

    Checks the local machine_comment index first, then falls back to a
    source_records query against the catalog.
    """
    assert loc.startswith('marc:')
    # Fix: renamed locals that shadowed the builtins vars() and iter().
    query_vars = {'loc': loc[5:]}
    rows = marc_index.query('select * from machine_comment where v=$loc',
                            query_vars)
    if list(rows):
        return True
    editions = query_iter({'type': '/type/edition', 'source_records': loc})
    return bool(list(editions))
def isbn_search(self, v):
    """Yield HTML chunks of search results for editions with ISBN-10 *v*.

    Each matching edition is tagged with the searched ISBN before being
    handed to self.search().
    """
    q = {
        'type': '/type/edition',
        'isbn_10': v,
        'title': None,
        'subtitle': None,
    }
    editions = list(query_iter(q))
    for edition in editions:
        edition['isbn_10'] = v
    yield 'searching for ISBN ' + web.htmlquote(v) + ': '
    for chunk in self.search(editions):
        yield chunk
def author_search(name):
    """Return author records named *name* that carry any date field
    (birth_date, death_date or dates)."""
    q = {
        'type': '/type/author',
        'name': name,
        'birth_date': None,
        'death_date': None,
        'dates': None,
    }
    date_fields = ('birth_date', 'death_date', 'dates')
    return [
        a for a in query_iter(q)
        if any(a.get(field, None) for field in date_fields)
    ]
def title_search(self, v):
    """Yield HTML chunks of search results for editions titled *v*.

    Each matching edition gets its title set to the searched value
    before being handed to self.search().
    """
    q = {"type": "/type/edition", "isbn_10": None, "title": v}
    editions = []
    for e in query_iter(q):
        e["title"] = v
        editions.append(e)
    # Fix: corrected user-visible typo 'searcing' -> 'searching'.
    yield 'searching for title "' + web.htmlquote(v) + '": '
    for i in self.search(editions):
        yield i
def title_search(self, v):
    """Yield HTML chunks of search results for editions titled *v*.

    Each matching edition gets its title set to the searched value
    before being handed to self.search().
    """
    q = {'type': '/type/edition', 'isbn_10': None, 'title': v}
    editions = []
    for e in query_iter(q):
        e['title'] = v
        editions.append(e)
    # Fix: corrected user-visible typo 'searcing' -> 'searching'.
    yield 'searching for title "' + web.htmlquote(v) + '": '
    for i in self.search(editions):
        yield i
def isbn_search(self, v):
    """Yield HTML chunks of search results for editions with ISBN-10 *v*.

    Matching editions are tagged with the searched ISBN before being
    passed on to self.search().
    """
    q = {
        "type": "/type/edition",
        "isbn_10": v,
        "title": None,
        "subtitle": None,
    }
    editions = list(query_iter(q))
    for edition in editions:
        edition["isbn_10"] = v
    yield "searching for ISBN " + web.htmlquote(v) + ": "
    for chunk in self.search(editions):
        yield chunk
def get_keys(loc):
    """Return the edition keys for the MARC record at *loc*
    (a 'marc:...' locator).

    Prefers the local machine_comment index; falls back to a
    source_records query against the catalog.
    """
    assert loc.startswith('marc:')
    # Fix: renamed locals that shadowed the builtins vars() and iter().
    query_vars = {'loc': loc[5:]}
    rows = list(marc_index.query(
        'select k from machine_comment where v=$loc', query_vars))
    if rows:
        return [r.k for r in rows]
    editions = query_iter({'type': '/type/edition', 'source_records': loc})
    return [e['key'] for e in editions]
def oclc_search(self, v):
    # Yield HTML chunks of search results for editions matching OCLC
    # number v; each match is tagged with the searched number before
    # being passed to self.search().
    q = {'type': '/type/edition', 'oclc_numbers': v, 'title': None, 'subtitle': None, 'isbn_10': None}
    editions = []
    # NOTE(review): stray debug print to stdout -- looks like leftover
    # debugging; confirm before removing.
    print q
    for e in query_iter(q):
        e['oclc_numbers'] = v
        editions.append(e)
    yield 'searching for OCLC ' + web.htmlquote(v) + ': '
    for i in self.search(editions):
        yield i
def other_editions(title, wkey, work_author):
    """Yield (edition, author) pairs for editions that share *title*
    but are not attached to work *wkey* nor written by *work_author*,
    where the edition's author name matches the work author's name.
    """
    wakey = work_author['key']
    q = {'type': '/type/edition', 'title': title}
    for k in 'works', 'title_prefix', 'key', 'authors':
        q[k] = None
    # Fix: removed unused local ``found = []``.
    for e in query_iter(q):
        if not e.get('authors', None):
            continue
        # Skip editions already attached to this work.
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        # Skip editions credited to the work's own author.
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for akey in (a['key'] for a in e.get('authors', [])):
            a = withKey(akey)
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)
def other_editions(title, wkey, work_author):
    """Yield (edition, author) pairs for editions that share *title*
    but are not attached to work *wkey* nor written by *work_author*,
    where the edition's author name matches the work author's name.
    """
    wakey = work_author['key']
    q = {'type': '/type/edition', 'title': title}
    for k in 'works', 'title_prefix', 'key', 'authors':
        q[k] = None
    # Fix: removed unused local ``found = []``.
    for e in query_iter(q):
        if not e.get('authors', None):
            continue
        # Skip editions already attached to this work.
        if e.get('works', None) and any(i['key'] == wkey for i in e['works']):
            continue
        # Skip editions credited to the work's own author.
        if any(i['key'] == wakey for i in e['authors']):
            continue
        for akey in (a['key'] for a in e.get('authors', [])):
            a = withKey(akey)
            name = a.get('name', '')
            if match_name(name, work_author['name'], last_name_only_ok=True):
                yield (e, a)
if e.get('works', None) and any(i['key'] == wkey for i in e['works']): continue if any(i['key'] == wakey for i in e['authors']): continue for akey in (a['key'] for a in e.get('authors', [])): a = withKey(akey) name = a.get('name', '') if match_name(name, work_author['name'], last_name_only_ok=True): yield (e, a) q = {'type': '/type/work'} for k in 'key', 'title', 'authors': q[k] = None for w in query_iter(q): wkey = w['key'] titles = set([w['title']]) q = {'type': '/type/edition', 'works': wkey} for k in 'title', 'title_prefix', 'key', 'authors': q[k] = None wakey = w['authors'][0]['key'] work_author = withKey(wakey) for e in query_iter(q): if not e.get('title', None): continue titles.update([get_title(e), e['title']]) found = []
def add_fields():
    # Copy edition-level fields (genres, first_sentence, dewey_number,
    # lc_classifications, publish_date) up to works that lack them,
    # when all editions of the work agree on the value.
    comment = 'add fields to works'
    queue = []
    seen = set()
    fields = ['genres', 'first_sentence', 'dewey_number', \
        'lc_classifications', 'publish_date']  #, 'table_of_contents']
    for w in iter_works(fields + ['title']):
        # Skip works already processed or already fully populated.
        if w['key'] in seen or all(w.get(f, None) for f in fields):
            continue
        seen.add(w['key'])
        q = {'type': '/type/edition', 'works': w['key']}
        for f in fields:
            q[f] = None
        editions = list(query_iter(q))
        found = {}
        for f in fields:
            if not w.get(f, None):
                if f == 'publish_date':
                    # Earliest four-digit year wins.
                    years = defaultdict(list)
                    for e in editions:
                        date = e.get(f, None)
                        if not date or date == '0000':
                            continue
                        m = re_year.match(date)
                        if not m:
                            continue
                        year = int(m.group(1))
                        years[year].append(e['key'])
                    if years:
                        found[f] = str(min(years.keys()))
                    continue
                if f == 'genres':
                    # Strip trailing dots; skip translation genres.
                    # NOTE(review): this result appears to be overwritten
                    # by the 'else' branch below, which pairs with the
                    # table_of_contents 'if' -- likely an 'elif' was
                    # intended; confirm before relying on genre filtering.
                    found_list = [[g.strip('.') for g in e[f]] for e in editions \
                            if e.get(f, None) and not any('translation' in i for i in e[f])]
                if f == 'table_of_contents':
                    # Dead branch while 'table_of_contents' is commented
                    # out of ``fields`` above.
                    found_list = []
                    for e in query_iter(q):
                        if not e.get(f, None):
                            continue
                        toc = e[f]
                        print(e['key'], toc)
                        print(e)
                        print()
                        # TOC items are strings, /type/text dicts or
                        # /type/toc_item dicts.
                        if isinstance(toc[0], six.string_types):
                            found_list.append(toc_items(toc))
                        else:
                            assert isinstance(toc[0], dict)
                            if toc[0]['type'] == '/type/text':
                                found_list.append(
                                    toc_items([i['value'] for i in toc]))
                            else:
                                assert toc[0]['type'][
                                    'key'] == '/type/toc_item'
                                found_list.append(toc)
                else:
                    found_list = [
                        e[f] for e in query_iter(q) if e.get(f, None)
                    ]
                # Accept the value only if every edition agrees.
                if found_list:
                    first = found_list[0]
                    if all(i == first for i in found_list):
                        found[f] = first
        if not found:
            continue
        print(len(queue) + 1, w['key'], len(editions), w['title'])
        print(found)
        q = {
            'key': w['key'],
        }
        # publish_date maps to first_publish_date; single-valued fields
        # use 'update', list-valued fields use 'update_list'.
        for f in fields:
            if not f in found:
                continue
            if f == 'publish_date':
                q['first_publish_date'] = {
                    'connect': 'update',
                    'value': found[f]
                }
            elif f == 'first_sentence':
                q[f] = {'connect': 'update', 'value': found[f]}
            else:
                q[f] = {'connect': 'update_list', 'value': found[f]}
        queue.append(q)
        # Write in batches of 200.
        if len(queue) == 200:
            print(ol.write(queue, comment=comment))
            queue = []
    # Flush the remainder of the batch.
    print(ol.write(queue, comment=comment))
def iter_works(fields):
    """Iterate over all works, requesting 'key' plus every name in
    *fields* from the query API."""
    q = {'type': '/type/work', 'key': None}
    q.update((field, None) for field in fields)
    return query_iter(q)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout) set_staging(True) ol = OpenLibrary("http://dev.openlibrary.org") ol.login('EdwardBot', rc['EdwardBot']) re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$') def has_dot(s): return s.endswith('.') and not re_skip.search(s) q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None } queue = [] count = 0 for e in query_iter(q): if not e.get('subjects', None) or not any(has_dot(s) for s in e['subjects']): continue subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']] q = { 'key': e['key'], 'subjects': {'connect': 'update_list', 'value': subjects }, } # need to fix table_of_contents to pass validation toc = e['table_of_contents'] if toc and (isinstance(toc[0], six.string_types) or toc[0]['type'] == '/type/text'): if isinstance(toc[0], six.string_types): assert all(isinstance(i, six.string_types) for i in toc) new_toc = [{'title': i, 'type': '/type/toc_item'} for i in toc] else: assert all(i['type'] == '/type/text' for i in toc)
# Script: for the first 100 editions, fetch the local MARC record and
# print the contents of its 041 (language code) field.
from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields

rc = read_rc()
# Force UTF-8 output regardless of terminal encoding (Python 2).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None}
queue = []
count = 0
for e in query_iter(q, limit=100):
    key = e['key']
    # Machine comment holds the MARC locator for this edition, if any.
    mc = get_mc(key)
    if not mc:
        continue
    data = get_from_local(mc)
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Print indicators (first two bytes) and all subfields of the 041.
    print key, line[0:2], list(get_all_subfields(line))
def find_author(name):
    """Return the keys of all author records whose name equals *name*."""
    author_query = {'type': '/type/author', 'name': name}
    return [record['key'] for record in query_iter(author_query)]
# Script: for the first 100 editions, fetch the local MARC record and
# print the contents of its 041 (language code) field.
from catalog.utils.query import query_iter, set_staging, withKey, get_mc
import sys, codecs, re
sys.path.append('/home/edward/src/olapi')
from olapi import OpenLibrary, Reference
from catalog.read_rc import read_rc
from catalog.get_ia import get_from_archive, get_from_local
from catalog.marc.fast_parse import get_first_tag, get_all_subfields

rc = read_rc()
# Force UTF-8 output regardless of terminal encoding (Python 2).
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
set_staging(True)
ol = OpenLibrary("http://dev.openlibrary.org")
ol.login('EdwardBot', rc['EdwardBot'])

q = { 'type': '/type/edition', 'table_of_contents': None, 'subjects': None }
queue = []
count = 0
for e in query_iter(q, limit=100):
    key = e['key']
    # Machine comment holds the MARC locator for this edition, if any.
    mc = get_mc(key)
    if not mc:
        continue
    data = get_from_local(mc)
    line = get_first_tag(data, set(['041']))
    if not line:
        continue
    # Print indicators (first two bytes) and all subfields of the 041.
    print key, line[0:2], list(get_all_subfields(line))
def iter_works(fields):
    """Iterate over all works, requesting 'key' plus every name in
    *fields* from the query API."""
    work_query = {'type': '/type/work', 'key': None}
    for field in fields:
        work_query[field] = None
    result = query_iter(work_query)
    return result
def search(author, name):
    # Build an HTML report matching the editions of *author* against
    # MARC records found via ISBN, and listing candidate author records
    # with matching names and dates.
    book_fields = ('title_prefix', 'title');  # NOTE(review): unused
    q = { 'type': '/type/edition', 'authors': author, 'title_prefix': None, 'title': None, 'isbn_10': None}
    found = list(query_iter(q))
    db_author = ''  # NOTE(review): unused
    names = set([name])
    t = ''  # accumulated HTML table rows
    books = []  # (key, title, marc_author_or_None, locs)
    for e in found:
        # Find MARC locations via each of the edition's ISBNs.
        locs = set()
        for i in e['isbn_10'] or []:
            locs.update(search_query('isbn', i))
        if not locs:
            books.append((e['key'], (e['title_prefix'] or '') + e['title'], None, []))
            continue
        # NOTE: rebinds ``found`` -- now a mapping from MARC author to locs.
        found = data_from_marc(locs, name)
        if len(found) != 1:
            # Ambiguous (or no) MARC author: keep locations, no author.
            locs = []
            for i in found.values():
                locs.append(i)
            books.append((e['key'], (e['title_prefix'] or '') + e['title'], None, locs))
            continue
        marc_author = found.keys()[0]
        locs = found.values()[0]
        # First two elements of the MARC author tuple are name forms.
        names.update(marc_author[0:2])
        books.append((e['key'], (e['title_prefix'] or '') + e['title'], marc_author, locs))
    authors = []
    # Also try "Last, First" inversions of the collected names.
    names2 = set()
    for n in names:
        if ', ' in n:
            continue
        i = n.rfind(' ')
        names2.add("%s, %s" % (n[i+1:], n[:i]))
    names.update(names2)
    for n in names:
        for a in author_search(n):
            authors.append(a)
    # Attach each candidate author's editions for the summary list.
    for a in authors:
        q = { 'type': '/type/edition', 'authors': a['key'], 'title_prefix': None, 'title': None, 'isbn_10': None }
        a['editions'] = list(query_iter(q))
    author_map = {}  # marc author dates tuple -> matched db author or None
    for key, title, a, locs in books:
        t += '<tr><td><a href="http://openlibrary.org' + key + '">' + web.htmlquote(title) + '</a>'
        t += '<br>' + ', '.join('<a href="http://openlibrary.org/show-marc/%s">%s</a>' % (i, i) for i in locs) + '</td>'
        # t += '<td>' + web.htmlquote(`a[2]`) + '</td>'
        if a:
            if a[2] not in author_map:
                dates = {'birth_date': a[2][0], 'death_date': a[2][1], 'dates': a[2][2]}
                db_match = [db for db in authors if author_dates_match(dates, db)]
                # Only accept an unambiguous single match.
                author_map[a[2]] = db_match[0] if len(db_match) == 1 else None
            match = author_map[a[2]]
            if match:
                t += '<td><a href="http://openlibrary.org%s">%s-%s</a></td>' % (match['key'], match['birth_date'] or '', match['death_date'] or '')
            else:
                # NOTE(review): ``dates`` may be unbound here when
                # author_map was populated on an earlier iteration --
                # potential NameError; confirm.
                t += '<td>%s-%s (no match)</td>' % (dates['birth_date'] or '', dates['death_date'] or '')
        t += '</tr>\n'
    ret = ''
    if authors:
        # Summary list of candidate author records.
        ret += '<ul>'
        for a in authors:
            ret += '<li><a href="http://openlibrary.org%s">%s</a> (%s-%s) %d editions' % (a['key'], web.htmlquote(name), a['birth_date'] or '', a['death_date'] or '', len(a['editions']))
        ret += '</ul>'
    return ret + '<table>' + t + '</table>'
set_staging(True) ol = OpenLibrary("http://dev.openlibrary.org") ol.login('EdwardBot', rc['EdwardBot']) re_skip = re.compile('\b([A-Z]|Co|Dr|Jr|Capt|Mr|Mrs|Ms|Prof|Rev|Revd|Hon)\.$') def has_dot(s): return s.endswith('.') and not re_skip.search(s) q = {'type': '/type/edition', 'table_of_contents': None, 'subjects': None} queue = [] count = 0 for e in query_iter(q): if not e.get('subjects', None) or not any( has_dot(s) for s in e['subjects']): continue subjects = [s[:-1] if has_dot(s) else s for s in e['subjects']] q = { 'key': e['key'], 'subjects': { 'connect': 'update_list', 'value': subjects }, } # need to fix table_of_contents to pass validation toc = e['table_of_contents'] if toc and (isinstance(toc[0], six.string_types) or toc[0]['type'] == '/type/text'):
def search(author, name):
    # Build an HTML report matching the editions of *author* against
    # MARC records found via ISBN, and listing candidate author records
    # with matching names and dates.
    book_fields = ('title_prefix', 'title');  # NOTE(review): unused
    q = { 'type': '/type/edition', 'authors': author, 'title_prefix': None, 'title': None, 'isbn_10': None}
    found = list(query_iter(q))
    db_author = ''  # NOTE(review): unused
    names = set([name])
    t = ''  # accumulated HTML table rows
    books = []  # (key, title, marc_author_or_None, locs)
    for e in found:
        # Find MARC locations via each of the edition's ISBNs.
        locs = set()
        for i in e['isbn_10'] or []:
            locs.update(search_query('isbn', i))
        if not locs:
            books.append((e['key'], (e['title_prefix'] or '') + e['title'], None, []))
            continue
        # NOTE: rebinds ``found`` -- now a mapping from MARC author to locs.
        found = data_from_marc(locs, name)
        if len(found) != 1:
            # Ambiguous (or no) MARC author: keep locations, no author.
            locs = []
            for i in found.values():
                locs.append(i)
            books.append((e['key'], (e['title_prefix'] or '') + e['title'], None, locs))
            continue
        marc_author = found.keys()[0]
        locs = found.values()[0]
        # First two elements of the MARC author tuple are name forms.
        names.update(marc_author[0:2])
        books.append((e['key'], (e['title_prefix'] or '') + e['title'], marc_author, locs))
    authors = []
    # Also try "Last, First" inversions of the collected names.
    names2 = set()
    for n in names:
        if ', ' in n:
            continue
        i = n.rfind(' ')
        names2.add("%s, %s" % (n[i+1:], n[:i]))
    names.update(names2)
    for n in names:
        for a in author_search(n):
            authors.append(a)
    # Attach each candidate author's editions for the summary list.
    for a in authors:
        q = { 'type': '/type/edition', 'authors': a['key'], 'title_prefix': None, 'title': None, 'isbn_10': None }
        a['editions'] = list(query_iter(q))
    author_map = {}  # marc author dates tuple -> matched db author or None
    for key, title, a, locs in books:
        t += '<tr><td><a href="http://openlibrary.org' + key + '">' + web.htmlquote(title) + '</a>'
        t += '<br>' + ', '.join('<a href="http://openlibrary.org/show-marc/%s">%s</a>' % (i, i) for i in locs) + '</td>'
        # t += '<td>' + web.htmlquote(repr(a[2])) + '</td>'
        if a:
            if a[2] not in author_map:
                dates = {'birth_date': a[2][0], 'death_date': a[2][1], 'dates': a[2][2]}
                db_match = [db for db in authors if author_dates_match(dates, db)]
                # Only accept an unambiguous single match.
                author_map[a[2]] = db_match[0] if len(db_match) == 1 else None
            match = author_map[a[2]]
            if match:
                t += '<td><a href="http://openlibrary.org%s">%s-%s</a></td>' % (match['key'], match['birth_date'] or '', match['death_date'] or '')
            else:
                # NOTE(review): ``dates`` may be unbound here when
                # author_map was populated on an earlier iteration --
                # potential NameError; confirm.
                t += '<td>%s-%s (no match)</td>' % (dates['birth_date'] or '', dates['death_date'] or '')
        t += '</tr>\n'
    ret = ''
    if authors:
        # Summary list of candidate author records.
        ret += '<ul>'
        for a in authors:
            ret += '<li><a href="http://openlibrary.org%s">%s</a> (%s-%s) %d editions' % (a['key'], web.htmlquote(name), a['birth_date'] or '', a['death_date'] or '', len(a['editions']))
        ret += '</ul>'
    return ret + '<table>' + t + '</table>'
continue if e.get('works', None) and any(i['key'] == wkey for i in e['works']): continue if any(i['key'] == wakey for i in e['authors']): continue for akey in (a['key'] for a in e.get('authors', [])): a = withKey(akey) name = a.get('name', '') if match_name(name, work_author['name'], last_name_only_ok=True): yield (e, a) q = { 'type':'/type/work' } for k in 'key', 'title', 'authors': q[k] = None for w in query_iter(q): wkey = w['key'] titles = set([w['title']]) q = { 'type': '/type/edition', 'works': wkey } for k in 'title', 'title_prefix', 'key', 'authors': q[k] = None wakey = w['authors'][0]['key'] work_author = withKey(wakey) for e in query_iter(q): if not e.get('title', None): continue titles.update([get_title(e), e['title']]) found = []
def add_fields():
    # Copy edition-level fields (genres, first_sentence, dewey_number,
    # lc_classifications, publish_date) up to works that lack them,
    # when all editions of the work agree (Python 2 variant).
    comment = 'add fields to works'
    queue = []
    seen = set()
    fields = ['genres', 'first_sentence', 'dewey_number', \
        'lc_classifications', 'publish_date']  #, 'table_of_contents']
    for w in iter_works(fields + ['title']):
        # Skip works already processed or already fully populated.
        if w['key'] in seen or all(w.get(f, None) for f in fields):
            continue
        seen.add(w['key'])
        q = { 'type':'/type/edition', 'works': w['key']}
        for f in fields:
            q[f] = None
        editions = list(query_iter(q))
        found = {}
        for f in fields:
            if not w.get(f, None):
                if f == 'publish_date':
                    # Earliest four-digit year wins.
                    years = defaultdict(list)
                    for e in editions:
                        date = e.get(f, None)
                        if not date or date == '0000':
                            continue
                        m = re_year.match(date)
                        if not m:
                            continue
                        year = int(m.group(1))
                        years[year].append(e['key'])
                    if years:
                        found[f] = str(min(years.keys()))
                    continue
                if f == 'genres':
                    # 'ranslation' matches both 'Translation' and
                    # 'translation'.
                    # NOTE(review): this result appears to be overwritten
                    # by the 'else' branch below, which pairs with the
                    # table_of_contents 'if' -- likely an 'elif' was
                    # intended; confirm before relying on genre filtering.
                    found_list = [[g.strip('.') for g in e[f]] for e in editions \
                            if e.get(f, None) and not any('ranslation' in i for i in e[f])]
                if f == 'table_of_contents':
                    # Dead branch while 'table_of_contents' is commented
                    # out of ``fields`` above.
                    found_list = []
                    for e in query_iter(q):
                        if not e.get(f, None):
                            continue
                        toc = e[f]
                        print e['key'], toc
                        print e
                        print
                        # TOC items are strings, /type/text dicts or
                        # /type/toc_item dicts.
                        if isinstance(toc[0], basestring):
                            found_list.append(toc_items(toc))
                        else:
                            assert isinstance(toc[0], dict)
                            if toc[0]['type'] == '/type/text':
                                found_list.append(toc_items([i['value'] for i in toc]))
                            else:
                                assert toc[0]['type']['key'] == '/type/toc_item'
                                found_list.append(toc)
                else:
                    found_list = [e[f] for e in query_iter(q) if e.get(f, None)]
                # Accept the value only if every edition agrees.
                if found_list:
                    first = found_list[0]
                    if all(i == first for i in found_list):
                        found[f] = first
        if not found:
            continue
        print len(queue) + 1, w['key'], len(editions), w['title']
        print found
        q = {
            'key': w['key'],
        }
        # publish_date maps to first_publish_date; single-valued fields
        # use 'update', list-valued fields use 'update_list'.
        for f in fields:
            if not f in found:
                continue
            if f == 'publish_date':
                q['first_publish_date'] = { 'connect': 'update', 'value': found[f]}
            elif f == 'first_sentence':
                q[f] = { 'connect': 'update', 'value': found[f]}
            else:
                q[f] = { 'connect': 'update_list', 'value': found[f]}
        queue.append(q)
        # Write in batches of 200.
        if len(queue) == 200:
            print ol.write(queue, comment=comment)
            queue = []
    # Flush the remainder of the batch.
    print ol.write(queue, comment=comment)