def process_triples(path): """Takes a file with triples, sort it using first column and groups it by first column. """ print("processing triples from", path) cmd = ["sort", path] p = subprocess.Popen(cmd, stdout=subprocess.PIPE) for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]): d = collections.defaultdict(list) for k, name, value in chunk: if name in ['json', 'edition', 'author']: value = simplejson.loads(value) d[name].append(value) yield key, d
def process_author_dump(writer, authors_dump):
    import bsddb
    db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024 * 1024 * 1024)

    properties = ['key', 'name', 'alternate_names', 'personal_name']
    for type, key, revision, timestamp, json in read_tsv(authors_dump):
        author = simplejson.loads(json)
        # Cache a trimmed copy of each author record, keyed by OLID, so
        # that works can be processed later without rereading the dump.
        olid = key.split("/")[-1]
        db[olid] = simplejson.dumps(subdict(author, properties))
        writer.write(process_author(author))
    return db
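# `subdict` is assumed to pick a subset of keys from a dict. A minimal
# sketch (the actual helper is defined elsewhere in the module):
def subdict(d, properties):
    """Returns a new dict containing only the given keys; missing keys
    are skipped."""
    return dict((k, d[k]) for k in properties if k in d)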
def process_triples(path): """Takes a file with triples, sort it using first column and groups it by first column. """ print "processing triples from", path cmd = ["sort", path] p = subprocess.Popen(cmd, stdout=subprocess.PIPE) for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]): d = collections.defaultdict(list) for k, name, value in chunk: if name in ['json', 'edition', 'author']: value = simplejson.loads(value) d[name].append(value) yield key, d
def process_redirect_dump(writer, redirects_dump):
    import bsddb
    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024 * 1024 * 1024)

    for type, key, revision, timestamp, json in read_tsv(redirects_dump):
        d = simplejson.loads(json)
        if not key.startswith("/authors/") and not key.startswith("/works/"):
            continue
        location = d.get('location')
        if location:
            # Old redirects still start with /a/ instead of /authors/.
            location = location.replace("/a/", "/authors/")
            db[key] = location

    # For every work key that is a redirect, emit a triple attaching the
    # old key to the document it ultimately points to.
    for key in db:
        if key.startswith("/works/"):
            redirect = find_redirect(db, key)
            if redirect:
                writer.write([(redirect, "redirect", key)])
    return db
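# `find_redirect` is not defined in this section. A sketch of the assumed
# behaviour: follow the chain of redirects recorded in `db` until it ends
# at a key that is not itself redirected, bailing out on cycles.
# Illustrative only; the real implementation may differ.
def find_redirect(db, key):
    seen = set()
    while key in db:
        if key in seen:
            return None  # cyclic redirect; nothing useful to emit
        seen.add(key)
        key = db[key]
    return key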
def process_edition_dump(writer, editions_dump):
    for type, key, revision, timestamp, json in read_tsv(editions_dump):
        doc = simplejson.loads(json)
        writer.write(process_edition(doc))
def process_work_dump(writer, works_dump, author_db, redirect_db):
    for type, key, revision, timestamp, json in read_tsv(works_dump):
        doc = simplejson.loads(json)
        writer.write(process_work(doc, author_db, redirect_db))
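# A sketch of how these stages might be wired together. `TSVWriter` and
# the dump paths are hypothetical; the real driver and writer live
# elsewhere. The ordering matters: authors and redirects must be indexed
# before works are processed, since process_work_dump consults both dbs.
def generate_dump(authors_dump, redirects_dump, editions_dump, works_dump):
    writer = TSVWriter("solrdump/triples.txt")  # hypothetical triple sink
    author_db = process_author_dump(writer, authors_dump)
    redirect_db = process_redirect_dump(writer, redirects_dump)
    process_edition_dump(writer, editions_dump)
    process_work_dump(writer, works_dump, author_db, redirect_db)
    writer.close()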