Exemplo n.º 1
0
def process_triples(path):
    """Sort the triples file at *path* by its first column and yield groups.

    Spawns the external ``sort`` command on the file, reads the sorted
    output as TSV triples, and groups consecutive rows by the first column.

    Yields:
        ``(key, d)`` pairs where *key* is the first-column value and *d* is
        a ``defaultdict(list)`` mapping each triple's name to its values.
        Values under the names 'json', 'edition' and 'author' are
        JSON-decoded before being collected.
    """
    print("processing triples from", path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
            d = collections.defaultdict(list)
            for k, name, value in chunk:
                # These columns carry serialized JSON documents.
                if name in ['json', 'edition', 'author']:
                    value = simplejson.loads(value)
                d[name].append(value)
            yield key, d
    finally:
        # Close the pipe and reap the child even if the caller abandons
        # the generator early; otherwise the sort process is leaked.
        p.stdout.close()
        p.wait()
Exemplo n.º 2
0
def process_author_dump(writer, authors_dump):
    """Index authors from *authors_dump* into a btree db and write solr docs.

    Each author is stored in ``solrdump/authors.db`` keyed by its OLID
    (the last path component of the key), holding a JSON subset of the
    author record. Every processed author doc is also written via *writer*.

    Returns the open bsddb handle so callers can reuse the index.
    """
    import bsddb
    db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024*1024*1024)

    wanted = ['key', 'name', 'alternate_names', 'personal_name']
    # read_tsv yields (type, key, revision, timestamp, json) rows;
    # renamed locals avoid shadowing the `type` and `json` builtins.
    for row_type, doc_key, revision, timestamp, raw_json in read_tsv(authors_dump):
        author = simplejson.loads(raw_json)

        # Keys look like "/authors/OL123A"; the trailing OLID is the db key.
        olid = doc_key.split("/")[-1]
        db[olid] = simplejson.dumps(subdict(author, wanted))

        writer.write(process_author(author))
    return db
Exemplo n.º 3
0
def process_triples(path):
    """Sort the triples file at *path* by its first column and yield groups.

    Spawns the external ``sort`` command on the file, reads the sorted
    output as TSV triples, and groups consecutive rows by the first column.

    Yields:
        ``(key, d)`` pairs where *key* is the first-column value and *d* is
        a ``defaultdict(list)`` mapping each triple's name to its values.
        Values under the names 'json', 'edition' and 'author' are
        JSON-decoded before being collected.
    """
    # Function-style print for consistency with the other definition of
    # this function in the file (the original used a py2 print statement).
    print("processing triples from", path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
            d = collections.defaultdict(list)
            for k, name, value in chunk:
                # These columns carry serialized JSON documents.
                if name in ['json', 'edition', 'author']:
                    value = simplejson.loads(value)
                d[name].append(value)
            yield key, d
    finally:
        # Close the pipe and reap the child even if the caller abandons
        # the generator early; otherwise the sort process is leaked.
        p.stdout.close()
        p.wait()
Exemplo n.º 4
0
def process_author_dump(writer, authors_dump):
    """Write processed author docs and build an OLID-keyed author index.

    Stores a trimmed JSON copy of each author (key, name, alternate_names,
    personal_name) in ``solrdump/authors.db`` under its OLID, and feeds the
    full processed doc to *writer*. Returns the open db handle.
    """
    import bsddb
    db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024*1024*1024)

    keep_fields = ['key', 'name', 'alternate_names', 'personal_name']
    for _kind, author_key, _rev, _ts, payload in read_tsv(authors_dump):
        author = simplejson.loads(payload)

        # Last path component of "/authors/OL123A" is the OLID.
        db[author_key.split("/")[-1]] = simplejson.dumps(
            subdict(author, keep_fields))
        writer.write(process_author(author))

    return db
Exemplo n.º 5
0
def process_redirect_dump(writer, redirects_dump):
    """Build a redirect db from *redirects_dump* and emit work redirects.

    First pass: record every author/work redirect's target location in
    ``solrdump/redirects.db``. Second pass: for each redirected work,
    resolve its final target with ``find_redirect`` and write a
    ``(target, "redirect", key)`` triple via *writer*.

    Returns the open bsddb handle.
    """
    import bsddb
    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024*1024*1024)

    for type, key, revision, timestamp, json in read_tsv(redirects_dump):
        d = simplejson.loads(json)
        if not key.startswith("/authors/") and not key.startswith("/works/"):
            continue

        location = d.get('location')
        if location:
            # Old redirects still start with /a/ instead of /authors/.
            # Rewrite only the prefix: a blanket replace() would also
            # mangle any "/a/" occurring later in the path.
            if location.startswith("/a/"):
                location = "/authors/" + location[len("/a/"):]
            db[key] = location

    for key in db:
        if key.startswith("/works/"):
            redirect = find_redirect(db, key)
            if redirect:
                writer.write([(redirect, "redirect", key)])

    return db
Exemplo n.º 6
0
def process_redirect_dump(writer, redirects_dump):
    """Collect author/work redirects into a btree db and emit work redirects.

    Pass 1 stores each redirect's target location keyed by the redirected
    document key. Pass 2 resolves every redirected work via
    ``find_redirect`` and writes ``(target, "redirect", key)`` triples.
    Returns the open bsddb handle.
    """
    import bsddb
    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024*1024*1024)

    for _kind, doc_key, _rev, _ts, payload in read_tsv(redirects_dump):
        record = simplejson.loads(payload)
        # Only author and work redirects are interesting here.
        if doc_key.startswith("/authors/") or doc_key.startswith("/works/"):
            target = record.get('location')
            if target:
                # Old redirects still start with /a/ instead of /authors/.
                target = target.replace("/a/", "/authors/")
                db[doc_key] = target

    for doc_key in db:
        if not doc_key.startswith("/works/"):
            continue
        final_target = find_redirect(db, doc_key)
        if final_target:
            writer.write([(final_target, "redirect", doc_key)])

    return db
Exemplo n.º 7
0
def process_edition_dump(writer, editions_dump):
    """Decode each edition row's JSON payload and write the processed doc."""
    for _kind, _key, _rev, _ts, payload in read_tsv(editions_dump):
        writer.write(process_edition(simplejson.loads(payload)))
Exemplo n.º 8
0
def process_work_dump(writer, works_dump, author_db, redirect_db):
    """Decode each work row's JSON payload and write the processed doc.

    *author_db* and *redirect_db* are passed through to ``process_work``.
    """
    for _kind, _key, _rev, _ts, payload in read_tsv(works_dump):
        work = simplejson.loads(payload)
        writer.write(process_work(work, author_db, redirect_db))
Exemplo n.º 9
0
def process_edition_dump(writer, editions_dump):
    """Write a processed solr doc for every edition in *editions_dump*."""
    for row_type, key, revision, timestamp, raw_json in read_tsv(editions_dump):
        edition = simplejson.loads(raw_json)
        writer.write(process_edition(edition))
Exemplo n.º 10
0
def process_work_dump(writer, works_dump, author_db, redirect_db):
    """Write a processed solr doc for every work in *works_dump*.

    Each work is JSON-decoded and processed with the author and redirect
    databases before being handed to *writer*.
    """
    for row_type, key, revision, timestamp, raw_json in read_tsv(works_dump):
        writer.write(
            process_work(simplejson.loads(raw_json), author_db, redirect_db))