Python read_tsv示例，dump.read_tsv Python示例

示例#1

0

显示文件

文件： solr.py 项目： hornc/openlibrary-1

def process_triples(path):
    """Group the triples stored in a file by their first column.

    The file at ``path`` is sorted line by line with the external ``sort``
    command, the sorted output is parsed with ``read_tsv`` and consecutive
    rows sharing the same first column are grouped together.

    Yields ``(key, d)`` pairs, where ``d`` is a ``defaultdict(list)`` mapping
    each name (second column) to the list of values (third column) seen for
    that key.  Values whose name is ``json``, ``edition`` or ``author`` are
    decoded with ``simplejson.loads`` before being stored.
    """
    print("processing triples from", path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
            d = collections.defaultdict(list)
            # The first column equals ``key`` for every row of the chunk.
            for _, name, value in chunk:
                if name in ('json', 'edition', 'author'):
                    value = simplejson.loads(value)
                d[name].append(value)
            yield key, d
    finally:
        # Runs when the generator is exhausted, closed early or fails:
        # close our end of the pipe and reap the child so that neither a
        # file descriptor nor a zombie ``sort`` process is left behind.
        p.stdout.close()
        p.wait()

示例#2

0

显示文件

文件： solr.py 项目： hornc/openlibrary-1

def process_author_dump(writer, authors_dump):
    """Store every author of the dump in a btree file and write it out.

    Each row produced by ``read_tsv(authors_dump)`` must have five columns;
    the second is the author key and the last is the author as JSON.  For
    every row, ``subdict(author, properties)`` is serialised to JSON and
    stored in ``solrdump/authors.db`` under the last ``/``-separated segment
    of the key, and ``process_author(author)`` is handed to ``writer.write``.

    Returns the opened database object.
    """
    import bsddb

    kept_fields = ['key', 'name', 'alternate_names', 'personal_name']
    author_db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=1024 ** 3)

    # Only the key and the JSON payload of each row are used.
    for _type, key, _revision, _timestamp, raw in read_tsv(authors_dump):
        author = simplejson.loads(raw)
        olid = key.rsplit("/", 1)[-1]
        author_db[olid] = simplejson.dumps(subdict(author, kept_fields))
        writer.write(process_author(author))

    return author_db

示例#3

0

显示文件

def process_triples(path):
    """Group the triples stored in a file by their first column.

    The file at ``path`` is sorted line by line with the external ``sort``
    command, the sorted output is parsed with ``read_tsv`` and consecutive
    rows sharing the same first column are grouped together.

    Yields ``(key, d)`` pairs, where ``d`` is a ``defaultdict(list)`` mapping
    each name (second column) to the list of values (third column) seen for
    that key.  Values whose name is ``json``, ``edition`` or ``author`` are
    decoded with ``simplejson.loads`` before being stored.
    """
    # A single pre-formatted argument prints the same text whether ``print``
    # is the Python 2 statement or the Python 3 function.
    print("processing triples from %s" % path)
    cmd = ["sort", path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        for key, chunk in itertools.groupby(read_tsv(p.stdout), lambda t: t[0]):
            d = collections.defaultdict(list)
            # The first column equals ``key`` for every row of the chunk.
            for _, name, value in chunk:
                if name in ('json', 'edition', 'author'):
                    value = simplejson.loads(value)
                d[name].append(value)
            yield key, d
    finally:
        # Runs when the generator is exhausted, closed early or fails:
        # close our end of the pipe and reap the child so that neither a
        # file descriptor nor a zombie ``sort`` process is left behind.
        p.stdout.close()
        p.wait()

示例#4

0

显示文件

def process_author_dump(writer, authors_dump):
    """Fill ``solrdump/authors.db`` from the authors dump and emit each author.

    Rows come from ``read_tsv(authors_dump)`` and are unpacked into five
    columns, of which the key (second) and the JSON document (fifth) are
    used.  The JSON-encoded result of ``subdict(author, properties)`` is saved
    under the text following the last ``/`` of the key, then
    ``process_author(author)`` is passed to ``writer.write``.

    Returns the database handle that was opened.
    """
    import bsddb

    one_gib = 1024 * 1024 * 1024
    db = bsddb.btopen('solrdump/authors.db', 'w', cachesize=one_gib)
    properties = ['key', 'name', 'alternate_names', 'personal_name']

    for row in read_tsv(authors_dump):
        # Unpacking still requires exactly five columns per row.
        _type, key, _revision, _timestamp, document = row
        author = simplejson.loads(document)

        # Everything after the last "/" (the whole key if there is none).
        olid = key.rpartition("/")[2]
        summary = subdict(author, properties)
        db[olid] = simplejson.dumps(summary)

        writer.write(process_author(author))

    return db

示例#5

0

显示文件

文件： solr.py 项目： hornc/openlibrary-1

def process_redirect_dump(writer, redirects_dump):
    """Build the redirects database and write the redirect of every work.

    Rows come from ``read_tsv(redirects_dump)`` and must have five columns;
    the second is the key and the last is a JSON document.  For keys starting
    with ``/authors/`` or ``/works/`` whose document has a non-empty
    ``location``, that location is stored in ``solrdump/redirects.db`` under
    the key.  Afterwards, for each stored ``/works/`` key for which
    ``find_redirect(db, key)`` returns a truthy value, the triple
    ``(redirect, "redirect", key)`` is written with ``writer.write``.

    Returns the opened database object.
    """
    import bsddb
    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024*1024*1024)

    for _type, key, _revision, _timestamp, raw in read_tsv(redirects_dump):
        # Filter on the key first so rows that are dropped anyway are never
        # JSON-decoded.
        if not key.startswith(("/authors/", "/works/")):
            continue

        location = simplejson.loads(raw).get('location')
        if location:
            # Old redirects still start with /a/ instead of /authors/.
            # Rewrite only that leading prefix; an unanchored replace would
            # also alter a "/a/" appearing later in the location.
            if location.startswith("/a/"):
                location = "/authors/" + location[len("/a/"):]
            db[key] = location

    for key in db:
        if key.startswith("/works/"):
            redirect = find_redirect(db, key)
            if redirect:
                writer.write([(redirect, "redirect", key)])

    return db

示例#6

0

显示文件

def process_redirect_dump(writer, redirects_dump):
    """Record author/work redirects in a btree file and emit work redirects.

    Every row of ``read_tsv(redirects_dump)`` is unpacked into five columns;
    only the key (second) and the JSON document (fifth) are used.  Rows whose
    key starts with neither ``/authors/`` nor ``/works/`` are ignored.  For
    the others, a non-empty ``location`` from the document is stored in
    ``solrdump/redirects.db`` under the key.  Finally each stored ``/works/``
    key is resolved with ``find_redirect(db, key)`` and, when the result is
    truthy, ``[(redirect, "redirect", key)]`` is passed to ``writer.write``.

    Returns the opened database object.
    """
    import bsddb
    db = bsddb.btopen('solrdump/redirects.db', 'w', cachesize=1024*1024*1024)

    wanted_prefixes = ("/authors/", "/works/")
    old_author_prefix = "/a/"

    for _type, key, _revision, _timestamp, raw in read_tsv(redirects_dump):
        # Skip unwanted rows before decoding their JSON.
        if not key.startswith(wanted_prefixes):
            continue

        location = simplejson.loads(raw).get('location')
        if not location:
            continue

        # Old redirects still start with /a/ instead of /authors/.  Only the
        # leading prefix is rewritten so a later "/a/" is left untouched.
        if location.startswith(old_author_prefix):
            location = "/authors/" + location[len(old_author_prefix):]
        db[key] = location

    for key in db:
        if key.startswith("/works/"):
            redirect = find_redirect(db, key)
            if redirect:
                writer.write([(redirect, "redirect", key)])

    return db

示例#7

0

显示文件

文件： solr.py 项目： hornc/openlibrary-1

def process_edition_dump(writer, editions_dump):
    """Write the processed form of every edition in the dump.

    Each row of ``read_tsv(editions_dump)`` must have five columns; the last
    one is decoded as JSON, passed through ``process_edition`` and the result
    is handed to ``writer.write``.
    """
    # Only the JSON payload (fifth column) is used.
    for _type, _key, _revision, _timestamp, raw in read_tsv(editions_dump):
        writer.write(process_edition(simplejson.loads(raw)))

示例#8

0

显示文件

文件： solr.py 项目： hornc/openlibrary-1

def process_work_dump(writer, works_dump, author_db, redirect_db):
    """Write the processed form of every work in the dump.

    Each row of ``read_tsv(works_dump)`` must have five columns; the last one
    is decoded as JSON and passed, together with ``author_db`` and
    ``redirect_db``, to ``process_work``, whose result is handed to
    ``writer.write``.
    """
    # Only the JSON payload (fifth column) is used.
    for _type, _key, _revision, _timestamp, raw in read_tsv(works_dump):
        work = simplejson.loads(raw)
        processed = process_work(work, author_db, redirect_db)
        writer.write(processed)

示例#9

0

显示文件

def process_edition_dump(writer, editions_dump):
    """Decode, process and write each edition row of ``editions_dump``.

    Rows are read with ``read_tsv`` and unpacked into five columns.  The
    fifth column is parsed as JSON, transformed by ``process_edition`` and
    the outcome is given to ``writer.write``.
    """
    for row in read_tsv(editions_dump):
        # Unpacking still requires exactly five columns per row.
        _type, _key, _revision, _timestamp, document = row
        edition = simplejson.loads(document)
        writer.write(process_edition(edition))

示例#10

0

显示文件

def process_work_dump(writer, works_dump, author_db, redirect_db):
    """Decode, process and write each work row of ``works_dump``.

    Rows are read with ``read_tsv`` and unpacked into five columns.  The
    fifth column is parsed as JSON and given to ``process_work`` along with
    ``author_db`` and ``redirect_db``; the outcome goes to ``writer.write``.
    """
    for row in read_tsv(works_dump):
        # Unpacking still requires exactly five columns per row.
        _type, _key, _revision, _timestamp, document = row
        writer.write(process_work(simplejson.loads(document), author_db, redirect_db))