Пример #1
0
def _parse_stuff_to_db(fname, db):
    """Create the schema in ``db`` and parse the dump ``fname`` into it.

    ``fname`` is resolved relative to the directory containing this file.
    The script found at ``createtables_path()`` is executed on ``db`` first;
    the dump is then handed to ``parse_dump`` with ``N=2``.

    Returns the same ``db`` object that was passed in.
    """
    cursor = db.cursor()
    with open(createtables_path()) as schema_file:
        schema_sql = schema_file.read()
    cursor.executescript(schema_sql)

    here = dirname(abspath(__file__))
    parse_dump(join(here, fname), db, N=2)
    return db
def test_parse_dump():
    """Parse the sample dump with ``N=None`` and check the stored stats.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()``, the sample dump next to this file is parsed
    into it, and "Heinrich Tessenow" must then appear both in the
    ``ngrams`` table and in the ``linkstats`` table.
    """
    db = sqlite3.connect(":memory:")
    try:
        cur = db.cursor()
        with open(createtables_path()) as create:
            cur.executescript(create.read())

        dump = join(dirname(abspath(__file__)),
                    "nlwiki-20140927-pages-articles-sample.xml")
        parse_dump(dump, db, N=None)

        # Each query yields two-column rows, so dict() maps the first
        # column (ngram / target) to the second (tf / count).
        ngram_count = dict(cur.execute("select ngram, tf from ngrams;"))
        link_count = dict(cur.execute("select target, count from linkstats;"))

        assert_in("Heinrich Tessenow", ngram_count)
        assert_in("Heinrich Tessenow", link_count)
    finally:
        # Release the connection even when parsing or an assertion fails.
        db.close()
Пример #3
0
def test_parse_dump():
    """Parse the sample dump with ``N=None`` and check the stored stats.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()``, the sample dump next to this file is parsed
    into it, and 'Heinrich Tessenow' must then appear both in the
    ``ngrams`` table and in the ``linkstats`` table.
    """
    db = sqlite3.connect(':memory:')
    try:
        cur = db.cursor()
        with open(createtables_path()) as create:
            cur.executescript(create.read())

        dump = join(dirname(abspath(__file__)),
                    'nlwiki-20140927-pages-articles-sample.xml')
        parse_dump(dump, db, N=None)

        # Each query yields two-column rows, so dict() maps the first
        # column (ngram / target) to the second (tf / count).
        ngram_count = dict(cur.execute('select ngram, tf from ngrams;'))
        link_count = dict(cur.execute('select target, count from linkstats;'))

        assert_in('Heinrich Tessenow', ngram_count)
        assert_in('Heinrich Tessenow', link_count)
    finally:
        # Release the connection even when parsing or an assertion fails.
        db.close()
Пример #4
0
def test_parse_dump_ngrams():
    """Parse the test dump with ``N=2`` and check ngram and link stats.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()`` and the dump at ``_test_dump_path()`` is parsed
    into it. The test then checks that a two-word ngram and a link target
    containing non-ASCII characters were stored, and that the link count
    for 'AMX Index' is positive.
    """
    db = sqlite3.connect(':memory:')
    try:
        cur = db.cursor()
        with open(createtables_path()) as create:
            cur.executescript(create.read())

        dump = _test_dump_path()
        parse_dump(dump, db, N=2)

        # Each query yields two-column rows, so dict() maps the first
        # column (ngram / target) to the second (tf / count).
        ngram_count = dict(cur.execute('select ngram, tf from ngrams;'))
        link_count = dict(cur.execute('select target, count from linkstats;'))

        # \xfc is U+00FC (u-umlaut). Escaping it keeps the literal
        # independent of the source encoding, and a plain u'' prefix is
        # accepted by Python 2 and Python 3.3+ alike (ur'' is not valid
        # Python 3 syntax).
        assert_in(u'van M\xfcnchen', ngram_count)
        assert_in(u'Vrede van M\xfcnster', link_count)
        # link_count is keyed by the target column alone (see the query
        # above), so the tuple-keyed lookup below stays disabled.
        # assert_greater(link_count[('AMX Index', 'Amsterdam Midkap Index')], 0)
        assert_greater(link_count['AMX Index'], 0)
    finally:
        # Release the connection even when parsing or an assertion fails.
        db.close()
def test_parse_dump_ngrams():
    """Parse the test dump with ``N=2`` and check ngram and link stats.

    An in-memory SQLite database is initialised from the script at
    ``createtables_path()`` and the dump at ``_test_dump_path()`` is parsed
    into it. The test then checks that a two-word ngram and a link target
    containing non-ASCII characters were stored, and that the link count
    for "AMX Index" is positive.
    """
    db = sqlite3.connect(":memory:")
    try:
        cur = db.cursor()
        with open(createtables_path()) as create:
            cur.executescript(create.read())

        dump = _test_dump_path()
        parse_dump(dump, db, N=2)

        # Each query yields two-column rows, so dict() maps the first
        # column (ngram / target) to the second (tf / count).
        ngram_count = dict(cur.execute("select ngram, tf from ngrams;"))
        link_count = dict(cur.execute("select target, count from linkstats;"))

        # \xfc is U+00FC (u-umlaut). Escaping it keeps the literal
        # independent of the source encoding, and a plain u"" prefix is
        # accepted by Python 2 and Python 3.3+ alike (ur"" is not valid
        # Python 3 syntax).
        assert_in(u"van M\xfcnchen", ngram_count)
        assert_in(u"Vrede van M\xfcnster", link_count)
        # link_count is keyed by the target column alone (see the query
        # above), so the tuple-keyed lookup below stays disabled.
        # assert_greater(link_count[('AMX Index', 'Amsterdam Midkap Index')], 0)
        assert_greater(link_count["AMX Index"], 0)
    finally:
        # Release the connection even when parsing or an assertion fails.
        db.close()