예제 #1
0
def test_clean_lazy():
    G = LineFile("tests/smallcorpus-malformed.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    len_G = len(G)
    G.clean(columns=4, lower=False, alphanumeric=False, count_columns=True, 
            nounderscores=False, echo_toss=True, lazy=True)
    assert_equal(len(G), len_G - 2)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    G.clean(lower=True, alphanumeric=True, count_columns=False, echo_toss=True, lazy=True)
    assert_equal(len(G), 8562)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    G.clean(lower=True, alphanumeric=True, count_columns=False, echo_toss=True,
            filter_fn=lambda x: False, lazy=True)
    assert_equal(len(G), 0)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    G.clean(lower=True, alphanumeric=False, count_columns=False, echo_toss=True,
            modifier_fn=lambda x: "hello", lazy=True)
    for line in G.lines(parts=False):
        assert_equal(line, "hello")
    G.delete()
예제 #2
0
def test_basics():
    G = LineFile("tests/smallcorpus.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    assert_equal(G.header, "foo bar baz qux".split())
    assert_equal(G.files, ["tests/smallcorpus.txt.bz2"])
    assert_equal(G.path, "tests/tmp/testcorpus")
    assert_equal(G.tmppath, "tests/tmp/testcorpus.tmp")
    assert_equal(os.path.isfile("tests/tmp/testcorpus"), True)

    G_copy = G.copy()
    copy_path = G_copy.path
    assert_not_equal(copy_path, G.path)

    G_copy.mv_tmp()
    assert_equal(os.path.isfile(G_copy.path + ".tmp"), True)
    #G_copy.delete_tmp()
    #assert_equal(os.path.isfile(G_copy.path + ".tmp"), False)

    G.make_column("quux", lambda x, y, z, w: "cat", "foo bar baz qux".split())
    assert_equal(G.header, "foo bar baz qux quux".split())
    for line in G.lines(parts=False):
        assert_equal(G.extract_columns(line, "quux"), ["cat"])
    
    G.delete_columns("quux")
    assert_equal(G.header, "foo bar baz qux".split())

    G.copy_column("quux", "qux")
    assert_equal(G.header, "foo bar baz qux quux".split())
    for line in G.lines(parts=False):
        assert_equal(G.extract_columns(line, "qux"), 
                     G.extract_columns(line, "quux")
                     )

    G.delete()
    assert_equal(os.path.isfile("tests/tmp/testcorpus"), False)
예제 #3
0
def test_unicode():
    """ test unicode
    
    replace every word in the test corpus with random unicode
    and see if we get the same surprisal scores.

    """
    def generate_random_unicode():
        for _ in xrange(5):
            yield unichr(random.choice((0x300, 0x9999)) + random.randint(0, 0xff))

    scramblemap = {}

    G = LineFile("tests/smallcorpus.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    G.clean(lower=True, alphanumeric=False, count_columns=False, echo_toss=True, lazy=True)
    G.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    G.sort("baz")
    len_G = len(G)
    sum_counts = G.sum_column("quux")
    sum_surprisal = math.fsum(line[2] for line in G.average_surprisal("baz", "qux", "quux", assert_sorted=True))
    G.delete()

    G = LineFile("tests/smallcorpus.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")

    def scramble(line):
        words = line.split()[:3]
        count = line.split()[-1]
        for i, word in enumerate(words):
            if word in scramblemap:
                words[i] = scramblemap[word]
            else:
                garbage = u"".join(generate_random_unicode())
                words[i] = garbage
                scramblemap[word] = garbage

        return "\t".join(words + [count])

    G.clean(lower=True, alphanumeric=False, count_columns=False, echo_toss=True,
            modifier_fn=scramble)
    G.make_marginal_column("quux", "foo bar".split(), "qux")
    G.sort("baz")
    sum_counts_scrambled = G.sum_column("quux")
    assert_equal(sum_counts, sum_counts_scrambled)
    assert_equal(len_G, len(G))
    sum_surprisal_scrambled = math.fsum(line[2] for line in G.average_surprisal("baz", "qux", "quux", assert_sorted=True))
    G.delete()

    assert_equal(sum_surprisal, sum_surprisal_scrambled)
예제 #4
0
def test_resum_equal_lazy():
    G = LineFile("tests/smallcorpus.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    len_G = len(G)
    total = G.sum_column("qux")
    G.resum_equal("foo", "qux", assert_sorted=True, keep_all=False, lazy=True)
    for line in G.lines():
        assert_equal(int(G.extract_columns(line, "qux")[0]), total)
    G.delete()

    G = LineFile("tests/smallcorpus.txt.bz2", header="foo bar baz qux".split(), 
                 path="tests/tmp/testcorpus")
    G.resum_equal("foo", "qux", assert_sorted=True, keep_all=True, lazy=True)
    for line in G.lines():
        assert_equal(int(G.extract_columns(line, "qux")[0]), total)
    G.delete()
예제 #5
0
def test_clean_lazy():
    G = LineFile("tests/smallcorpus-malformed.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    len_G = len(G)
    G.clean(columns=4,
            lower=False,
            alphanumeric=False,
            count_columns=True,
            nounderscores=False,
            echo_toss=True,
            lazy=True)
    assert_equal(len(G), len_G - 2)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    G.clean(lower=True,
            alphanumeric=True,
            count_columns=False,
            echo_toss=True,
            lazy=True)
    assert_equal(len(G), 8562)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    G.clean(lower=True,
            alphanumeric=True,
            count_columns=False,
            echo_toss=True,
            filter_fn=lambda x: False,
            lazy=True)
    assert_equal(len(G), 0)
    G.delete()

    G = LineFile("tests/smallcorpus-malformed.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    G.clean(lower=True,
            alphanumeric=False,
            count_columns=False,
            echo_toss=True,
            modifier_fn=lambda x: "hello",
            lazy=True)
    for line in G.lines(parts=False):
        assert_equal(line, "hello")
    G.delete()
예제 #6
0
def test_unicode():
    """ test unicode
    
    replace every word in the test corpus with random unicode
    and see if we get the same surprisal scores.

    """
    def generate_random_unicode():
        for _ in xrange(5):
            yield unichr(
                random.choice((0x300, 0x9999)) + random.randint(0, 0xff))

    scramblemap = {}

    G = LineFile("tests/smallcorpus.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    G.clean(lower=True,
            alphanumeric=False,
            count_columns=False,
            echo_toss=True,
            lazy=True)
    G.make_marginal_column("quux", "foo bar".split(), "qux", lazy=True)
    G.sort("baz")
    len_G = len(G)
    sum_counts = G.sum_column("quux")
    sum_surprisal = math.fsum(line[2] for line in G.average_surprisal(
        "baz", "qux", "quux", assert_sorted=True))
    G.delete()

    G = LineFile("tests/smallcorpus.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")

    def scramble(line):
        words = line.split()[:3]
        count = line.split()[-1]
        for i, word in enumerate(words):
            if word in scramblemap:
                words[i] = scramblemap[word]
            else:
                garbage = u"".join(generate_random_unicode())
                words[i] = garbage
                scramblemap[word] = garbage

        return "\t".join(words + [count])

    G.clean(lower=True,
            alphanumeric=False,
            count_columns=False,
            echo_toss=True,
            modifier_fn=scramble)
    G.make_marginal_column("quux", "foo bar".split(), "qux")
    G.sort("baz")
    sum_counts_scrambled = G.sum_column("quux")
    assert_equal(sum_counts, sum_counts_scrambled)
    assert_equal(len_G, len(G))
    sum_surprisal_scrambled = math.fsum(
        line[2] for line in G.average_surprisal(
            "baz", "qux", "quux", assert_sorted=True))
    G.delete()

    assert_equal(sum_surprisal, sum_surprisal_scrambled)
예제 #7
0
def test_resum_equal_lazy():
    G = LineFile("tests/smallcorpus.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    len_G = len(G)
    total = G.sum_column("qux")
    G.resum_equal("foo", "qux", assert_sorted=True, keep_all=False, lazy=True)
    for line in G.lines():
        assert_equal(int(G.extract_columns(line, "qux")[0]), total)
    G.delete()

    G = LineFile("tests/smallcorpus.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    G.resum_equal("foo", "qux", assert_sorted=True, keep_all=True, lazy=True)
    for line in G.lines():
        assert_equal(int(G.extract_columns(line, "qux")[0]), total)
    G.delete()
예제 #8
0
def test_basics():
    G = LineFile("tests/smallcorpus.txt.bz2",
                 header="foo bar baz qux".split(),
                 path="tests/tmp/testcorpus")
    assert_equal(G.header, "foo bar baz qux".split())
    assert_equal(G.files, ["tests/smallcorpus.txt.bz2"])
    assert_equal(G.path, "tests/tmp/testcorpus")
    assert_equal(G.tmppath, "tests/tmp/testcorpus.tmp")
    assert_equal(os.path.isfile("tests/tmp/testcorpus"), True)

    G_copy = G.copy()
    copy_path = G_copy.path
    assert_not_equal(copy_path, G.path)

    G_copy.mv_tmp()
    assert_equal(os.path.isfile(G_copy.path + ".tmp"), True)
    #G_copy.delete_tmp()
    #assert_equal(os.path.isfile(G_copy.path + ".tmp"), False)

    G.make_column("quux", lambda x, y, z, w: "cat", "foo bar baz qux".split())
    assert_equal(G.header, "foo bar baz qux quux".split())
    for line in G.lines(parts=False):
        assert_equal(G.extract_columns(line, "quux"), ["cat"])

    G.delete_columns("quux")
    assert_equal(G.header, "foo bar baz qux".split())

    G.copy_column("quux", "qux")
    assert_equal(G.header, "foo bar baz qux quux".split())
    for line in G.lines(parts=False):
        assert_equal(G.extract_columns(line, "qux"),
                     G.extract_columns(line, "quux"))

    G.delete()
    assert_equal(os.path.isfile("tests/tmp/testcorpus"), False)