示例#1
0
def test_counts_dump_2(germaparl):
    """Exercise ``corpus.counts.dump`` with ``strategy=2``.

    The dump comes from a two-token query (lemma "Helmut" followed by
    lemma "Kohl").  Counting is checked in four configurations: split and
    unsplit, each with a single p-attribute and with a pair of
    p-attributes.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path`` for the test corpus
    """
    cwb_corpus = Corpus(germaparl['corpus_name'],
                        registry_path=germaparl['registry_path'])
    matches = cwb_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    def count(p_atts, split):
        # every call below shares the same dump and the same strategy
        return cwb_corpus.counts.dump(matches,
                                      p_atts=p_atts,
                                      split=split,
                                      strategy=2)

    # split, one attribute: index entries are single word forms
    table = count(['word'], True)
    assert table["freq"]["Helmut"] == 6

    # split, two attributes: index entries are (word, pos) tuples
    table = count(['word', 'pos'], True)
    assert table["freq"][("Helmut", "NE")] == 6

    # unsplit, one attribute: the whole match is one index entry
    table = count(['word'], False)
    assert "Helmut Kohl" in table.index
    assert table["freq"].iloc[0] == 6

    # unsplit, two attributes: one joined string per attribute
    table = count(['word', 'pos'], False)
    assert ("Helmut Kohl", "NE NE") in table.index
    assert table["freq"].iloc[0] == 6
示例#2
0
def test_counts_dump_1_split(germaparl):
    """Exercise split counting via ``corpus.counts.dump`` with ``strategy=1``.

    The dump comes from a query for lemma "die" followed by a token whose
    pos matches ``N.*``.  The frequency of the word form "der" is checked
    once with the word attribute alone and once combined with the lemma.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path`` for the test corpus
    """
    cwb_corpus = Corpus(germaparl['corpus_name'],
                        registry_path=germaparl['registry_path'])
    matches = cwb_corpus.dump_from_query('[lemma="die" %cd] [pos="N.*"]')

    # keyword arguments common to both counting calls
    shared = dict(split=True, strategy=1)

    by_word = cwb_corpus.counts.dump(matches, p_atts=['word'], **shared)
    assert int(by_word["freq"]["der"]) == 3775

    by_word_lemma = cwb_corpus.counts.dump(matches,
                                           p_atts=['word', 'lemma'],
                                           **shared)
    assert int(by_word_lemma["freq"][("der", "die")]) == 3775
示例#3
0
def test_counts_dump_1_no_split(germaparl):
    """Exercise unsplit counting via ``corpus.counts.dump`` with ``strategy=1``.

    The dump comes from a two-token query (lemma "Helmut" followed by
    lemma "Kohl").  With ``split=False`` the result index is expected to
    contain the joined match, both for a single p-attribute and for a pair
    of p-attributes.

    :param germaparl: fixture mapping providing ``corpus_name`` and
        ``registry_path`` for the test corpus
    """
    cwb_corpus = Corpus(germaparl['corpus_name'],
                        registry_path=germaparl['registry_path'])
    matches = cwb_corpus.dump_from_query(
        '[lemma="Helmut"%cd] [lemma="Kohl"%cd]'
    )

    def count_unsplit(p_atts):
        # both checks use the same dump, no splitting, strategy 1
        return cwb_corpus.counts.dump(matches,
                                      p_atts=p_atts,
                                      split=False,
                                      strategy=1)

    table = count_unsplit(['word'])
    assert "Helmut Kohl" in table.index

    table = count_unsplit(['word', 'pos'])
    assert ("Helmut Kohl", "NE NE") in table.index