Пример #1
0
def test_constellation_add():
    """Seed a constellation with the topic dump, add one discourseme, print it.

    Smoke test only: nothing is asserted; the constellation frame and the
    keys of its discoursemes are printed.
    """
    query_options = {
        'p_query': P_QUERY,
        's_query': S_QUERY,
        'flags': "%cd",
        'escape': False,
    }

    def dump_for(items):
        # format the CQP query for the items, then run it on the corpus
        cqp_query = format_cqp_query(items, **query_options)
        return CORPUS.query(cqp_query,
                            context=None,
                            context_break=S_CONTEXT)

    # the topic dump initialises the constellation
    constellation = Constellation(dump_for(TOPIC_ITEMS))

    # the additional discourseme is added without an explicit name
    constellation.add_discourseme(dump_for(DISC1_ITEMS))

    print(constellation.df)
    print(constellation.discoursemes.keys())
Пример #2
0
def test_constellation_add(germaparl, discoursemes):
    """Add one discourseme to a topic constellation and check its size.

    The constellation is initialised from the 'topic' items and extended
    with the 'disc1' items; both queries use the settings stored under
    ``discoursemes['parameters']``.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    def dump_for(items):
        # build the CQP query from the shared parameters and run it
        cqp_query = format_cqp_query(
            items,
            p_query=params['p_query'],
            s_query=params['s_query'],
            flags=params['flags_query'],
            escape=params['escape_query'],
        )
        return corpus.query(
            cqp_query,
            context=None,
            context_break=params['s_context'],
        )

    # init constellation from the topic
    constellation = Constellation(dump_for(discoursemes['topic']))

    # add discourseme (no explicit name given)
    constellation.add_discourseme(dump_for(discoursemes['disc1']))

    assert len(constellation.df) == 1599
    assert 'topic' in constellation.discoursemes
    assert len(constellation.discoursemes) == 2
Пример #3
0
def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` frequencies.

    The comparison is made three times: on the whole corpus, after
    activating a named query result ('und') as subcorpus, and after
    re-activating the whole corpus.
    """
    corpus = get_corpus(germaparl)
    cqp = corpus.start_cqp()

    # try/finally guarantees the CQP process is shut down even if a call or
    # an assertion below fails; previously a failure skipped the kill.
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [format_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        # marginals are expected to exceed the count within the subcorpus
        assert (counts1.loc[items[0], 'freq']
                > counts2.loc[queries[0], 'freq'])

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        cqp.__kill__()
Пример #4
0
def test_constellation_coll(germaparl, discoursemes):
    """Check collocate tables of a constellation with two named discoursemes.

    The constellation is initialised from the 'topic' items; 'disc1' and
    'disc2' are added under those names. Collocates are requested for the
    windows 1 through 20.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    def dump_for(items):
        # build the CQP query from the shared parameters and run it
        cqp_query = format_cqp_query(
            items,
            p_query=params['p_query'],
            s_query=params['s_query'],
            flags=params['flags_query'],
            escape=params['escape_query'],
        )
        return corpus.query(
            cqp_query,
            context=None,
            context_break=params['s_context'],
        )

    # init constellation from the topic
    constellation = Constellation(dump_for(discoursemes['topic']))

    # add discoursemes 1 and 2, in that order, under their own names
    for disc_name in ('disc1', 'disc2'):
        constellation.add_discourseme(
            dump_for(discoursemes[disc_name]), name=disc_name)

    window_sizes = list(range(1, 21))
    collocates_by_window = constellation.collocates(windows=window_sizes)
    assert len(collocates_by_window) == 20
    assert len(collocates_by_window[1]) == 2
    assert len(collocates_by_window[20]) == 5
Пример #5
0
def test_constellation_conc():
    """Print concordance lines of a constellation with two discoursemes.

    Smoke test only: nothing is asserted. The constellation is built from
    the topic dump, 'disc1' and 'disc2' are added, and the concordance is
    requested with ``s_show=['text_id']``.
    """
    query_options = {
        'p_query': P_QUERY,
        's_query': S_QUERY,
        'flags': "%cd",
        'escape': False,
    }

    def dump_for(items):
        # format the CQP query for the items, then run it on the corpus
        cqp_query = format_cqp_query(items, **query_options)
        return CORPUS.query(cqp_query,
                            context=None,
                            context_break=S_CONTEXT)

    # init constellation
    constellation = Constellation(dump_for(TOPIC_ITEMS))

    # add discourseme 1
    constellation.add_discourseme(dump_for(DISC1_ITEMS), name='disc1')

    # add discourseme 2
    constellation.add_discourseme(dump_for(DISC2_ITEMS), name='disc2')

    concordance_lines = constellation.concordance(s_show=['text_id'])
    print(concordance_lines)
Пример #6
0
def test_count_mwus_3(germaparl):
    """Count multi-word units with strategy 3 and check the 'CSU' frequency.

    The item list contains "CSU" twice; the queries are counted on the
    whole corpus with ``fill_missing=False``.
    """
    # whole corpus
    corpus = get_corpus(germaparl)
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [format_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally guarantees the CQP process is shut down even if the
    # counting call raises; previously an exception skipped the kill.
    try:
        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts3['freq']['CSU'] == 635
Пример #7
0
def test_constellation_coll():
    """Print collocates of a constellation with two discoursemes.

    Smoke test only: nothing is asserted. The constellation is built from
    the topic dump, 'disc1' and 'disc2' are added, and collocates are
    requested for the windows 1 through 19.
    """
    query_options = {
        'p_query': P_QUERY,
        's_query': S_QUERY,
        'flags': "%cd",
        'escape': False,
    }

    def dump_for(items):
        # format the CQP query for the items, then run it on the corpus
        cqp_query = format_cqp_query(items, **query_options)
        return CORPUS.query(cqp_query,
                            context=None,
                            context_break=S_CONTEXT)

    # init constellation
    constellation = Constellation(dump_for(TOPIC_ITEMS))

    # add discourseme 1
    constellation.add_discourseme(dump_for(DISC1_ITEMS), name='disc1')

    # add discourseme 2
    constellation.add_discourseme(dump_for(DISC2_ITEMS), name='disc2')

    window_sizes = list(range(1, 20))
    collocates = constellation.collocates(windows=window_sizes)
    print(collocates)
Пример #8
0
def test_constellation_init():
    """Initialise a constellation from the topic dump and print its frame.

    Smoke test only: nothing is asserted.
    """
    # build the topic query from the module-level settings
    cqp_query = format_cqp_query(
        TOPIC_ITEMS,
        p_query=P_QUERY,
        s_query=S_QUERY,
        flags="%cd",
        escape=False,
    )

    # run it and hand the resulting dump to the constellation
    constellation = Constellation(
        CORPUS.query(cqp_query, context=None, context_break=S_CONTEXT)
    )

    print(constellation.df)
Пример #9
0
def test_count_mwus_strategies(germaparl):
    """Compare the results of the three ``mwus`` counting strategies.

    Checks that the first query is in the index of the strategy-1 result,
    that strategies 2 and 3 return equal frames, and that the total
    frequency of strategy 1 matches that of strategy 2.
    """
    # whole corpus
    corpus = get_corpus(germaparl)
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324"]
    queries = [format_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    # try/finally guarantees the CQP process is shut down even if a call or
    # the assertion inside fails; previously a failure skipped the kill.
    try:
        counts1 = corpus.counts.mwus(
            cqp, queries, strategy=1, fill_missing=False)
        assert queries[0] in counts1.index

        counts2 = corpus.counts.mwus(
            cqp, queries, strategy=2, fill_missing=False)

        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False)
    finally:
        cqp.__kill__()

    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
Пример #10
0
def test_count_items_subcorpora(germaparl):
    """Compare ``mwus`` counting strategies on an activated subcorpus.

    A dump built from the s-attribute ``text_role`` with value
    "presidency" is stored as 'Presidency' and activated before counting.
    Checks that strategy 1 finds a positive total frequency and that
    strategies 2 and 3 return equal frames.
    """
    # subcorpus
    corpus = get_corpus(germaparl)
    cqp = corpus.start_cqp()

    # try/finally guarantees the CQP process is shut down even if a call or
    # an assertion below fails; previously a failure skipped the kill.
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump, 'Presidency')
        cqp.nqr_activate(corpus.corpus_name, 'Presidency')
        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [format_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(
            cqp, queries, strategy=1, fill_missing=False)
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(
            cqp, queries, strategy=2, fill_missing=False)

        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False)
        assert counts2.equals(counts3)
    finally:
        cqp.__kill__()
Пример #11
0
def test_constellation_init(germaparl, discoursemes):
    """Initialise a constellation from the 'topic' items and check its frame.

    The query uses the settings stored under ``discoursemes['parameters']``.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    # build and run the topic query
    cqp_query = format_cqp_query(
        discoursemes['topic'],
        p_query=params['p_query'],
        s_query=params['s_query'],
        flags=params['flags_query'],
        escape=params['escape_query'],
    )
    dump = corpus.query(
        cqp_query,
        context=None,
        context_break=params['s_context'],
    )

    # init constellation
    constellation = Constellation(dump)

    assert isinstance(constellation.df, DataFrame)
    assert len(constellation.df) == 2777
Пример #12
0
def test_textual_constellation(germaparl, discoursemes):
    """Initialise a TextConstellation from the 'topic' items and check it.

    The query and the ``s_context`` passed to the constellation both come
    from ``discoursemes['parameters']``.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    # build and run the topic query
    cqp_query = format_cqp_query(
        discoursemes['topic'],
        p_query=params['p_query'],
        s_query=params['s_query'],
        flags=params['flags_query'],
        escape=params['escape_query'],
    )
    dump = corpus.query(
        cqp_query,
        context=None,
        context_break=params['s_context'],
    )

    # init constellation
    constellation = TextConstellation(dump, s_context=params['s_context'])

    assert len(constellation.df) == 624
    assert 'MATCHES_topic' in constellation.df.columns