def test_constellation_add():
    """Create a constellation from the topic dump and add one discourseme.

    Prints the constellation dataframe and the keys of
    ``const.discoursemes`` for manual inspection.
    """
    # Formatting options shared by both queries.
    query_options = dict(
        p_query=P_QUERY,
        s_query=S_QUERY,
        flags="%cd",
        escape=False,
    )

    # init constellation
    topic_dump = CORPUS.query(
        format_cqp_query(TOPIC_ITEMS, **query_options),
        context=None,
        context_break=S_CONTEXT,
    )
    const = Constellation(topic_dump)

    # add discourseme
    disc1_dump = CORPUS.query(
        format_cqp_query(DISC1_ITEMS, **query_options),
        context=None,
        context_break=S_CONTEXT,
    )
    const.add_discourseme(disc1_dump)

    print(const.df)
    print(const.discoursemes.keys())
def test_constellation_add(germaparl, discoursemes):
    """Add one discourseme to a topic constellation and check its size.

    The query and context settings are read from
    ``discoursemes['parameters']``.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    # Keyword arguments shared by every call to format_cqp_query.
    format_options = {
        'p_query': params['p_query'],
        's_query': params['s_query'],
        'flags': params['flags_query'],
        'escape': params['escape_query'],
    }
    s_context = params['s_context']

    # init constellation
    topic_dump = corpus.query(
        format_cqp_query(discoursemes['topic'], **format_options),
        context=None,
        context_break=s_context,
    )
    const = Constellation(topic_dump)

    # add discourseme
    disc1_dump = corpus.query(
        format_cqp_query(discoursemes['disc1'], **format_options),
        context=None,
        context_break=s_context,
    )
    const.add_discourseme(disc1_dump)

    assert len(const.df) == 1599
    assert 'topic' in const.discoursemes
    assert len(const.discoursemes) == 2
def test_count_items(germaparl):
    """Compare ``corpus.marginals`` with ``corpus.counts.mwus`` frequencies.

    Three phases are checked with the same CQP process:

    1. no subcorpus activated: both frequency lists must be equal;
    2. subcorpus ``und`` activated: the marginal frequency of the first
       item must exceed the MWU frequency of the first query;
    3. subcorpus deactivated again: both frequency lists must be equal.

    The CQP process is terminated in a ``finally`` clause so that it is
    cleaned up even when one of the assertions fails.
    """
    corpus = get_corpus(germaparl)
    cqp = corpus.start_cqp()
    try:
        items = ["Helmut", "Kohl", "CDU"]
        queries = [format_cqp_query([item]) for item in items]

        # whole corpus
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])

        # subcorpus
        cqp.nqr_from_query(query='[lemma="und"] expand to s', name='und')
        cqp.nqr_activate(corpus.corpus_name, 'und')
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        # NOTE(review): presumably marginals ignore the activated
        # subcorpus while mwus respects it -- confirm against the library.
        assert counts1.loc[items[0], 'freq'] > counts2.loc[queries[0], 'freq']

        # whole corpus
        cqp.nqr_activate(corpus.corpus_name)
        counts1 = corpus.marginals(items)
        counts2 = corpus.counts.mwus(cqp, queries)
        assert list(counts1["freq"]) == list(counts2["freq"])
    finally:
        # Always shut the CQP process down, also on assertion failure.
        cqp.__kill__()
def test_constellation_coll(germaparl, discoursemes):
    """Collocates of a constellation with two added discoursemes.

    Requests collocates for window sizes 1 through 20 and checks the
    number of results overall and for the smallest and largest window.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    def dump_for(key):
        # Format the items stored under ``key`` as a CQP query and run it.
        query = format_cqp_query(
            discoursemes[key],
            p_query=params['p_query'],
            s_query=params['s_query'],
            flags=params['flags_query'],
            escape=params['escape_query'],
        )
        return corpus.query(
            query,
            context=None,
            context_break=params['s_context'],
        )

    # init constellation
    const = Constellation(dump_for('topic'))

    # add discourseme 1
    const.add_discourseme(dump_for('disc1'), name='disc1')

    # add discourseme 2
    const.add_discourseme(dump_for('disc2'), name='disc2')

    window_sizes = list(range(1, 21))
    dfs = const.collocates(windows=window_sizes)

    assert len(dfs) == 20
    assert len(dfs[1]) == 2
    assert len(dfs[20]) == 5
def test_constellation_conc():
    """Print concordance lines of a constellation with two discoursemes.

    The concordance is requested with ``s_show=['text_id']``.
    """

    def dump_for(items):
        # Format ``items`` as a CQP query and run it on the module corpus.
        query = format_cqp_query(
            items,
            p_query=P_QUERY,
            s_query=S_QUERY,
            flags="%cd",
            escape=False,
        )
        return CORPUS.query(query, context=None, context_break=S_CONTEXT)

    # init constellation
    const = Constellation(dump_for(TOPIC_ITEMS))

    # add discourseme 1
    const.add_discourseme(dump_for(DISC1_ITEMS), name='disc1')

    # add discourseme 2
    const.add_discourseme(dump_for(DISC2_ITEMS), name='disc2')

    lines = const.concordance(s_show=['text_id'])
    print(lines)
def test_count_mwus_3(germaparl):
    """Count MWU queries with ``strategy=3`` and check the ``CSU`` frequency.

    The CQP process is terminated in a ``finally`` clause so that it does
    not outlive the test when the counting call raises.
    """
    # whole corpus
    corpus = get_corpus(germaparl)

    # NOTE(review): "CSU" occurs twice in this list -- presumably on
    # purpose to exercise duplicate handling; confirm.
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
    queries = [format_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    try:
        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False
        )
    finally:
        cqp.__kill__()

    assert counts3['freq']['CSU'] == 635
def test_constellation_coll():
    """Print collocates of a constellation with two discoursemes.

    Collocates are requested for window sizes 1 through 19.
    """
    # Formatting options shared by all three queries.
    query_options = {
        'p_query': P_QUERY,
        's_query': S_QUERY,
        'flags': "%cd",
        'escape': False,
    }

    # init constellation
    topic_query = format_cqp_query(TOPIC_ITEMS, **query_options)
    const = Constellation(
        CORPUS.query(topic_query, context=None, context_break=S_CONTEXT)
    )

    # add discoursemes 1 and 2, in that order
    for name, items in (('disc1', DISC1_ITEMS), ('disc2', DISC2_ITEMS)):
        disc_query = format_cqp_query(items, **query_options)
        disc_dump = CORPUS.query(
            disc_query, context=None, context_break=S_CONTEXT
        )
        const.add_discourseme(disc_dump, name=name)

    window_sizes = list(range(1, 20))
    lines = const.collocates(windows=window_sizes)
    print(lines)
def test_constellation_init():
    """Create a constellation from the topic dump and print its dataframe."""
    # init constellation
    topic_dump = CORPUS.query(
        format_cqp_query(
            TOPIC_ITEMS,
            p_query=P_QUERY,
            s_query=S_QUERY,
            flags="%cd",
            escape=False,
        ),
        context=None,
        context_break=S_CONTEXT,
    )
    const = Constellation(topic_dump)
    print(const.df)
def test_count_mwus_strategies(germaparl):
    """Check that the three MWU counting strategies agree with each other.

    Strategies 2 and 3 must return equal results, and the total frequency
    of strategy 1 must equal the total frequency of strategy 2.

    The CQP process is terminated in a ``finally`` clause so that it is
    cleaned up even when the intermediate assertion or a counting call
    fails.
    """
    # whole corpus
    corpus = get_corpus(germaparl)
    items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324"]
    queries = [format_cqp_query([item]) for item in items]

    cqp = corpus.start_cqp()
    try:
        counts1 = corpus.counts.mwus(
            cqp, queries, strategy=1, fill_missing=False
        )
        assert queries[0] in counts1.index

        counts2 = corpus.counts.mwus(
            cqp, queries, strategy=2, fill_missing=False
        )
        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False
        )
    finally:
        cqp.__kill__()

    assert counts2.equals(counts3)
    assert sum(counts1['freq']) == sum(counts2['freq'])
def test_count_items_subcorpora(germaparl):
    """Check MWU counting strategies with the ``Presidency`` subcorpus active.

    The subcorpus is built from a dump of ``text_role`` regions with the
    value ``presidency``. Strategy 1 must find at least one hit, and
    strategies 2 and 3 must return equal results.

    The CQP process is terminated in a ``finally`` clause so that it is
    cleaned up even when an assertion or a counting call fails.
    """
    # subcorpus
    corpus = get_corpus(germaparl)
    cqp = corpus.start_cqp()
    try:
        dump = corpus.dump_from_s_att("text_role", ["presidency"])
        cqp.nqr_from_dump(dump, 'Presidency')
        cqp.nqr_activate(corpus.corpus_name, 'Presidency')

        # NOTE(review): "CSU" occurs twice in this list -- presumably on
        # purpose to exercise duplicate handling; confirm.
        items = ["Horst Seehofer", r"( CSU )", "CSU", "WES324", "CSU"]
        queries = [format_cqp_query([item]) for item in items]

        counts1 = corpus.counts.mwus(
            cqp, queries, strategy=1, fill_missing=False
        )
        assert sum(counts1['freq']) > 0

        counts2 = corpus.counts.mwus(
            cqp, queries, strategy=2, fill_missing=False
        )
        counts3 = corpus.counts.mwus(
            cqp, queries, strategy=3, fill_missing=False
        )
        assert counts2.equals(counts3)
    finally:
        # Always shut the CQP process down, also on assertion failure.
        cqp.__kill__()
def test_constellation_init(germaparl, discoursemes):
    """Create a constellation from the topic dump and check its dataframe.

    The query and context settings are read from
    ``discoursemes['parameters']``.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    # init constellation
    topic_query = format_cqp_query(
        discoursemes['topic'],
        p_query=params['p_query'],
        s_query=params['s_query'],
        flags=params['flags_query'],
        escape=params['escape_query'],
    )
    topic_dump = corpus.query(
        topic_query,
        context=None,
        context_break=params['s_context'],
    )
    const = Constellation(topic_dump)

    frame = const.df
    assert isinstance(frame, DataFrame)
    assert len(const.df) == 2777
def test_textual_constellation(germaparl, discoursemes):
    """Create a ``TextConstellation`` from the topic dump and check it.

    Verifies the length of the dataframe and the presence of a
    ``MATCHES_topic`` column.
    """
    corpus = get_corpus(germaparl)
    params = discoursemes['parameters']

    # Keyword arguments for format_cqp_query, taken from the parameters.
    format_options = {
        'p_query': params['p_query'],
        's_query': params['s_query'],
        'flags': params['flags_query'],
        'escape': params['escape_query'],
    }

    # init constellation
    topic_query = format_cqp_query(discoursemes['topic'], **format_options)
    topic_dump = corpus.query(
        topic_query,
        context=None,
        context_break=params['s_context'],
    )
    const = TextConstellation(topic_dump, s_context=params['s_context'])

    assert len(const.df) == 624
    assert 'MATCHES_topic' in const.df.columns