import pandas as pd
import pytest
from pprint import pprint

from ccc import Corpus
from ccc.collocates import Collocates
from ccc.concordances import Concordance
from ccc.keywords import Keywords
# NB: the location of filter_df is an assumption; adjust the import to
# wherever the helper actually lives
from ccc.utils import filter_df


def test_concordance_persistence(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query_1 = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    query_2 = '"und"'

    # will show results for query_1
    dump = corpus.query(query_1, context_break='s')
    concordance = Concordance(corpus, dump.df)
    line_1 = concordance.lines(cut_off=1, form='dataframes')
    df_1 = line_1['df'].iloc[0]

    # will still show results for query_1: the concordance is not re-created
    dump = corpus.query(query_2, context_break='s')
    line_2 = concordance.lines(cut_off=1, form='dataframes')
    df_2 = line_2['df'].iloc[0]

    # will show results for query_2
    concordance = Concordance(corpus, dump.df)
    line_3 = concordance.lines(cut_off=1, form='dataframes')
    df_3 = line_3['df'].iloc[0]

    assert df_1.equals(df_2)
    assert not df_2.equals(df_3)

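# The tests in this module rely on a `germaparl` (and one on a `brexit`)
# pytest fixture, typically defined in conftest.py and not shown here.
# A minimal sketch of what `germaparl` presumably provides, judging from the
# keys the tests access; all values are assumptions. It is deliberately named
# `germaparl_sketch` so it does not shadow the real conftest fixture:

@pytest.fixture
def germaparl_sketch():
    return {
        'corpus_name': 'GERMAPARL1386',                     # assumed name
        'registry_path': '/usr/local/share/cwb/registry/',  # assumed path
        'query': '[lemma="Seehofer"]'                       # assumed query
    }
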
def test_concordance_lines(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, result.df)

    # standard = raw
    lines = concordance.lines()
    assert len(lines) > 10
    assert 'raw' in lines.columns
    assert all(elem in lines.iloc[0]['raw'] for elem in ['cpos', 'match', 'word'])

    # simple
    lines = concordance.lines(form='simple', cut_off=10)
    assert 'text' in lines.columns
    assert len(lines) == 10

    # kwic
    lines = concordance.lines(form='kwic', cut_off=10)
    assert all(elem in lines.columns for elem in ['left', 'node', 'right'])
    assert len(lines) == 10

    # kwic with s-attribute
    lines = concordance.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(lines) == 10
    assert 'text_id' in lines.columns

def test_collocates_speed_many():
    # speed test on a larger corpus (default registry)
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="sagen"]'
    df_dump = corpus.query(query, context=2, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    c2 = collocates.show(window=2, cut_off=50)
    assert isinstance(c2, pd.DataFrame)

def test_collocates_no_mws(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Armin"]? [lemma="Laschet"]'
    dump = corpus.query(query)
    # smoke test: no maximal window size given, show() must not raise
    collocates = Collocates(corpus, dump.df, p_query='word', mws=None)
    collocates.show()

def test_collocates_empty(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    # smoke test: query does not match, the empty dump must be handled gracefully
    query = '[lemma="Armin"]? [lemma="NAHH"]'
    dump = corpus.query(query)
    collocates = Collocates(corpus, dump.df, p_query='word')
    collocates.show()

def test_query_breakdown(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query)
    print(dump.breakdown())

def test_keywords_query(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('"SPD" expand to s')
    keywords = dump.keywords()
    print(keywords.head(50))

def test_query_keywords_collocates(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '"Horst" expand to s'
    dump = corpus.query(query)
    keywords = Keywords(corpus, df_dump=dump.df, p_query='lemma')
    assert 'Horst' == keywords.show(order='log_likelihood').head(1).index[0]

def test_keywords_switch(germaparl):
    name_all = 'test_all'

    # get some regions
    corpus = Corpus(corpus_name=germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    df_all = corpus.query('"und" expand to s', name=name_all).df
    df_head = df_all.head(500)
    df_tail = df_all.tail(500)

    # will show keywords for head
    keywords = Keywords(corpus, df_dump=df_head, p_query='lemma')
    line_head_name = keywords.show(order='log_likelihood')

    # will again show keywords for head: results must be identical
    keywords = Keywords(corpus, df_dump=df_head, p_query='lemma')
    line_head_df = keywords.show(order='log_likelihood')
    assert line_head_df.equals(line_head_name)

    # will show keywords for tail
    keywords = Keywords(corpus, df_dump=df_tail, p_query='lemma')
    line_tail_name = keywords.show(order='log_likelihood')

    # will again show keywords for tail: results must be identical
    keywords = Keywords(corpus, df_dump=df_tail, p_query='lemma')
    line_tail_df = keywords.show(order='log_likelihood')
    assert line_tail_df.equals(line_tail_name)

    # head and tail must differ
    assert not line_tail_df.equals(line_head_df)

def test_concordance_empty(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert conc.lines() is None

def test_filter_df(germaparl):
    c = Corpus(germaparl['corpus_name'],
               registry_path=germaparl['registry_path'])
    dump = c.query(germaparl['query'])
    coll = dump.collocates()
    assert ',' in coll.index
    coll_filtered = filter_df(coll, 'resources/stopwords-de.txt')
    assert ',' not in coll_filtered.index

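# `filter_df` itself is not shown in this module; the assertions above only
# pin down its contract: rows whose index value appears in the stopword file
# disappear. A minimal sketch under that assumption (one token per line in
# the stopword file), not necessarily the actual implementation:

def filter_df_sketch(df, path_stopwords):
    """Drop rows of df whose index value is listed in the stopword file."""
    with open(path_stopwords, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f if line.strip()}
    return df.loc[~df.index.isin(stopwords)]
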
def test_concordance_p_slots(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert conc.lines(p_slots='lemma') is None

def test_query_context(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(cqp_query=query, context=20, context_break='s')
    print(dump)
    print(dump.df)

def test_concordance_last(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert isinstance(conc.lines(order='last'), pd.DataFrame)

def test_concordance_order(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    with pytest.raises(NotImplementedError):
        conc.lines(order='fail')

def test_concordance_many(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="oder"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines()
    # the default cut-off limits the number of lines to 100
    assert len(lines) == 100

def test_query_logging(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\("] [lemma=".*"]+ [word="\\)"]'
    df_dump = corpus.query(query).df
    # 'fail' is an invalid p-attribute on purpose: it should be logged
    # and fall back to a valid one
    collocates = Collocates(corpus, df_dump, 'fail')
    c = collocates.show(order='log_likelihood', window=15)
    assert isinstance(c, pd.DataFrame)
    assert 'Dr.' in c.index

def test_concordance_anchors(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    result = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(p_show=['lemma', 'pos'], form='dataframes')
    assert len(lines) == 13

def test_anchor(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'@1[pos="NE"]? @2[pos="NE"] "\[" (@3[word="[A-Z]+"]+ "/"?)+ "\]"'
    dump = corpus.query(query, context_break='s')
    lines = dump.concordance(form='dataframes')
    print()
    print(lines['df'].iloc[1])

def test_concordance_form_simple_kwic(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    result = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(order='random', cut_off=100, form='kwic')
    assert len(lines) == 13

def test_concordancing_dataframes(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query, context_break='s')
    lines = dump.concordance(form='dataframes')
    pprint(lines['df'].iloc[1])

def test_collocates_pp(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '"SPD"'
    result = corpus.query(query)
    collocates = Collocates(corpus, result.df, p_query='word')

    # without canonicalization, 'Die' and 'die' are counted separately
    c = collocates.show(order='log_likelihood', cut_off=None)
    assert int(c.loc['Die']['O11']) < int(c.loc['die']['O11'])

    # flags="%cd" folds case and diacritics
    c = collocates.show(order='log_likelihood', cut_off=None, flags="%cd")
    assert 'die' in c.index and 'Die' not in c.index

def test_concordance_fallback(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    lines = conc.lines(order='last', form='simple', p_show=['word', 'lemma'])
    assert isinstance(lines, pd.DataFrame)

def test_query_s_atts_brexit(brexit):
    corpus = Corpus(brexit['corpus_name'])
    df_dump = corpus.query(cqp_query='[lemma="nigel"]',
                           context=10, context_break='tweet').df
    df = corpus.get_s_annotations(df_dump, ['ner_type', 'tweet_id', 'tweet'])
    assert isinstance(df, pd.DataFrame)
    columns = [a + '_CWBID' for a in ['ner_type', 'tweet_id', 'tweet']]
    columns += ['ner_type', 'tweet_id']
    print(df['ner_type'].value_counts())
    assert all(elem in df.columns for elem in columns)

def test_collocates_mwu(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[lemma="CDU"] "/"? [lemma="CSU"]?'
    result = corpus.query(query, match_strategy='longest')
    collocates = Collocates(corpus, result.df, 'lemma')
    c = collocates.show(order='log_likelihood', cut_off=None)
    assert isinstance(c, pd.DataFrame)
    assert len(c) > 9
    assert 'CSU' in c.index
    # 'CSU' appears more often inside the nodes than in their contexts
    assert int(c.loc['CSU']['in_nodes']) > int(c.loc['CSU']['f'])

def test_concordancing_simple(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query)
    lines = dump.concordance()
    print(lines)
    print(lines.columns)
    pprint(lines['raw'].iloc[0])

def test_concordance_lines_dataframes(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(form='dataframes', s_show=['text_id'], cut_off=10)
    assert 'df' in lines.columns
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)

def test_concordance_meta(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(s_show=['text_name', 'p_type'])
    assert 'text_name' in lines.columns
    assert 'p_type' in lines.columns
    assert len(lines) == 13

def test_collocates(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = '"SPD"'
    dump = corpus.query(query, context=10, context_break='s')
    collocates = dump.collocates()
    print()
    print(collocates[[
        'O11', 'O12', 'O21', 'O22',
        'E11', 'E12', 'E21', 'E22',
        'log_likelihood'
    ]])