def test_concordance_persistence(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query_1 = (
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    )
    query_2 = (
        '"und"'
    )

    # will show results for query_1
    dump = corpus.query(query_1, context_break='s')
    concordance = Concordance(corpus, dump.df)
    line_1 = concordance.lines(cut_off=1, form='dataframes')
    df_1 = line_1['df'].iloc[0]

    # will show results for query_1
    dump = corpus.query(query_2, context_break='s')
    line_2 = concordance.lines(cut_off=1, form='dataframes')
    df_2 = line_2['df'].iloc[0]

    # will show results for query_2
    concordance = Concordance(corpus, dump.df)
    line_3 = concordance.lines(cut_off=1, form='dataframes')
    df_3 = line_3['df'].iloc[0]

    assert(df_1.equals(df_2))
    assert(not df_2.equals(df_3))
示例#2
0
def test_filter_df(germaparl):
    c = Corpus(germaparl['corpus_name'],
               registry_path=germaparl['registry_path'])
    dump = c.query(germaparl['query'])
    coll = dump.collocates()
    print(coll)
    print(filter_df(coll, 'resources/stopwords-de.txt'))
def test_concordance_lines(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    )
    result = corpus.query(query, context_break='s',
                          match_strategy='longest')

    concordance = Concordance(corpus, result.df)

    # standard = raw
    lines = concordance.lines()
    assert(len(lines) > 10)
    assert('raw' in lines.columns)
    assert(all(elem in lines.iloc[0]['raw'] for elem in ['cpos', 'match', 'word']))

    # simple
    lines = concordance.lines(form='simple', cut_off=10)
    assert('text' in lines.columns)
    assert(len(lines) == 10)

    # kwic
    lines = concordance.lines(form='kwic', cut_off=10)
    assert(all(elem in lines.columns for elem in ['left', 'node', 'right']))
    assert(len(lines) == 10)

    # kwic with s-attribute
    lines = concordance.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert(len(lines) == 10)
    assert('text_id' in lines.columns)
示例#4
0
def test_collocates_speed_many():
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="sagen"]'
    df_dump = corpus.query(query, context=2, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    c2 = collocates.show(window=2, cut_off=50)
    assert (type(c2) == pd.DataFrame)
示例#5
0
def test_collocates_no_mws(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[lemma="Armin"]? [lemma="Laschet"]')
    dump = corpus.query(query)
    collocates = Collocates(corpus, dump.df, p_query='word', mws=None)
    collocates.show()
示例#6
0
def test_collocates_empty(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[lemma="Armin"]? [lemma="NAHH"]')
    dump = corpus.query(query)
    collocates = Collocates(corpus, dump.df, p_query='word')
    collocates.show()
示例#7
0
def test_query_breakdown(germaparl):

    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query)
    print(dump.breakdown())
示例#8
0
def test_keywords_query(germaparl):

    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    dump = corpus.query('"SPD" expand to s')
    keywords = dump.keywords()
    print(keywords.head(50))
示例#9
0
def test_query_keywords_collocates(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('"Horst" expand to s')
    dump = corpus.query(query)
    keywords = Keywords(corpus, df_dump=dump.df, p_query='lemma')
    assert ('Horst' == keywords.show(order='log_likelihood').head(1).index[0])
示例#10
0
def test_keywords_switch(germaparl):

    name_all = 'test_all'

    # get some regions
    corpus = Corpus(corpus_name=germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    df_all = corpus.query('"und" expand to s', name=name_all).df
    df_head = df_all.head(500)
    df_tail = df_all.tail(500)

    # will show keywords for head
    keywords = Keywords(corpus, df_dump=df_head, p_query="lemma")
    line_head_name = keywords.show(order='log_likelihood')

    # will show keywords for head
    keywords = Keywords(corpus, df_dump=df_head, p_query="lemma")
    line_head_df = keywords.show(order='log_likelihood')

    assert (line_head_df.equals(line_head_name))

    # will show keywords for tail
    keywords = Keywords(corpus, df_dump=df_tail, p_query="lemma")
    line_tail_name = keywords.show(order='log_likelihood')

    # will show keywords for tail
    keywords = Keywords(corpus, df_dump=df_tail, p_query="lemma")
    line_tail_df = keywords.show(order='log_likelihood')

    assert (line_tail_df.equals(line_tail_name))

    assert (not line_tail_df.equals(line_head_df))
示例#11
0
def test_concordance_empty(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="NAHH"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert(conc.lines() is None)
示例#12
0
def test_filter_df(germaparl):
    c = Corpus(germaparl['corpus_name'],
               registry_path=germaparl['registry_path'])
    dump = c.query(germaparl['query'])
    coll = dump.collocates()
    assert ',' in coll.index
    coll_filtered = filter_df(coll, 'resources/stopwords-de.txt')
    assert ',' not in coll_filtered.index
示例#13
0
def test_concordance_p_slots(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert(conc.lines(p_slots='lemma') is None)
示例#14
0
def test_query_context(germaparl):

    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = r'"\[" ([word="[A-Z]+"] "/"?) + "\]"'
    dump = corpus.query(cqp_query=query, context=20, context_break='s')
    print(dump)
    print(dump.df)
示例#15
0
def test_concordance_last(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert(type(conc.lines(order='last')) == pd.DataFrame)
示例#16
0
def test_concordance_order(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    with pytest.raises(NotImplementedError):
        conc.lines(order='fail')
示例#17
0
def test_concordance_many(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="oder"]'
    )
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines()
    assert(len(lines) == 100)
示例#18
0
def test_query_logging(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[word="\\("] [lemma=".*"]+ [word="\\)"]')
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'fail')
    c = collocates.show(order='log_likelihood', window=15)
    assert (type(c) == pd.DataFrame)
    assert ('Dr.' in c.index)
示例#19
0
def test_concordance_anchors(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    )
    result = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(p_show=['lemma', 'pos'], form='dataframes')
    assert(len(lines) == 13)
示例#20
0
def test_anchor(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])

    query = r'@1[pos="NE"]? @2[pos="NE"] "\[" (@3[word="[A-Z]+"]+ "/"?)+ "\]"'
    dump = corpus.query(query, context_break='s')
    lines = dump.concordance(form='dataframes')
    print()
    print(lines['df'].iloc[1])
示例#21
0
def test_concordance_form_simple_kwic(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    )
    result = corpus.query(query, context_break='s')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(order='random', cut_off=100, form='kwic')
    assert(len(lines) == 13)
示例#22
0
def test_concordancing_dataframes(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])

    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query, context_break='s')
    lines = dump.concordance(form='dataframes')
    from pprint import pprint
    pprint(lines['df'].iloc[1])
示例#23
0
def test_collocates_pp(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('"SPD"')
    result = corpus.query(query)
    collocates = Collocates(corpus, result.df, p_query='word')
    c = collocates.show(order='log_likelihood', cut_off=None)
    assert (int(c.loc['Die']['O11']) < int(c.loc['die']['O11']))
    c = collocates.show(order='log_likelihood', cut_off=None, flags="%cd")
    assert ('die' in c.index and 'Die' not in c.index)
示例#24
0
def test_concordance_fallback(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert(type(
        conc.lines(order='last', form='simple', p_show=['word', 'lemma'])
    ) == pd.DataFrame)
示例#25
0
def test_query_s_atts_brexit(brexit):
    corpus = Corpus(brexit['corpus_name'])
    df_dump = corpus.query(cqp_query='[lemma="nigel"]',
                           context=10,
                           context_break='tweet').df
    df = corpus.get_s_annotations(df_dump, ['ner_type', 'tweet_id', 'tweet'])
    assert (type(df) == pd.DataFrame)
    columns = [a + '_CWBID' for a in ['ner_type', 'tweet_id', 'tweet']]
    columns += ['ner_type', 'tweet_id']
    print(df['ner_type'].value_counts())
    assert (all(elem in df.columns for elem in columns))
示例#26
0
def test_collocates_mwu(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    query = ('[lemma="CDU"] "/"? [lemma="CSU"]?')
    result = corpus.query(query, match_strategy='longest')
    collocates = Collocates(corpus, result.df, 'lemma')
    c = collocates.show(order='log_likelihood', cut_off=None)
    assert (type(c) == pd.DataFrame)
    assert (len(c) > 9)
    assert ('CSU' in c.index)
    assert (int(c.loc['CSU']['in_nodes']) > int(c.loc['CSU']['f']))
示例#27
0
def test_concordancing_simple(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])

    query = r'"\[" ([word="[A-Z]+"] "/"?)+ "\]"'
    dump = corpus.query(query)
    lines = dump.concordance()
    print(lines)
    print(lines.columns)
    from pprint import pprint
    pprint(lines['raw'].iloc[0])
示例#28
0
def test_concordance_lines_dataframes(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    )
    result = corpus.query(query, context_break='s',
                          match_strategy='longest')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(form='dataframes', s_show=['text_id'], cut_off=10)
    assert('df' in lines.columns)
    assert(type(lines['df'].iloc[0]) == pd.DataFrame)
示例#29
0
def test_concordance_meta(germaparl):
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    )
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(s_show=['text_name', 'p_type'])
    assert('text_name' in lines.columns)
    assert('p_type' in lines.columns)
    assert(len(lines) == 13)
示例#30
0
def test_collocates(germaparl):
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])

    query = '"SPD"'
    dump = corpus.query(query, context=10, context_break='s')
    collocates = dump.collocates()
    print()
    print(collocates[[
        'O11', 'O12', 'O21', 'O22', 'E11', 'E12', 'E21', 'E22',
        'log_likelihood'
    ]])