示例#1
0
def test_concordance_anchors(germaparl):
    """Lines in 'dataframe' form for a query carrying ``@1``/``@2`` markers.

    The bracketed ``CDU/...`` pattern is queried with ``context_break='s'``
    and must produce 13 concordance lines.
    """
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, dump.df)
    conc_lines = conc.lines(p_show=['lemma', 'pos'], form='dataframe')
    assert len(conc_lines) == 13
示例#2
0
def test_concordance_many(germaparl):
    """A default ``lines()`` call for the lemma 'oder' yields 100 lines."""
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query('[lemma="oder"]')
    conc = Concordance(germaparl_corpus, dump.df)
    conc_lines = conc.lines()
    assert len(conc_lines) == 100
示例#3
0
def test_concordance_simple(germaparl):
    """``simple()`` returns one row per dump row with the requested columns."""
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query('"CSU"').df
    conc = Concordance(germaparl_corpus, dump_frame)
    simple_lines = conc.simple(dump_frame, p_show=['word', 'lemma'])

    # one output row for every match in the dump
    assert len(simple_lines) == len(dump_frame)
    # every requested p-attribute shows up as a column
    for column in ['word', 'lemma']:
        assert column in simple_lines.columns
示例#4
0
def test_concordance_form_kwic(germaparl):
    """``lines(form='kwic')`` in random order returns all 13 matches."""
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, dump.df)
    kwic_lines = conc.lines(order='random', cut_off=100, form='kwic')
    assert len(kwic_lines) == 13
示例#5
0
def test_concordance_dict(germaparl):
    """``dict()`` stores a dict with a 'word' entry in the 'dict' column."""
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query('[lemma="gehen"]', context=None).df
    conc = Concordance(germaparl_corpus, dump_frame)
    dict_lines = conc.dict(dump_frame, p_show=['word'])

    # the first line must be a dict exposing the requested attribute
    assert isinstance(dict_lines['dict'].iloc[0], dict)
    assert 'word' in dict_lines['dict'].iloc[0]
示例#6
0
def test_concordance_anchors_weird(germaparl):
    """A query using the markers ``@9``, ``@2`` and ``@5`` still yields 13 lines."""
    cqp_query = '[word="\\["] @9[lemma="CDU"] "/" @2".*" @5[word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, dump.df)
    frame_lines = conc.lines(order='random', cut_off=100, form='dataframe')
    assert len(frame_lines) == 13
示例#7
0
def test_concordance_simple_nocontext(germaparl):
    """``simple()`` works on a dump that was queried with ``context=None``."""
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query('[lemma="gehen"]', context=None).df
    conc = Concordance(germaparl_corpus, dump_frame)
    simple_lines = conc.simple(dump_frame, p_show=['word'])

    # one output row per match, and the requested column is present
    assert len(simple_lines) == len(dump_frame)
    for column in ['word']:
        assert column in simple_lines.columns
示例#8
0
def test_concordance_p_atts(germaparl):
    """Requested p-attributes appear as columns of each per-line dataframe."""
    cqp_query = '[word="\\["] [lemma="CDU"] "/" ".*" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, dump.df)
    frame_lines = conc.lines(p_show=['lemma', 'pos'], form='dataframe')

    # inspect the dataframe stored for the first concordance line
    first_frame = frame_lines.iloc[0]['dataframe']
    assert 'pos' in first_frame.columns
    assert len(frame_lines) == 13
def test_concordance_p_slots(germaparl):
    """``lines(p_slots='lemma')`` returns None for this query."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = germaparl_corpus.query(cqp_query)
    concordance = Concordance(germaparl_corpus, query_dump.df)
    outcome = concordance.lines(p_slots='lemma')
    assert outcome is None
示例#10
0
def test_concordance_last(germaparl):
    """``lines(order='last')`` must return a pandas DataFrame."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    # isinstance instead of an exact type comparison, so DataFrame
    # subclasses are accepted too
    assert isinstance(conc.lines(order='last'), pd.DataFrame)
示例#11
0
def test_concordance_fallback(germaparl):
    """``order='last'`` with ``form='simple'`` still yields a DataFrame."""
    germaparl_corpus = get_corpus(germaparl)
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = germaparl_corpus.query(cqp_query)
    concordance = Concordance(germaparl_corpus, query_dump.df)
    simple_lines = concordance.lines(order='last',
                                     form='simple',
                                     p_show=['word', 'lemma'])
    assert isinstance(simple_lines, pd.DataFrame)
示例#12
0
def test_concordance_empty(germaparl):
    """``lines()`` returns None for the 'NAHH' query used here."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    query_dump = germaparl_corpus.query(cqp_query)
    concordance = Concordance(germaparl_corpus, query_dump.df)
    outcome = concordance.lines()
    assert outcome is None
示例#13
0
def test_concordance_order(germaparl):
    """An unknown ``order`` value makes ``lines()`` raise NotImplementedError."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    query_dump = germaparl_corpus.query(cqp_query)
    concordance = Concordance(germaparl_corpus, query_dump.df)

    # 'fail' is not a supported ordering
    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')
示例#14
0
def test_concordance_form_simple(germaparl):
    """``lines(form='simple')`` in random order returns all 13 matches."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    query_dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, query_dump.df)
    simple_lines = conc.lines(order='random', cut_off=100, form='simple')
    assert len(simple_lines) == 13
示例#15
0
def test_concordance_many(germaparl):
    """A default ``lines()`` call for the lemma 'oder' yields 100 lines."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="oder"]'
    query_dump = germaparl_corpus.query(cqp_query)
    conc = Concordance(germaparl_corpus, query_dump.df)
    conc_lines = conc.lines()
    assert len(conc_lines) == 100
示例#16
0
def test_concordance_dataframes(germaparl):
    """``dataframe()`` adds a 'dataframe' column holding pandas DataFrames."""
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query,
                                  context_break='s',
                                  match_strategy='longest')
    conc = Concordance(germaparl_corpus, dump.df)

    # dataframe() is fed the output of dict()
    dict_lines = conc.dict(dump.df, p_show=['word', 'lemma'])
    frame_lines = conc.dataframe(dict_lines, p_show=['word', 'lemma'])

    assert 'dataframe' in frame_lines.columns
    assert isinstance(frame_lines['dataframe'].iloc[0], pd.DataFrame)
示例#17
0
def test_concordance_fallback(germaparl):
    """``order='last'`` with ``form='simple'`` must return a DataFrame."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    lines = conc.lines(order='last', form='simple', p_show=['word', 'lemma'])
    # isinstance instead of an exact type comparison, so DataFrame
    # subclasses are accepted too
    assert isinstance(lines, pd.DataFrame)
示例#18
0
def test_concordance_kwic(germaparl):
    """``kwic()`` yields left/node/right columns for each requested attribute."""
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query('[lemma="gehen"]').df
    conc = Concordance(germaparl_corpus, dump_frame)
    kwic_lines = conc.kwic(dump_frame, p_show=['word', 'lemma'])

    # one output row for every match in the dump
    assert len(kwic_lines) == len(dump_frame)

    expected_columns = [
        'left_word', 'node_word', 'right_word', 'left_lemma', 'node_lemma',
        'right_lemma'
    ]
    for column in expected_columns:
        assert column in kwic_lines.columns
示例#19
0
def test_concordance_meta(germaparl):
    """Attributes requested via ``s_show`` appear as columns of the result."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    query_dump = germaparl_corpus.query(cqp_query)
    conc = Concordance(germaparl_corpus, query_dump.df)
    meta_lines = conc.lines(s_show=['text_name', 'p_type'])

    columns = meta_lines.columns
    assert 'text_name' in columns
    assert 'p_type' in columns
    assert len(meta_lines) == 13
示例#20
0
def test_concordance_lines_dataframes(germaparl):
    """``lines(form='dataframes')`` stores a DataFrame per line in column 'df'."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query, context_break='s',
                          match_strategy='longest')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(form='dataframes', s_show=['text_id'], cut_off=10)
    assert 'df' in lines.columns
    # isinstance instead of an exact type comparison, so DataFrame
    # subclasses are accepted too
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)
示例#21
0
def test_concordance_export_dataframe(germaparl):
    """``_export(..., form='dataframe')`` returns a DataFrame for one dump row."""
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query, context_break='s')
    conc = Concordance(germaparl_corpus, dump.df)

    # export only the first row of the dump; its index label is passed along
    first_row = dump.df.iloc[0]
    exported = conc._export(first_row.name, first_row,
                            p_show=['word', 'pos'], form='dataframe')
    assert isinstance(exported, pd.DataFrame)
示例#22
0
def test_concordance_export_dict(germaparl):
    """``_export(..., form='dict')`` returns a dict containing a 'cpos' key."""
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    germaparl_corpus = get_corpus(germaparl)
    dump = germaparl_corpus.query(cqp_query)
    conc = Concordance(germaparl_corpus, dump.df)

    # export only the first row of the dump; its index label is passed along
    first_row = dump.df.iloc[0]
    exported = conc._export(first_row.name, first_row,
                            p_show=['word', 'pos'], form='dict')
    assert isinstance(exported, dict)
    assert 'cpos' in exported
示例#23
0
def test_concordance_line(germaparl):
    """``text_line()`` returns a dict with a 'cpos' key for one dump row."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    # only the first row of the dump is converted
    line = result.df.iloc[0]
    text_line = concordance.text_line(
        line.name, line, p_show=['word', 'pos']
    )
    # isinstance instead of an exact type comparison, so dict subclasses
    # are accepted too
    assert isinstance(text_line, dict)
    assert 'cpos' in text_line
示例#24
0
def test_concordance_line2df(germaparl):
    """``line2df()`` turns a text line into a dict whose 'df' is a DataFrame."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    # only the first row of the dump is converted
    line = result.df.iloc[0]
    text_line = concordance.text_line(
        line.name, line, ['word']
    )
    res = line2df(text_line)
    # isinstance instead of exact type comparisons, so subclasses are
    # accepted too
    assert isinstance(res, dict)
    assert isinstance(res['df'], pd.DataFrame)
示例#25
0
def test_concordance_slots_regions(germaparl):
    """``slots()`` with two explicit regions yields one column per region.

    Besides the plain 'word' column, the result must contain exactly the
    columns 'match..1_word' and '2..3_word'.
    """
    cqp_query = (
        r'[pos="NE"]? @1[pos="NE"] @2"\[" ([word="[A-Z]+"]+ "/"?)+ @3"\]"')
    anchor_corrections = {
        2: +1,
        3: -1
    }
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query(cqp_query,
                                        context=10,
                                        context_break='s',
                                        match_strategy='longest',
                                        corrections=anchor_corrections).df
    conc = Concordance(germaparl_corpus, dump_frame)
    slot_lines = conc.slots(dump_frame, ['word'],
                            slots=[['match', 1], [2, 3]])
    expected_columns = {"word", "match..1_word", "2..3_word"}
    assert set(slot_lines.columns) == expected_columns
示例#26
0
def test_concordance_persistence(germaparl):
    """A Concordance keeps showing the dump it was constructed with.

    Running a second query on the corpus must not change the lines of an
    existing Concordance; only a newly built Concordance shows the new dump.
    """
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    first_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    second_query = '"und"'

    # concordance built on the first query: shows results of the first query
    result = germaparl_corpus.query(first_query, context_break='s')
    conc = Concordance(germaparl_corpus, result.df)
    frame_first = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # second query is run, but the old concordance object is reused:
    # it still shows results of the first query
    result = germaparl_corpus.query(second_query, context_break='s')
    frame_reused = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # a fresh concordance on the new dump: shows results of the second query
    conc = Concordance(germaparl_corpus, result.df)
    frame_second = conc.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    assert frame_first.equals(frame_reused)
    assert not frame_reused.equals(frame_second)
示例#27
0
def test_concordance_slots_singletons(germaparl):
    """``slots()`` without explicit regions yields per-anchor columns.

    For p-attributes 'word' and 'lemma' the result must contain exactly the
    plain columns, one pair per anchor 1-3, and the match..matchend pair.
    """
    cqp_query = (
        r'[pos="NE"]? @1[pos="NE"] @2"\[" ([word="[A-Z]+"]+ "/"?)+ @3"\]"')
    anchor_corrections = {
        2: +1,
        3: -1
    }
    germaparl_corpus = get_corpus(germaparl)
    dump_frame = germaparl_corpus.query(cqp_query,
                                        context=2,
                                        context_break='s',
                                        match_strategy='longest',
                                        corrections=anchor_corrections).df
    conc = Concordance(germaparl_corpus, dump_frame)
    slot_lines = conc.slots(dump_frame, ['word', 'lemma'])
    expected_columns = {
        "word", "lemma", "1_word", "1_lemma", "2_word", "2_lemma", "3_word",
        "3_lemma", "match..matchend_word", "match..matchend_lemma"
    }
    assert set(slot_lines.columns) == expected_columns
示例#28
0
def test_concordance_line2simple(germaparl):
    """``line2simple()`` returns string fields in both plain and kwic mode."""
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl["registry_path"])
    query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    result = corpus.query(query)
    concordance = Concordance(corpus, result.df)
    # only the first row of the dump is converted
    line = result.df.iloc[0]
    text_line = concordance.text_line(
        line.name, line, ['word']
    )

    # isinstance is used throughout instead of exact type comparisons,
    # so subclasses of dict / str are accepted too

    # simple: a single 'text' string
    res = line2simple(text_line)
    assert isinstance(res, dict)
    assert isinstance(res["text"], str)

    # kwic: separate 'left', 'node' and 'right' strings
    res = line2simple(text_line, kwic=True)
    assert isinstance(res, dict)
    assert isinstance(res["left"], str)
    assert isinstance(res["node"], str)
    assert isinstance(res["right"], str)
示例#29
0
def test_concordance_lines(germaparl):
    """Exercise ``lines()`` in its default, 'simple' and 'kwic' forms."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    query_dump = germaparl_corpus.query(cqp_query,
                                        context_break='s',
                                        match_strategy='longest')
    conc = Concordance(germaparl_corpus, query_dump.df)

    # default form: a 'raw' column whose entries expose cpos, match and word
    raw_lines = conc.lines()
    assert len(raw_lines) > 10
    assert 'raw' in raw_lines.columns
    for key in ['cpos', 'match', 'word']:
        assert key in raw_lines.iloc[0]['raw']

    # simple form: a 'text' column, limited to cut_off lines
    simple_lines = conc.lines(form='simple', cut_off=10)
    assert 'text' in simple_lines.columns
    assert len(simple_lines) == 10

    # kwic form: left / node / right columns, limited to cut_off lines
    kwic_lines = conc.lines(form='kwic', cut_off=10)
    for column in ['left', 'node', 'right']:
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic form with an additional s-attribute column
    kwic_meta_lines = conc.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta_lines) == 10
    assert 'text_id' in kwic_meta_lines.columns
示例#30
0
    def concordance(self,
                    context=None,
                    matches=None,
                    p_show=['word'],
                    s_show=[],
                    order='random',
                    cut_off=100,
                    form='dataframes'):
        """Build concordance lines for the dump held by this object.

        :param context: requested context size; ``None`` means "use the
            value stored in ``self.parameters['context']``". A value larger
            than the stored one is not available and falls back (with a
            warning) to the stored one; a smaller value narrows the
            ``context``/``contextend`` columns around each match.
        :param matches: forwarded to ``Concordance.lines``
        :param p_show: forwarded to ``Concordance.lines``
        :param s_show: forwarded to ``Concordance.lines``
        :param order: forwarded to ``Concordance.lines``
        :param cut_off: forwarded to ``Concordance.lines``
        :param form: forwarded to ``Concordance.lines``
        :return: whatever ``Concordance.lines`` returns for these arguments
        """
        # NOTE: the list defaults are kept for interface compatibility; this
        # method only forwards them and never mutates them itself.

        stored_context = self.parameters['context']

        # deal with context
        if context is None:
            context = stored_context

        if context is None or context == stored_context:
            # nothing to narrow (also covers a stored context of None, which
            # must not be compared with ">" / "<")
            df = self.dump.df
        elif stored_context is not None and context > stored_context:
            # more context requested than the dump provides
            logger.warning("out of context; falling back to context=%d",
                           stored_context)
            df = self.dump.df
        else:
            # Narrow the context window. The new boundaries are clamped to
            # the existing context/contextend columns, so the window can only
            # shrink, never grow beyond what the dump already holds.
            # NOTE(review): this branch is also taken when the stored context
            # is None and an explicit context is requested (previously a
            # TypeError) -- confirm this is the desired behaviour.
            df = self.dump.df.reset_index()
            df['context_new'] = df['match'] - context
            df['contextend_new'] = df['matchend'] + context
            df['context'] = df[['context', 'context_new']].max(axis=1)
            df['contextend'] = df[['contextend', 'contextend_new']].min(axis=1)
            df = df.drop(['context_new', 'contextend_new'], axis=1)
            df = df.set_index(['match', 'matchend'])

        # the Concordance works on a copy of the dump's corpus
        conc = Concordance(self.dump.corpus.copy(), df)
        return conc.lines(matches=matches,
                          p_show=p_show,
                          s_show=s_show,
                          p_text=None,
                          p_slots=None,
                          slots=[],
                          order=order,
                          cut_off=cut_off,
                          form=form)