def test_concordance_persistence(germaparl):
    """Check that a Concordance keeps the dump it was created with.

    Running a second query on the corpus must leave the lines of an existing
    Concordance unchanged; only a Concordance built from the new dump yields
    different lines.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cdu_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    und_query = '"und"'

    # concordance built from the dump of the first query
    dump = corpus.query(cdu_query, context_break='s')
    concordance = Concordance(corpus, dump.df)
    first = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # a new query alone: the existing concordance still holds the first dump
    dump = corpus.query(und_query, context_break='s')
    second = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    # concordance rebuilt from the dump of the second query
    concordance = Concordance(corpus, dump.df)
    third = concordance.lines(cut_off=1, form='dataframes')['df'].iloc[0]

    assert first.equals(second)
    assert not second.equals(third)
def test_concordance_lines(germaparl):
    """Exercise the default ('raw'), 'simple' and 'kwic' line formats."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, dump.df)

    # no form given: lines come back with a 'raw' column
    raw_lines = concordance.lines()
    assert len(raw_lines) > 10
    assert 'raw' in raw_lines.columns
    first_raw = raw_lines.iloc[0]['raw']
    for key in ('cpos', 'match', 'word'):
        assert key in first_raw

    # simple: one 'text' column, limited by cut_off
    simple_lines = concordance.lines(form='simple', cut_off=10)
    assert 'text' in simple_lines.columns
    assert len(simple_lines) == 10

    # kwic: left / node / right columns
    kwic_lines = concordance.lines(form='kwic', cut_off=10)
    for column in ('left', 'node', 'right'):
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic plus a structural attribute as additional column
    kwic_meta = concordance.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta) == 10
    assert 'text_id' in kwic_meta.columns
def test_concordance_order(germaparl):
    """An unknown ``order`` value must raise NotImplementedError."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    with pytest.raises(NotImplementedError):
        concordance.lines(order='fail')
示例#4
0
def test_concordance_form_kwic(germaparl):
    """Randomly ordered 'kwic' lines: expect all 13 hits below the cut-off."""
    corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s')
    kwic_lines = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='kwic'
    )
    assert len(kwic_lines) == 13
示例#5
0
def test_concordance_anchors_weird(germaparl):
    """Query with the anchor numbers 9, 2 and 5; still expect 13 lines."""
    corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @9[lemma="CDU"] "/" @2".*" @5[word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s')
    frame_lines = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='dataframe'
    )
    assert len(frame_lines) == 13
示例#6
0
def test_concordance_anchors(germaparl):
    """Anchored query rendered as 'dataframe' lines with lemma and pos."""
    corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s')
    frame_lines = Concordance(corpus, dump.df).lines(
        p_show=['lemma', 'pos'], form='dataframe'
    )
    assert len(frame_lines) == 13
示例#7
0
def test_concordance_many(germaparl):
    """A frequent lemma with default arguments yields exactly 100 lines."""
    corpus = get_corpus(germaparl)
    dump = corpus.query('[lemma="oder"]')
    default_lines = Concordance(corpus, dump.df).lines()
    assert len(default_lines) == 100
def test_concordance_last(germaparl):
    """Lines requested with ``order='last'`` come back as a DataFrame.

    The type check uses ``isinstance`` rather than comparing ``type()``
    results, so DataFrame subclasses are accepted as well.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    assert isinstance(conc.lines(order='last'), pd.DataFrame)
示例#9
0
def test_concordance_p_atts(germaparl):
    """Requested positional attributes show up as columns of each line frame."""
    corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] [lemma="CDU"] "/" ".*" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s')
    frame_lines = Concordance(corpus, dump.df).lines(
        p_show=['lemma', 'pos'], form='dataframe'
    )
    first_frame = frame_lines.iloc[0]['dataframe']
    assert 'pos' in first_frame.columns
    assert len(frame_lines) == 13
示例#10
0
def test_concordance_fallback(germaparl):
    """'simple' lines requested with two p-attributes still give a DataFrame."""
    corpus = get_corpus(germaparl)
    dump = corpus.query('[lemma="Gerhard"]? [lemma="Schröder"]')
    concordance = Concordance(corpus, dump.df)
    simple_lines = concordance.lines(
        order='last', form='simple', p_show=['word', 'lemma']
    )
    assert isinstance(simple_lines, pd.DataFrame)
示例#11
0
def test_concordance_p_slots(germaparl):
    """``lines(p_slots='lemma')`` is expected to return None here."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    result = concordance.lines(p_slots='lemma')
    assert result is None
示例#12
0
def test_concordance_empty(germaparl):
    """For the 'NAHH' query, ``lines()`` is expected to return None."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    result = concordance.lines()
    assert result is None
示例#13
0
def test_concordance_form_simple(germaparl):
    """Randomly ordered 'simple' lines: expect all 13 hits below the cut-off."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" @2".*" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s')
    simple_lines = Concordance(corpus, dump.df).lines(
        order='random', cut_off=100, form='simple'
    )
    assert len(simple_lines) == 13
示例#14
0
def test_concordance_many(germaparl):
    """A frequent lemma with default arguments yields exactly 100 lines."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    dump = corpus.query('[lemma="oder"]')
    default_lines = Concordance(corpus, dump.df).lines()
    assert len(default_lines) == 100
示例#15
0
def test_concordance_fallback(germaparl):
    """'simple' lines requested with two p-attributes still give a DataFrame.

    The type check uses ``isinstance`` rather than comparing ``type()``
    results, so DataFrame subclasses are accepted as well.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[lemma="Gerhard"]? [lemma="Schröder"]'
    )
    dump = corpus.query(query)
    conc = Concordance(corpus, dump.df)
    lines = conc.lines(order='last', form='simple', p_show=['word', 'lemma'])
    assert isinstance(lines, pd.DataFrame)
示例#16
0
def test_concordance_meta(germaparl):
    """Structural attributes requested via ``s_show`` become columns."""
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    cqp_query = '[word="\\["] [lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = corpus.query(cqp_query)
    meta_lines = Concordance(corpus, dump.df).lines(s_show=['text_name', 'p_type'])
    for column in ('text_name', 'p_type'):
        assert column in meta_lines.columns
    assert len(meta_lines) == 13
示例#17
0
def test_concordance_lines_dataframes(germaparl):
    """'dataframes' lines carry a 'df' column whose cells are DataFrames.

    The type check uses ``isinstance`` rather than comparing ``type()``
    results, so DataFrame subclasses are accepted as well.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    )
    result = corpus.query(query, context_break='s',
                          match_strategy='longest')
    concordance = Concordance(corpus, result.df)
    lines = concordance.lines(form='dataframes', s_show=['text_id'], cut_off=10)
    assert 'df' in lines.columns
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)
示例#18
0
    def concordance(self,
                    context=None,
                    matches=None,
                    p_show=None,
                    s_show=None,
                    order='random',
                    cut_off=100,
                    form='dataframes'):
        """Return concordance lines for ``self.dump``.

        :param context: context size to use around each match; ``None`` means
            ``self.parameters['context']``. Values larger than that fall back
            to it with a warning; smaller values narrow the stored
            ``context`` / ``contextend`` columns accordingly.
        :param matches: passed through to ``Concordance.lines``
        :param p_show: passed through to ``Concordance.lines``;
            ``None`` means ``['word']``
        :param s_show: passed through to ``Concordance.lines``;
            ``None`` means ``[]``
        :param order: passed through to ``Concordance.lines``
        :param cut_off: passed through to ``Concordance.lines``
        :param form: passed through to ``Concordance.lines``
        :return: whatever ``Concordance.lines`` returns
        """
        # Build fresh lists per call rather than sharing mutable defaults
        # with downstream code.
        p_show = ['word'] if p_show is None else p_show
        s_show = [] if s_show is None else s_show

        # deal with context: never exceed the context the dump was made with
        max_context = self.parameters['context']
        if context is None:
            context = max_context
        if context > max_context:
            logger.warning("out of context; falling back to context=%d",
                           max_context)
            context = max_context

        if context < max_context:
            # reset_index() returns a new frame, so self.dump.df is untouched
            df = self.dump.df.reset_index()
            df['context_new'] = df['match'] - context
            df['contextend_new'] = df['matchend'] + context
            # the narrowed window must stay inside the stored context
            df['context'] = df[['context', 'context_new']].max(axis=1)
            df['contextend'] = df[['contextend', 'contextend_new']].min(axis=1)
            df = df.drop(['context_new', 'contextend_new'], axis=1)
            df = df.set_index(['match', 'matchend'])
        else:
            df = self.dump.df

        conc = Concordance(self.dump.corpus.copy(), df)
        return conc.lines(matches=matches,
                          p_show=p_show,
                          s_show=s_show,
                          p_text=None,
                          p_slots=None,
                          slots=[],
                          order=order,
                          cut_off=cut_off,
                          form=form)
示例#19
0
def test_concordance_lines_extended(germaparl):
    """'extended' lines carry the per-line frame, anchor and text columns.

    The type check uses ``isinstance`` rather than comparing ``type()``
    results, so DataFrame subclasses are accepted as well.
    """
    corpus = Corpus(germaparl['corpus_name'], registry_path=germaparl["registry_path"])
    query = (
        '[word="\\["] @1[lemma="CDU"] "/" @2 ".*" @3[word="\\]"]'
    )
    result = corpus.query(query, context_break='s',
                          match_strategy='longest')
    concordance = Concordance(corpus, result.df)
    p_slots = 'lemma'
    slots = {'test': [1, 3]}
    lines = concordance.lines(form='extended',
                              p_show=['word', 'lemma'],
                              s_show=['text_id'],
                              slots=slots,
                              p_text='word',
                              p_slots=p_slots,
                              cut_off=10)

    # per-line dataframe
    assert 'df' in lines.columns
    assert isinstance(lines['df'].iloc[0], pd.DataFrame)

    # anchor 3 from the query has a column of its own
    assert 3 in lines.columns

    # column named 'text' (p_text='word' was requested)
    assert 'text' in lines.columns
示例#20
0
def test_concordance_lines(germaparl):
    """Exercise the default, 'kwic', 'slots', 'dict' and 'dataframe' formats."""
    corpus = get_corpus(germaparl)
    cqp_query = '[word="\\["] @1[lemma="CDU"] "/" "CSU" [word="\\]"]'
    dump = corpus.query(cqp_query, context_break='s', match_strategy='longest')
    concordance = Concordance(corpus, dump.df)

    # default = simple
    default_lines = concordance.lines()
    assert len(default_lines) > 10
    assert 'word' in default_lines.columns

    # kwic
    kwic_lines = concordance.lines(form='kwic', cut_off=10)
    for column in ('left_word', 'node_word', 'right_word'):
        assert column in kwic_lines.columns
    assert len(kwic_lines) == 10

    # kwic with s-attribute
    kwic_meta = concordance.lines(form='kwic', s_show=['text_id'], cut_off=10)
    assert len(kwic_meta) == 10
    assert 'text_id' in kwic_meta.columns

    # slots
    slot_lines = concordance.lines(form='slots', s_show=['text_id'], cut_off=10)
    assert len(slot_lines) == 10
    for column in ('match..matchend_word', '1_word'):
        assert column in slot_lines.columns

    # dict
    dict_lines = concordance.lines(form='dict', s_show=['text_id'], cut_off=10)
    assert len(dict_lines) == 10
    for column in ('dict', 'text_id'):
        assert column in dict_lines.columns

    # dataframe
    frame_lines = concordance.lines(form='dataframe', s_show=['text_id'], cut_off=10)
    assert len(frame_lines) == 10
    for column in ('dataframe', 'text_id'):
        assert column in frame_lines.columns
示例#21
0
    def concordance(self,
                    window=5,
                    matches=None,
                    p_show=None,
                    s_show=None,
                    order='random',
                    cut_off=100,
                    form='dataframes'):
        """Return concordance lines annotated with topic/discourseme matches.

        ``self.df_nodes`` has duplicate entries per match, so this
        (1) converts the nodes to one row per (match, matchend) with one set
            of positions per discourseme, and
        (2) converts each line to a dataframe and adds one boolean column
            for the topic and one per discourseme.

        :param window: key into ``self.df_nodes``; when missing, the nodes are
            obtained from ``self.slice_discs(window)``
        :param matches: passed through to ``Concordance.lines``
        :param p_show: passed through to ``Concordance.lines``;
            ``None`` means ``['word']``
        :param s_show: passed through to ``Concordance.lines``;
            ``None`` means ``[]``
        :param order: passed through to ``Concordance.lines``
        :param cut_off: passed through to ``Concordance.lines``
        :param form: passed through to ``Concordance.lines``
            NOTE(review): the post-processing below reads a 'df' column from
            the returned lines -- presumably only present for
            ``form='dataframes'``; confirm before using other forms.
        :return: the lines with the 'df' column replaced by annotated frames
        """
        # Build fresh lists per call rather than sharing mutable defaults
        # with downstream code.
        p_show = ['word'] if p_show is None else p_show
        s_show = [] if s_show is None else s_show

        # make sure we're having the right context
        if window not in self.df_nodes.keys():
            df_nodes = self.slice_discs(window).copy()
        else:
            df_nodes = self.df_nodes[window]

        # get ids of all discoursemes
        disc_ids = set(self.discoursemes.keys())

        logger.info("converting discourse nodes to regular dump")
        # TODO speed up
        rows = []
        for match in set(df_nodes['match']):
            df_loc = df_nodes.loc[df_nodes.match == match]
            # match-level columns are read from the first duplicate row
            first = df_loc.iloc[0]
            row = {
                'match': match,
                'matchend': first['matchend'],
                'context_id': first['context_id'],
                'context': first['context'],
                'contextend': first['contextend'],
            }
            for idx in disc_ids:
                # all positions covered by this discourseme (ends inclusive)
                positions = set()
                for start, end in zip(df_loc['match_' + idx],
                                      df_loc['matchend_' + idx]):
                    positions.update(range(start, end + 1))
                row[idx] = positions
            rows.append(row)
        df = DataFrame(rows)
        df = df.set_index(["match", "matchend"])

        logger.info("converting each line to dataframe")
        conc = Concordance(self.corpus.copy(), df)
        lines = conc.lines(matches=matches,
                           p_show=p_show,
                           s_show=s_show,
                           p_text=None,
                           p_slots=None,
                           slots=[],
                           order=order,
                           cut_off=cut_off,
                           form=form)

        logger.info("inserting discourseme and window/context info")
        # TODO mark out of context
        dfs = []
        for (match, matchend), line in lines.iterrows():
            df = line['df']
            # indicate topic matches
            df[self.topic.idx] = df.index.isin(set(range(match, matchend + 1)))
            # indicate discourseme matches
            for idx in disc_ids:
                df[idx] = df.index.isin(line[idx])
            df = df.drop(['match', 'matchend', 'context', 'contextend'],
                         axis=1)
            dfs.append(df)
        lines['df'] = dfs

        return lines
示例#22
0
def test_concordance_order(germaparl):
    """Request lines with the unsupported ``order='fail'``.

    No assertion is made in the function body; any exception raised by
    ``lines`` propagates to the test runner.
    """
    corpus = get_corpus(germaparl)
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    concordance.lines(order='fail')
示例#23
0
def test_concordance_empty(germaparl):
    """For the 'NAHH' query, ``lines()`` is expected to be an empty frame."""
    corpus = get_corpus(germaparl)
    cqp_query = '[lemma="Gerhard"]? [lemma="NAHH"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    result = concordance.lines()
    assert result.empty
示例#24
0
def test_concordance_last(germaparl):
    """Lines requested with ``order='last'`` come back as a DataFrame."""
    corpus = get_corpus(germaparl)
    cqp_query = '[lemma="Gerhard"]? [lemma="Schröder"]'
    concordance = Concordance(corpus, corpus.query(cqp_query).df)
    last_lines = concordance.lines(order='last')
    assert isinstance(last_lines, pd.DataFrame)