Exemplo n.º 1
0
def test_collo_combo(germaparl):
    """Collocates built with a list of two attributes yield a DataFrame.

    The third argument to ``Collocates`` is the list ``['lemma', 'pos']``
    instead of a single attribute name.
    """
    corpus = get_corpus(germaparl)
    # any token sequence enclosed in literal parentheses
    query = '[word="\\("] [lemma=".*"]+ [word="\\)"]'
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, ['lemma', 'pos'])
    table = collocates.show(order='log_likelihood')
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
Exemplo n.º 2
0
def test_collocates_empty(germaparl):
    """Smoke test: ``Collocates.show()`` must not raise for this query.

    The query requires a token with lemma "NAHH"; judging by the test name
    it is presumably expected to match nothing in the corpus.
    """
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl['registry_path'])
    matches = germaparl_corpus.query('[lemma="Armin"]? [lemma="NAHH"]')
    analysis = Collocates(germaparl_corpus, matches.df, p_query='word')
    # no assertion: the test passes as long as nothing raises
    analysis.show()
Exemplo n.º 3
0
def test_collocates_no_mws(germaparl):
    """Smoke test: ``Collocates`` can be built and shown with ``mws=None``."""
    germaparl_corpus = Corpus(germaparl['corpus_name'],
                              registry_path=germaparl['registry_path'])
    # optional first name followed by the surname
    matches = germaparl_corpus.query('[lemma="Armin"]? [lemma="Laschet"]')
    analysis = Collocates(germaparl_corpus,
                          matches.df,
                          p_query='word',
                          mws=None)
    # no assertion: the test passes as long as nothing raises
    analysis.show()
Exemplo n.º 4
0
def test_collocates_speed_many():
    """Collocates of lemma "sagen" within a window of 2 yield a DataFrame.

    Opens the corpus "GERMAPARL_1114" by name (no fixture is used), so that
    corpus must be available to ``Corpus``.
    """
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="sagen"]'
    # context is limited to 2 tokens and does not cross 's' boundaries
    df_dump = corpus.query(query, context=2, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    table = collocates.show(window=2, cut_off=50)
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
Exemplo n.º 5
0
def test_query_logging(germaparl):
    """``Collocates`` still yields a usable table when given 'fail'.

    The third argument is the string 'fail'; the test nevertheless expects
    a DataFrame whose index contains 'Dr.'.
    NOTE(review): presumably ``Collocates`` logs the unknown attribute and
    falls back to a default one -- confirm against its implementation.
    """
    corpus = get_corpus(germaparl)
    # any token sequence enclosed in literal parentheses
    query = '[word="\\("] [lemma=".*"]+ [word="\\)"]'
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'fail')
    table = collocates.show(order='log_likelihood', window=15)
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
    assert 'Dr.' in table.index
Exemplo n.º 6
0
def test_collocates_speed_many(germaparl):
    """Collocates of lemma "die" with a wide window (50) and ``mws=100``.

    Expects ',' as the first index entry of the resulting DataFrame.
    """
    corpus = get_corpus(germaparl)
    query = '[lemma="die"]'
    df_dump = corpus.query(query, context_break='text').df
    collocates = Collocates(corpus, df_dump, p_query='lemma', mws=100)
    table = collocates.show(window=50, cut_off=50)
    assert table.index[0] == ','
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
Exemplo n.º 7
0
def test_query_default(germaparl):
    """Lemma collocates of parenthesised sequences include 'Dr.'.

    The corpus is constructed directly from the fixture's 'corpus_name' and
    'registry_path' entries.
    """
    corpus = Corpus(germaparl['corpus_name'],
                    registry_path=germaparl['registry_path'])
    # any token sequence enclosed in literal parentheses
    query = '[word="\\("] [lemma=".*"]+ [word="\\)"]'
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'lemma')
    table = collocates.show(order='log_likelihood')
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
    assert 'Dr.' in table.index
Exemplo n.º 8
0
def test_collo_single(germaparl):
    """Lemma collocates of parenthesised sequences include 'Dr.'.

    Also prints the resulting table, which is only visible when pytest
    output capturing is disabled.
    """
    corpus = get_corpus(germaparl)
    # any token sequence enclosed in literal parentheses
    query = '[word="\\("] [lemma=".*"]+ [word="\\)"]'
    df_dump = corpus.query(query).df
    collocates = Collocates(corpus, df_dump, 'lemma')
    table = collocates.show(order='log_likelihood')
    print(table)
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
    assert 'Dr.' in table.index
Exemplo n.º 9
0
def test_collocates_pp(germaparl):
    """Word collocates of "SPD" with and without the ``%cd`` flags.

    Without flags, 'Die' and 'die' are separate items and 'Die' has the
    smaller O11 count.  With ``flags="%cd"`` only 'die' is expected in the
    index.
    """
    corpus = get_corpus(germaparl)
    dump = corpus.query('"SPD"')
    collocates = Collocates(corpus, dump.df, p_query='word')

    # without flags: both spellings are present as distinct items
    table = collocates.show(order='log_likelihood', cut_off=None)
    capitalised_count = int(table.loc['Die']['O11'])
    lowercase_count = int(table.loc['die']['O11'])
    assert capitalised_count < lowercase_count

    # with flags: only the lower-case item remains
    table = collocates.show(order='log_likelihood', cut_off=None, flags="%cd")
    assert 'die' in table.index
    assert 'Die' not in table.index
Exemplo n.º 10
0
def test_collocates_nodes(germaparl):
    """Items matched by the query are absent from the collocate table.

    Checks that ',' and '(' -- two of the four lemmas the query matches --
    do not appear in the index of the result.
    """
    corpus = get_corpus(germaparl)

    # four alternatives: comma, full stop, closing and opening parenthesis
    node_query = '[lemma=","] | [lemma="\\."] | [lemma="\\)"] | [lemma="\\("]'
    matches = corpus.query(node_query)

    table = Collocates(corpus, matches.df).show(cut_off=None)
    for node in (",", "("):
        assert node not in table.index
0
def test_collocates_mwu(germaparl):
    """Collocates of a multi-word unit: "CDU", optionally "/" and "CSU".

    Uses the 'longest' match strategy so that optional tokens are included
    in the match.  Expects more than nine collocates, 'CSU' among them, and
    an 'in_nodes' count for 'CSU' that exceeds its 'f' count.
    """
    corpus = get_corpus(germaparl)
    query = '[lemma="CDU"] "/"? [lemma="CSU"]?'
    result = corpus.query(query, match_strategy='longest')
    collocates = Collocates(corpus, result.df, 'lemma')
    table = collocates.show(order='log_likelihood', cut_off=None)
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses
    assert isinstance(table, pd.DataFrame)
    assert len(table) > 9
    assert 'CSU' in table.index
    assert int(table.loc['CSU']['in_nodes']) > int(table.loc['CSU']['f'])
Exemplo n.º 12
0
    def collocates(self,
                   window=5,
                   order='f',
                   cut_off=100,
                   p_query="lemma",
                   ams=None,
                   min_freq=2,
                   frequencies=True,
                   flags=None):
        """Return the collocate table for this object's dump.

        Builds a ``Collocates`` instance from a copy of ``self.dump.corpus``,
        the dump's ``df`` and ``p_query``, then forwards all remaining
        arguments unchanged to ``Collocates.show``.

        :return: whatever ``Collocates.show`` returns for these settings
        """
        # work on a copy of the corpus object rather than the dump's own
        analysis = Collocates(self.dump.corpus.copy(), self.dump.df, p_query)

        show_options = {
            'window': window,
            'order': order,
            'cut_off': cut_off,
            'ams': ams,
            'min_freq': min_freq,
            'frequencies': frequencies,
            'flags': flags,
        }
        return analysis.show(**show_options)
Exemplo n.º 13
0
def compare_counts(lemma, window, min_freq=0):
    """Check CCC collocate counts for ``lemma`` against UCS gold counts.

    Queries the corpus "GERMAPARL_1114" for ``[lemma="<lemma>"]``, computes
    collocates, reads the matching gold file
    ``tests/gold/ucs-germaparl1114-<lemma>.ds.gz`` and asserts two
    identities between the counting strategies:

    (1) N_ccc + f1_ccc = N_ucs
    (2) f1_infl_ccc = f1_infl_ucs - O11_ucs_node

    :param lemma: node lemma; also selects the gold file
    :param window: context size passed to ``corpus.query``
    :param min_freq: minimum frequency passed to ``Collocates.show``
    :raises AssertionError: if one of the identities does not hold
    """
    # TODO: update to reproducible example

    # CCC
    corpus = Corpus("GERMAPARL_1114")
    query = '[lemma="%s"]' % lemma
    df_dump = corpus.query(query, context=window, context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    # NOTE(review): show() uses a fixed window of 5 although the dump is
    # built with context=window -- confirm whether window=window was meant
    col = collocates.show(window=5, cut_off=None, min_freq=min_freq)

    # UCS
    ucs = pd.read_csv("tests/gold/ucs-germaparl1114-" + lemma + ".ds.gz",
                      sep="\t",
                      index_col=2,
                      comment="#",
                      quoting=3,
                      na_filter=False)
    ucs.index.name = 'item'
    # the node itself may be listed among its own collocates in the gold
    # data; remember that count and remove the row
    try:
        O11_ucs_node = ucs.loc[lemma]['f']
    except KeyError:
        O11_ucs_node = 0
    else:
        ucs = ucs.drop(lemma)

    # identities that should hold between counting strategies
    # (1) N_ccc + f1_ccc = N_ucs
    # (2) f1_infl_ccc = f1_infl_ucs - O11_ucs_node
    # scalars are read from the Series (not from a one-element array) so
    # that int() receives a true scalar
    nr = {
        'f1_ccc': int(corpus.marginals([lemma], "lemma")['freq'].values[0]),
        'N_ccc': int(col['N'].values[0]),
        'f1_infl_ccc': int(col['f1'].values[0]),
        'N_ucs': int(ucs['N'].values[0]),
        'f1_infl_ucs': int(ucs['f1'].values[0]),
        'O11_ucs_node': O11_ucs_node
    }

    # make dataframes comparable (the result is not used by the assertions
    # below); built without in-place mutation of a column-sliced frame
    ucs = ucs[['f', 'f2']].rename(columns={'f': 'O11'})
    ucs = ucs.sort_values(by=['O11', 'item'], ascending=False)

    assert nr['N_ccc'] + nr['f1_ccc'] == nr['N_ucs']
    assert nr['f1_infl_ccc'] == nr['f1_infl_ucs'] - nr['O11_ucs_node']
Exemplo n.º 14
0
def _assert_count_identities(corpus, ucs_counts, lemma, context, min_freq):
    """Assert the CCC/UCS count identities for one node ``lemma``.

    :param corpus: corpus object to query
    :param ucs_counts: mapping from lemma to a frame of UCS gold counts
        with columns 'f_ucs', 'f2_ucs', 'N_ucs' and 'f1_ucs'
    :param lemma: node lemma to query for
    :param context: used both as query context and as collocation window
    :param min_freq: minimum frequency passed to ``Collocates.show``
    """
    df_dump = corpus.query('[lemma="%s"]' % lemma,
                           context=context,
                           context_break='s').df
    collocates = Collocates(corpus, df_dump, p_query='lemma')
    counts = collocates.show(window=context, cut_off=None, min_freq=min_freq)[[
        'O11', 'O12', 'O21', 'O22', 'in_nodes'
    ]]
    counts = counts.join(ucs_counts[lemma])
    # gold-data row of the node itself and the node's corpus frequency
    ucs_node_cooc = ucs_counts[lemma].loc[lemma]
    ccc_node_freq = corpus.marginals([lemma], "lemma")['freq'].values[0]

    assert counts['O11'].equals(counts['f_ucs'])
    assert (counts['O11'] + counts['O21']).equals(counts['f2_ucs'])
    assert (counts['O11'] + counts['O12'] + counts['O21'] + counts['O22'] +
            ccc_node_freq).equals(counts['N_ucs'])
    assert (counts['O11'] + counts['O12']).equals(counts['f1_ucs'] -
                                                  ucs_node_cooc['f_ucs'])


def test_compare_counts(germaparl, ucs_counts):
    """Compare CCC counts with UCS gold counts for "Land" and "und".

    Identities that should hold between counting strategies:
    O11 = f_ucs
    O11 + O21 = f2_ucs
    O11 + O12 + O21 + O22 + freq[node] = N_ucs
    O11 + O12 = f1_ucs - O11_ucs[node]
    """
    corpus = get_corpus(germaparl)

    # [lemma="Land"]
    _assert_count_identities(corpus, ucs_counts, "Land",
                             context=10, min_freq=0)

    # [lemma="und"]
    _assert_count_identities(corpus, ucs_counts, "und",
                             context=5, min_freq=2)
Exemplo n.º 15
0
def test_collocates_persistence(germaparl):
    """A ``Collocates`` object keeps showing the dump it was built from.

    Running another query on the same corpus must not change what an
    existing ``Collocates`` instance shows; only a new instance built from
    the new dump gives a different table.
    """
    corpus = get_corpus(germaparl)
    spd_query = '"SPD"'
    csu_query = '"CSU"'

    # collocates of the first query
    spd_dump = corpus.query(spd_query, context_break='s').df
    collocates = Collocates(corpus, spd_dump, 'lemma')
    table_first = collocates.show()

    # a second query is run, but the existing object is shown again:
    # still the collocates of the first query
    csu_dump = corpus.query(csu_query, context_break='s').df
    table_repeat = collocates.show()

    # a fresh object built from the second dump: collocates of that query
    collocates = Collocates(corpus, csu_dump, 'lemma')
    table_second = collocates.show()

    assert table_first.equals(table_repeat)
    assert not table_repeat.equals(table_second)
Exemplo n.º 16
0
def test_collocates_no_mws(germaparl):
    """Smoke test: ``Collocates`` can be built and shown with ``mws=None``."""
    corpus = get_corpus(germaparl)
    # optional first name followed by the surname
    matches = corpus.query('[lemma="Armin"]? [lemma="Laschet"]')
    analysis = Collocates(corpus, matches.df, p_query='word', mws=None)
    # no assertion: the test passes as long as nothing raises
    analysis.show()
Exemplo n.º 17
0
def test_collocates_empty(germaparl):
    """Smoke test: ``Collocates.show()`` must not raise for this query.

    The query requires a token with lemma "NAHH"; judging by the test name
    it is presumably expected to match nothing in the corpus.
    """
    corpus = get_corpus(germaparl)
    matches = corpus.query('[lemma="Armin"]? [lemma="NAHH"]')
    analysis = Collocates(corpus, matches.df, p_query='word')
    # no assertion: the test passes as long as nothing raises
    analysis.show()