Example #1
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc


def test_corpus_pass_tmpreproc():
    c = Corpus()
    c['doc1'] = 'A simple example in simple English.'
    c['doc2'] = 'It contains only three very simple documents.'
    c['doc3'] = 'Simply written documents are very brief.'

    preproc = TMPreproc(c)
    tok = preproc.tokenize().tokens
    assert set(tok.keys()) == set(c.keys())
    assert len(tok['doc1']) == 7
Example #2
import matplotlib.pyplot as plt

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models  # lda backend assumed, given the metrics used below
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
  # load the corpus from the given file(s)
  corpus = Corpus()
  corpus.add_files(file_name, encoding='utf8')
  # preprocess the corpus and get its document-term matrix
  preproc = TMPreproc(corpus)
  dtm_bg = preproc.dtm
  # vary the number of topics for the candidate models
  var_params = [{'n_topics': k} for k in range(5, int(n_eval*10), n_eval)]
  # parameters kept constant across all candidate models
  const_params = {
    'n_iter': n_iter,
    'random_state': 20200713  # to make results reproducible
  }
  # evaluate an LDA model for each candidate number of topics;
  # set return_models=True to also keep the fitted models in the results
  eval_results = evaluate_topic_models(dtm_bg,
                                       varying_parameters=var_params,
                                       constant_parameters=const_params,
                                       metric=['loglikelihood', 'cao_juan_2009', 'arun_2010'])

  # index the evaluation results by the varying parameter 'n_topics'
  eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')
  # plot the evaluation results and save the figure
  name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
  plot_eval_results(eval_results_by_topics, figsize=(8, 6),
                    metric_direction_font_size='x-small',
                    title_fontsize='small',
                    axes_title_fontsize='x-small')
  plt.tight_layout()
  plt.savefig('out/'+name)
  return
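The function above only defines the pipeline; the line below is a minimal, hedged usage sketch. The input file path, date string and out/ folder are made-up placeholders, not taken from the original script.

# hypothetical call (placeholder path and labels); expects an existing out/ directory for the saved figure
evaluate_model('data/corpus_de.txt', date='20200713', n_iter=1000, scope='full', lang='de')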
Example #3
def _check_save_load_state(preproc, repeat=1, recreate_from_state=False):
    # copy simple attribute states
    simple_state_attrs = ('language', 'stopwords', 'punctuation',
                          'special_chars', 'n_workers', 'tokenized',
                          'pos_tagged', 'ngrams_generated', 'ngrams_as_tokens')
    pre_state = {
        attr: deepcopy(getattr(preproc, attr))
        for attr in simple_state_attrs
    }

    # copy complex attribute states
    pre_state['docs'] = deepcopy(preproc.docs)

    if preproc.tokenized:
        pre_state['tokens'] = preproc.tokens
        pre_state['vocabulary'] = preproc.vocabulary
    if preproc.pos_tagged:
        pre_state['tokens_with_pos_tags'] = preproc.tokens_with_pos_tags
    if preproc.ngrams_generated:
        pre_state['ngrams'] = preproc.ngrams

    # save and then load the same state
    for _ in range(repeat):
        if recreate_from_state:
            preproc.save_state(TMPREPROC_TEMP_STATE_FILE)
            preproc = TMPreproc.from_state(TMPREPROC_TEMP_STATE_FILE)
        else:
            preproc.save_state(TMPREPROC_TEMP_STATE_FILE).load_state(
                TMPREPROC_TEMP_STATE_FILE)

    # check if states are the same now
    for attr in simple_state_attrs:
        assert pre_state[attr] == getattr(preproc, attr)

    assert set(pre_state['docs'].keys()) == set(preproc.docs.keys())
    assert preproc.n_docs == len(pre_state['docs'])
    assert all(pre_state['docs'][k] == preproc.docs[k]
               for k in preproc.docs.keys())

    if preproc.tokenized:
        assert set(pre_state['tokens'].keys()) == set(preproc.tokens.keys())
        assert all(pre_state['tokens'][k] == preproc.tokens[k]
                   for k in preproc.tokens.keys())

        assert pre_state['vocabulary'] == preproc.vocabulary

    if preproc.pos_tagged:
        assert set(pre_state['tokens_with_pos_tags'].keys()) == set(
            preproc.tokens_with_pos_tags.keys())
        assert all(pre_state['tokens_with_pos_tags'][k] ==
                   preproc.tokens_with_pos_tags[k]
                   for k in preproc.tokens_with_pos_tags.keys())

    if preproc.ngrams_generated:
        assert set(pre_state['ngrams'].keys()) == set(preproc.ngrams.keys())
        assert all(pre_state['ngrams'][k] == preproc.ngrams[k]
                   for k in preproc.ngrams.keys())
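A hedged sketch of how this helper might be driven from a test; it assumes a TMPreproc fixture such as tmpreproc_en from example #14 below and that TMPREPROC_TEMP_STATE_FILE points to a writable temporary file.

def test_save_load_state_roundtrip(tmpreproc_en):
    # tokenize first so the token-related attributes become part of the compared state
    tmpreproc_en.tokenize()
    _check_save_load_state(tmpreproc_en)

    # also check rebuilding a fresh TMPreproc instance from the saved state file
    _check_save_load_state(tmpreproc_en, recreate_from_state=True)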
Example #4
"""
An example for preprocessing documents in German and generating a document-term matrix (DTM).
"""
from pprint import pprint

import pandas as pd
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.utils import pickle_data

if __name__ == '__main__':  # this is necessary for multiprocessing on Windows!
    corpus = {
        u'doc1': u'Ein einfaches Beispiel in einfachem Deutsch.',
        u'doc2': u'Es enthält nur drei sehr einfache Dokumente.',
        u'doc3': u'Die Dokumente sind sehr kurz.',
    }

    preproc = TMPreproc(corpus, language='german')

    print('tokenized:')
    preproc.tokenize()
    pprint(preproc.tokens)

    # preproc.stem()
    # pprint(preproc.tokens)

    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)
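    # The docstring promises a DTM, but the excerpt stops after lemmatization.
    # A hedged continuation, mirroring the English variant of this script in example #11 below:
    preproc.tokens_to_lowercase().clean_tokens()

    print('DTM:')
    doc_labels, vocab, dtm = preproc.get_dtm()

    # pandas (imported above) is used only for a readable tabular display of the sparse matrix
    print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))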
Example #5
    print(corpus.docs.keys())
    print("-----")

    corpus.split_by_paragraphs()
    print("documents split into paragraphs")
    print(corpus.docs.keys())
    print("-----")

    print("first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(corpus.docs[doclabel])
    print("-----")

    preproc = TMPreproc(corpus.docs, language=u'german')
    preproc.tokenize().tokens_to_lowercase()

    print("tokenized first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(preproc.tokens[doclabel])

    preproc.generate_ngrams(2, join=False).use_ngrams_as_tokens(join=True)

    print("bigrams from first 5 paragraphs of Werther:")
    for par_num in range(1, 6):
        doclabel = u'werther-goethe_werther1-%d' % par_num
        print(u"par%d (document label '%s'):" % (par_num, doclabel))
        print(preproc.tokens[doclabel])
Example #6
#%%

timings = []
timing_labels = []


def add_timing(label):
    timings.append(datetime.today())
    timing_labels.append(label)


#%%

add_timing('start')

preproc = TMPreproc(corpus, n_max_processes=cpu_count())
add_timing('load')

preproc.tokenize()
add_timing('tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc.remove_special_chars_in_tokens()
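The excerpt stops after the last processing step; a hedged completion below records that step and prints the elapsed time between the recorded points. The printing loop is illustrative, not part of the original script.

add_timing('remove_special_chars_in_tokens')

# illustrative only: print the elapsed seconds between consecutive timing points
for prev, cur, label in zip(timings[:-1], timings[1:], timing_labels[1:]):
    print('%s: %.2f sec' % (label, (cur - prev).total_seconds()))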
Example #7
"""
Script that generates "eval_table/eval_table.csv" from text samples in folder "eval_texts". This table is later
used to manually add correct lemmata.

Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung
January 2019
"""

import pandas as pd
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

corpus = Corpus.from_folder('eval_texts')

preproc = TMPreproc(corpus.docs, language='german')

postagged = preproc.tokenize().pos_tag()
postagged = postagged.filter_for_pos({'N', 'V', 'ADJ', 'ADV'})

# collect one dataframe per document and concatenate them
# (DataFrame.append was removed in recent pandas versions, so pd.concat is used instead)
tok_pos_parts = []
for doc_id, tok_pos in postagged.tokens_with_pos_tags.items():
    tok, pos = zip(*tok_pos)
    tok_pos_parts.append(pd.DataFrame({
        'doc_id': doc_id,
        'token': tok,
        'pos': pos
    }))

tok_pos_df = pd.concat(tok_pos_parts, ignore_index=True)

tok_pos_df.drop_duplicates(['token', 'pos'], inplace=True)

tok_pos_df.to_csv('eval_table/eval_table.csv')
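Since the docstring says the table is later used to manually add correct lemmata, below is a hedged sketch of reading the completed table back in; the 'lemma' column name is an assumption and not part of the generated file.

# hypothetical follow-up: read the manually completed table back in
# (the 'lemma' column is assumed to be filled in by hand)
eval_table = pd.read_csv('eval_table/eval_table.csv', index_col=0)
lemmata_lookup = dict(zip(zip(eval_table.token, eval_table.pos), eval_table.lemma))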
Example #8
        dtm, vocab, doc_labels = load_dtm_from_pickle(DTM_PICKLE)
    else:
        europarl = nltk.corpus.util.LazyCorpusLoader(
            'europarl_raw', nltk.corpus.EuroparlCorpusReader, fileids=FILEIDS)

        corpus = Corpus(
            {f: europarl.raw(f_id)
             for f, f_id in zip(FILES, FILEIDS)})

        print("all loaded documents:")
        for dl, text in corpus.docs.items():
            print("%s: %d chars" % (dl, len(text)))
        print("-----")

        start_time = time.time()
        preproc = TMPreproc(corpus.docs, language=u'german')
        print('tokenizing...')
        preproc.tokenize()
        print('POS tagging...')
        preproc.pos_tag()
        print('lemmatization...')
        preproc.lemmatize()
        print('lowercase transform...')
        preproc.tokens_to_lowercase()
        print('cleaning...')
        preproc.clean_tokens()

        proc_time = time.time() - start_time
        print('-- processing took %f sec. so far' % proc_time)

        preproc.save_state('data/read_preproc_lda_de_state.pickle')
Example #9
import logging

import gensim
import pandas as pd

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

print('loading data...')
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))
doc_labels = [u'%s_%s' % info for info in zip(bt18.sitzung, bt18.sequence)]

print('preprocessing data...')
bt18corp = Corpus(dict(zip(doc_labels, bt18.text)))
preproc = TMPreproc(bt18corp, language='german')
preproc.tokenize().stem().clean_tokens()

doc_labels = list(preproc.tokens.keys())
texts = list(preproc.tokens.values())

print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
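The excerpt ends just before the evaluation itself; below is a hedged sketch of how it might continue with tmtoolkit's gensim backend. The tm_gensim module name, the (dictionary, corpus) data pair and the coherence_gensim_texts keyword are assumptions, not taken from the original snippet.

from tmtoolkit.topicmod import tm_gensim   # gensim evaluation backend (module name assumed)

# evaluate one model per parameter set
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus),
                                               varying_params, const_params,
                                               coherence_gensim_texts=texts)

# index the results by the varying parameter and plot them
eval_results_by_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(eval_results_by_topics)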
Example #10
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

random.seed(20200320)

#%%

corpus = Corpus.from_builtin_corpus('en-NewsArticles').sample(1000)

print('%d documents' % len(corpus))

#%%

add_timing('start')

preproc = TMPreproc(corpus, language='en', n_max_processes=cpu_count())
add_timing('load and tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc_copy = preproc.copy()
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('copy')
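The excerpt ends after the copy step; a hedged continuation would shut down the remaining workers and report the timings, assuming the same _benchmarktools helpers imported in example #15 below.

# assumed continuation: release the worker processes and report the recorded timings
preproc.shutdown_workers()
print_timings()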
Example #11
"""
An example for preprocessing documents in English and generating a document-term matrix (DTM).
"""
from pprint import pprint
from tmtoolkit.preprocess import TMPreproc

import pandas as pd


if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    corpus = {
        'doc1': u'A simple example in simple English.',
        'doc2': u'It contains only three very simple documents.',
        'doc3': u'Simply written documents are very brief.',
    }

    preproc = TMPreproc(corpus, language='english')

    print('input corpus:')
    pprint(corpus)

    print('running preprocessing pipeline...')
    preproc.tokenize().pos_tag().lemmatize().tokens_to_lowercase().clean_tokens()

    print('final tokens:')
    pprint(preproc.tokens)

    print('DTM:')
    doc_labels, vocab, dtm = preproc.get_dtm()

    # using pandas just for a nice tabular output
    print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))
Example #12
#%% Correct contractions

# some contractions have a stray space in between, like "EU -Hilfen" where it should be "EU-Hilfen"
# correct this by applying a custom function with a regular expression (RE) to each document in the corpus
pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)')

print('correcting wrong contractions')
# in each document text `t`, remove the RE group 2 (the stray white space "(\s+)") for each match `m`
corpus.apply(lambda t: pttrn_contraction_ws.sub(lambda m: m.group(1) + m.group(3), t))

#%% Create a TMPreproc object for token processing

# this takes some time because the documents are directly tokenized
print('creating TMPreproc object from corpus')
preproc = TMPreproc(corpus, language='german')
print('created: %s' % preproc)

# we don't need this anymore, remove it to free memory
del corpus

#%% Calculate the total number of tokens in the whole corpus

print('total number of tokens in the whole corpus:')
print(sum(preproc.doc_lengths.values()))

#%% Have a glimpse at the tokens

# Note that "preproc.tokens_datatable" (*table* instead of *frame*) is much faster, but the "datatable" package is
# still in an early stage of development. If you would rather have a pandas dataframe, use the property
# "tokens_dataframe".
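# The excerpt stops before the glimpse itself; a hedged sketch of that cell,
# using the tokens_dataframe property mentioned in the comment above:
print(preproc.tokens_dataframe.head(10))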
Example #13
def tmpreproc_de():
    return TMPreproc(corpus_de.docs, language='german')
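A hedged sketch of a test using this helper; it assumes the function is registered as a pytest fixture in the original test module and that corpus_de is in scope there.

# hypothetical test, assuming tmpreproc_de is a pytest fixture in the original module
def test_tokenize_de(tmpreproc_de):
    tokens = tmpreproc_de.tokenize().tokens
    assert set(tokens.keys()) == set(corpus_de.docs.keys())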
Example #14
def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')
Example #15
import logging

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

corpus = Corpus.from_builtin_corpus('en-NewsArticles')

print('%d documents' % len(corpus))

#%%

add_timing('start')

preproc = TMPreproc(corpus, language='en', n_max_processes=4)
add_timing('load and tokenize')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

print_timings()
Example #16
        u'einzig', u'beim', u'darin', u'innerhalb', u'daraus', u'dadurch',
        u'allerdings'
    ]

print('preparing corpus...')
corpus = {}
for speech_id, speech in speeches_df.iterrows():
    doc_label = '%d_sess%d_top%d_spk_%s_seq%d' % (
        speech_id, speech.sitzung, speech.top_id, speech.speaker_fp,
        speech.sequence)
    corpus[doc_label] = speech.text

assert len(corpus) == len(speeches_df)

print('starting preprocessing...')
preproc = TMPreproc(corpus, language='german')
preproc.add_stopwords(CUSTOM_STOPWORDS)
preproc.add_special_chars(CUSTOM_SPECIALCHARS)

print('tokenizing...')
preproc.tokenize()

vocab = preproc.vocabulary
pttrn_token_w_specialchar = re.compile(
    u'[^A-Za-z0-9ÄÖÜäöüß' + re.escape(string.punctuation) + u']', re.UNICODE)
pttrn_token_w_specialchar_inv = re.compile(
    u'[A-Za-z0-9ÄÖÜäöüß' + re.escape(string.punctuation) + u']', re.UNICODE)
tokens_w_specialchars = [
    t for t in vocab if pttrn_token_w_specialchar.search(t)
]
uncommon_special_chars = set(