Example #1
import matplotlib.pyplot as plt

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
# assumption: evaluate_topic_models comes from the LDA backend, matching the metrics used below
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # load the raw documents into a corpus
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # preprocess the corpus and obtain its document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # candidate numbers of topics to evaluate
    var_params = [{'n_topics': k} for k in range(5, int(n_eval*10), n_eval)]

    # parameters kept constant across all candidate models
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }
    eval_results = evaluate_topic_models(dtm_bg,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         metric=['loglikelihood', 'cao_juan_2009', 'arun_2010']#,
                                         #return_models=True
                                         )

    # order the evaluation results by the varying parameter, the number of topics
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    # plot the evaluation metrics and save the figure
    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6), metric_direction_font_size='x-small',
                      title_fontsize='small', axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
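
# example call (hypothetical arguments, not from the original source); with these
# values the function would save a plot to out/evaluate_model_20200713_1000iter_5eval_full_de.png
evaluate_model('data/speeches.txt', '20200713', n_iter=1000, scope='full', lang='de')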
Example #2
def test_corpus_replace_characters_simple():
    c = Corpus({'doc1': 'ABC', 'doc2': 'abcDeF'})
    c.replace_characters({'a': None, 'C': 'c', 'e': ord('X')})

    assert c.docs == {
        'doc1': 'ABc',
        'doc2': 'bcDXF',
    }

    c.replace_characters({ord('A'): None})

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bcDXF',
    }

    c.replace_characters(str.maketrans('DXFY', '1234'))

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bc123',
    }

    c.replace_characters({})

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bc123',
    }
Example #3
def test_corpus_unique_characters(texts):
    all_chars = set(''.join(texts))

    c = Corpus({str(i): t for i, t in enumerate(texts)})
    res_chars = c.unique_characters
    assert isinstance(res_chars, set)
    assert res_chars == all_chars
Example #4
def test_corpus_from_pickle():
    c1 = Corpus({'a': '1', 'b': '22', 'c': '333'})

    with tempfile.TemporaryFile(suffix='.pickle') as f:
        c1.to_pickle(f)
        f.seek(0)
        c2 = Corpus.from_pickle(f)

    assert c1.docs == c2.docs
Example #5
def test_corpus_pass_tmpreproc():
    c = Corpus()
    c['doc1'] = 'A simple example in simple English.'
    c['doc2'] = 'It contains only three very simple documents.'
    c['doc3'] = 'Simply written documents are very brief.'

    preproc = TMPreproc(c)
    tok = preproc.tokenize().tokens
    assert set(tok.keys()) == set(c.keys())
    assert len(tok['doc1']) == 7
Example #6
def test_corpus_apply(texts):
    c = Corpus({str(i): t for i, t in enumerate(texts)})
    c_orig = c.copy()
    orig_doc_labels = c.doc_labels
    orig_doc_lengths = c.doc_lengths

    assert isinstance(c.apply(str.upper), Corpus)

    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == orig_doc_lengths

    for dl, dt in c.items():
        assert c_orig[dl].upper() == dt
Example #7
def test_corpus_copy(texts):
    c1 = Corpus({str(i): t for i, t in enumerate(texts)})
    c2 = c1.copy()

    assert c1.docs is not c2.docs
    assert c1.docs == c2.docs

    assert c1.doc_paths is not c2.doc_paths
    assert c1.doc_paths == c2.doc_paths

    assert c1.doc_labels == c2.doc_labels
    assert c1.doc_lengths == c2.doc_lengths
    assert c1.unique_characters == c2.unique_characters
Example #8
def test_corpus_add_doc():
    c = Corpus()
    with pytest.raises(ValueError):
        c.add_doc('', 'x')
    with pytest.raises(ValueError):
        c.add_doc(123, 'x')
    with pytest.raises(ValueError):
        c.add_doc('d1', None)

    c.add_doc('d1', 'd1 text')
    with pytest.raises(ValueError):
        c.add_doc('d1', 'd1 text')

    c.add_doc('d2', '')

    assert set(c.keys()) == {'d1', 'd2'}
Example #9
def test_corpus_from_files():
    doc_path = 'examples/data/gutenberg/kafka_verwandlung.txt'
    c1 = Corpus.from_files([doc_path])
    c2 = Corpus().add_files([doc_path])

    assert len(c1.docs) == len(c1.doc_paths) == 1
    assert len(c2.docs) == len(c2.doc_paths) == 1
    assert (c1.docs.keys() == c2.docs.keys()
            == c1.doc_paths.keys() == c2.doc_paths.keys())

    only_doc_label = next(iter(c1.docs.keys()))
    assert only_doc_label.endswith('kafka_verwandlung')

    only_doc = c1.docs[only_doc_label]
    assert len(only_doc) > 0

    assert c1.doc_paths[only_doc_label] == doc_path
Example #10
def test_corpus_dict_methods():
    c = Corpus()
    assert len(c) == 0
    with pytest.raises(KeyError):
        x = c['x']

    with pytest.raises(KeyError):
        c[1] = 'abc'

    with pytest.raises(KeyError):
        c[''] = 'abc'

    with pytest.raises(ValueError):
        c['d1'] = None

    c['d1'] = 'd1 text'
    assert len(c) == 1
    assert 'd1' in c
    assert set(c.keys()) == {'d1'}
    assert c['d1'] == 'd1 text'

    c['d2'] = 'd2 text'
    assert len(c) == 2
    for dl in c:
        assert dl in {'d1', 'd2'}
    assert set(c.keys()) == {'d1', 'd2'}

    for dl, dt in c.items():
        assert dl in {'d1', 'd2'}
        assert c[dl] == dt

    with pytest.raises(KeyError):
        del c['d3']

    del c['d1']
    assert len(c) == 1
    assert set(c.keys()) == {'d2'}

    del c['d2']
    assert len(c) == 0
    assert set(c.keys()) == set()
Example #11
def test_corpus_filter_characters(texts):
    c = Corpus({str(i): t for i, t in enumerate(texts)})
    c_orig = c.copy()

    orig_doc_labels = c.doc_labels
    orig_doc_lengths = c.doc_lengths
    orig_uniq_chars = c.unique_characters

    assert isinstance(c.filter_characters(orig_uniq_chars), Corpus)
    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == orig_doc_lengths
    assert c.unique_characters == orig_uniq_chars

    not_in_corpus_chars = set(string.printable) - orig_uniq_chars
    if len(not_in_corpus_chars) > 0:
        c.filter_characters(not_in_corpus_chars)
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths == {dl: 0 for dl in c.doc_labels}
        assert c.unique_characters == set()

    c = c_orig.copy()
    c.filter_characters(set())
    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == {dl: 0 for dl in c.doc_labels}
    assert c.unique_characters == set()

    if len(orig_uniq_chars) > 3:
        c = c_orig.copy()
        only_chars = set(sample(list(orig_uniq_chars), 3))
        c.filter_characters(only_chars)
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths != orig_doc_lengths
        assert c.unique_characters == only_chars

        c = c_orig.copy()
        only_chars = set(sample(list(orig_uniq_chars), 3))
        c.filter_characters(''.join(only_chars))  # as char sequence
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths != orig_doc_lengths
        assert c.unique_characters == only_chars
Example #12
# shared test setup: build an English sample corpus from NLTK's Gutenberg texts
# and a German one from the example data folder

MAX_DOC_LEN = 5000
N_DOCS_EN = 7
N_DOCS_DE = 3  # given from corpus size

all_docs_en = {
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids()
}
# truncate each document to at most MAX_DOC_LEN characters
smaller_docs_en = [(dl, txt[:MAX_DOC_LEN]) for dl, txt in all_docs_en.items()]

corpus_en = Corpus(
    dict(
        sample([(dl, txt) for dl, txt in smaller_docs_en
                if dl != u'melville-moby_dick.txt'], N_DOCS_EN - 2)))
corpus_en.docs['empty_doc'] = ''  # additionally test empty document
# make sure we always have Moby Dick in the corpus
corpus_en.docs[u'melville-moby_dick.txt'] = dict(smaller_docs_en)[u'melville-moby_dick.txt']
#corpus_en = Corpus(dict(smaller_docs_en))
corpus_de = Corpus.from_folder('examples/data/gutenberg',
                               read_size=MAX_DOC_LEN)


@pytest.fixture
def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')


@pytest.fixture
def tmpreproc_de():
    # assumed counterpart fixture for the German corpus; the excerpt is truncated here
    return TMPreproc(corpus_de.docs, language='german')
Example #13
def test_empty_corpora():
    c1 = Corpus()
    c2 = Corpus.from_files([])
    c3 = Corpus.from_files([]).add_files([])
    assert c1.docs == c2.docs == c3.docs == {}
Example #14
import logging

import pandas as pd
import gensim

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.utils import pickle_data
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

print('loading data...')
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))
doc_labels = [u'%s_%s' % info for info in zip(bt18.sitzung, bt18.sequence)]

print('preprocessing data...')
bt18corp = Corpus(dict(zip(doc_labels, bt18.text)))
preproc = TMPreproc(bt18corp, language='german')
preproc.tokenize().stem().clean_tokens()

doc_labels = list(preproc.tokens.keys())
texts = list(preproc.tokens.values())

print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]
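
# a sketch of the evaluation step this excerpt leads up to (assumed API: the gensim
# backend tmtoolkit.topicmod.tm_gensim; argument form and available metrics may
# differ between tmtoolkit versions)
from tmtoolkit.topicmod import tm_gensim

print('evaluating %d topic models' % len(varying_params))
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus),
                                               varying_params, const_params)
eval_results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(eval_results_by_n_topics)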
Example #15
logging.basicConfig(level=logging.DEBUG)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.DEBUG)
tmtoolkit_log.propagate = True

if __name__ == '__main__':  # this is necessary for multiprocessing on Windows!
    if os.path.exists(DTM_PICKLE):
        print("loading DTM data from pickle file '%s'..." % DTM_PICKLE)

        dtm, vocab, doc_labels = load_dtm_from_pickle(DTM_PICKLE)
    else:
        europarl = nltk.corpus.util.LazyCorpusLoader(
            'europarl_raw', nltk.corpus.EuroparlCorpusReader, fileids=FILEIDS)

        corpus = Corpus(
            {f: europarl.raw(f_id)
             for f, f_id in zip(FILES, FILEIDS)})

        print("all loaded documents:")
        for dl, text in corpus.docs.items():
            print("%s: %d chars" % (dl, len(text)))
        print("-----")

        start_time = time.time()
        preproc = TMPreproc(corpus.docs, language=u'german')
        print('tokenizing...')
        preproc.tokenize()
        print('POS tagging...')
        preproc.pos_tag()
        print('lemmatization...')
        preproc.lemmatize()
Example #16
del bt18pickle, bt18zip

#%% Generate document labels

# format of the document labels: <session_number>_<speech_number>
bt18_data['doc_label'] = ['%s_%s' % (str(sitzung).zfill(3), str(seq).zfill(5))
                          for sitzung, seq in zip(bt18_data.sitzung, bt18_data.sequence)]

print('loaded data frame with %d rows:' % bt18_data.shape[0])
print(bt18_data.head())


#%% Generate a Corpus object to preprocess the raw, untokenized text data

# we use the column "doc_label" as document labels and "text" as raw text
corpus = Corpus(dict(zip(bt18_data.doc_label, bt18_data.text)))
print('created corpus')

print('document lengths in number of characters:')
pprint(corpus.doc_lengths)

# we don't need this anymore, remove it to free memory
del bt18_data

#%% Investigate the set of characters used in the whole corpus

# we can see that there are several "strange" characters and unprintable Unicode
# characters which may later cause trouble
print('set of characters used in the whole corpus:')
pprint(corpus.unique_characters)
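
#%% Remove problematic characters (a possible follow-up, not part of the original excerpt)

# drop the unprintable characters by mapping each of them to None, using the
# replace_characters() API shown in example #2 above; restricting the removal to
# non-whitespace unprintables is a hypothetical choice that keeps newlines intact
chars_to_remove = {c for c in corpus.unique_characters
                   if not (c.isprintable() or c.isspace())}
corpus.replace_characters({c: None for c in chars_to_remove})
pprint(corpus.unique_characters)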
Example #17
def test_corpus_doc_lengths():
    c = Corpus({'a': '1', 'b': '22', 'c': '333'})
    assert isinstance(c.doc_lengths, dict)
    assert c.doc_lengths == {'a': 1, 'b': 2, 'c': 3}
Example #18
def test_corpus_doc_labels():
    c = Corpus({'a': 'Doc A text', 'b': 'Doc B text', 'c': 'Doc C text'})
    assert isinstance(c.doc_labels, list)
    assert list(c.doc_labels) == c.get_doc_labels(sort=True) == list('abc')
Example #19
def test_corpus_n_docs():
    c = Corpus({'a': 'Doc A text', 'b': 'Doc B text', 'c': 'Doc C text'})
    assert c.n_docs == len(c) == 3
Example #20
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

use_paragraphs = len(sys.argv) > 1 and sys.argv[1] == 'paragraphs'

#%%

corpus = Corpus({
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids() if f_id != 'bible-kjv.txt'
})

if use_paragraphs:
    print('using paragraphs as documents')
    corpus.split_by_paragraphs()

print('%d documents' % len(corpus))

#%%

timings = []
timing_labels = []


def add_timing(label):
    # record the current timestamp under the given label
    # (assumed body, requiring `import time`; the excerpt is truncated at this point)
    timings.append(time.time())
    timing_labels.append(label)