def _multi_iter_tokenize(sources):
    """Tokenize several source files as one continuous token stream."""
    for source in sources:
        with open(source, 'r', encoding='UTF-8') as f:
            # A file object already iterates over its lines.
            for t in tokenizer.iter_tokenize(f):
                yield t
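# A minimal usage sketch (hypothetical file names), streaming tokens
# from several documents in sequence:
#
#     for token in _multi_iter_tokenize(['doc1.txt', 'doc2.txt']):
#         print(token)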
def tf(source): """Term frequency distribution""" fh = open(source, 'r', encoding='UTF-8') lines = (line for line in fh) itokens = tokenizer.iter_tokenize(lines) itokens = (token.lower() for token in itokens if token[0].isalpha()) distribution = corpus.tf_distribution(itokens).items() distribution.sort(key=lambda item: -item[1]) for token, val in distribution: print(token, '%.4f' % val)
def tf(source): """Term frequency distribution""" fh = open(source, 'r') lines = (line.decode('utf-8') for line in fh) itokens = tokenizer.iter_tokenize(lines) itokens = (token.lower() for token in itokens if token[0].isalpha()) distribution = corpus.tf_distribution(itokens).items() distribution.sort(key = lambda item: -item[1]) for token, val in distribution: print token.encode('utf-8'), '%.4f' % val
def tag(source, tagger_name): """Tag a document using a pre-built tagger""" tagger_ = _load_tagger(tagger_name) fh = open(source, 'r', encoding='UTF-8') lines = (line for line in fh) itokens = tokenizer.iter_tokenize(lines) for token, tag in tagger.smart_tag(itokens, tagger_): tmp = token if tag is not None: tmp = tmp + ' {{%s}}' % tag print(tmp) fh.close()
def tag(source, tagger_name): """Tag a document using a pre-built tagger""" tagger_ = _load_tagger(tagger_name) fh = open(source, 'r') lines = (line.decode('utf-8') for line in fh) itokens = tokenizer.iter_tokenize(lines) for token, tag in tagger.smart_tag(itokens, tagger_): tmp = token.encode('utf-8') if tag is not None: tmp = tmp + ' {{%s}}' % tag print tmp, fh.close()
def concordance(source, word, window=4): """Concordance, finds word in a document along with context""" try: fh = open(source, 'r', encoding='UTF-8') except Exception: print(f'File not found: {source}') return window = int(window) lines = (line for line in fh) itokens = tokenizer.iter_tokenize(lines) for window in text.concordance(word, itokens, window): print(' '.join(window)) fh.close()
def concordance(source, word, window = 4): """Concordance, finds word in a document along with context""" try: fh = open(source, 'r') except Exception: print 'File not found:', source return word = word.decode('utf-8') window = int(window) lines = (line.decode('utf-8') for line in fh) itokens = tokenizer.iter_tokenize(lines) for window in text.concordance(word, itokens, window): print ' '.join(window).encode('utf-8') fh.close()
def concordance(source, word, window=4): """Concordance, finds word in a document along with context""" try: fh = open(source, 'r') except Exception: print 'File not found:', source return word = word.decode('utf-8') window = int(window) lines = (line.decode('utf-8') for line in fh) itokens = tokenizer.iter_tokenize(lines) for window in text.concordance(word, itokens, window): print ' '.join(window).encode('utf-8') fh.close()
# -*- coding: utf-8 -*- """ Created on Mon Jan 22 01:44:50 2018 @author: Stefan Aleksik """ from nlmk import tokenizer, stopwords #, corpus #from nlmk import ngramgen as ngramgenmod stopwords = stopwords() f = open('all_topics.txt', 'r') linii = (line.decode('utf-8') for line in f) tokens = tokenizer.iter_tokenize(linii) zborovi = list(token.lower() for token in tokens if token[0].isalpha()) for zbor in zborovi: if zbor not in stopwords and len(zbor) > 2: print zbor.encode('utf-8')