Example #1
File: run.py  Project: petrushev/nlmk
def _multi_iter_tokenize(sources):
    # Lazily tokenize several files in turn (Python 2: raw byte lines
    # are decoded from UTF-8 before being handed to the tokenizer).
    for source in sources:
        with open(source, 'r') as f:
            lines = (line.decode('utf-8') for line in f)
            itokens = tokenizer.iter_tokenize(lines)
            for t in itokens:
                yield t
Example #2
File: run.py  Project: MatejMecka/nlmk
def _multi_iter_tokenize(sources):
    # Same pipeline for Python 3: open() already decodes the file,
    # so the lines go to the tokenizer directly.
    for source in sources:
        with open(source, 'r', encoding='UTF-8') as f:
            lines = (line for line in f)
            itokens = tokenizer.iter_tokenize(lines)
            for t in itokens:
                yield t
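Both variants of _multi_iter_tokenize build the same lazy pipeline: a generator of unicode lines is handed to tokenizer.iter_tokenize and the resulting tokens are yielded file after file. A minimal usage sketch, assuming the Python 3 variant above; the file names are placeholders:

# Hypothetical usage sketch; 'doc1.txt' and 'doc2.txt' are placeholder names.
sources = ['doc1.txt', 'doc2.txt']
total = sum(1 for _token in _multi_iter_tokenize(sources))
print('tokens seen:', total)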
Example #3
File: run.py  Project: MatejMecka/nlmk
def tf(source):
    """Term frequency distribution"""
    fh = open(source, 'r', encoding='UTF-8')
    lines = (line for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    itokens = (token.lower() for token in itokens if token[0].isalpha())
    # dict.items() returns a view in Python 3 and has no .sort();
    # sort the (token, frequency) pairs explicitly instead.
    distribution = sorted(corpus.tf_distribution(itokens).items(),
                          key=lambda item: -item[1])
    for token, val in distribution:
        print(token, '%.4f' % val)
Example #4
File: run.py  Project: petrushev/nlmk
def tf(source):
    """Term frequency distribution"""
    fh = open(source, 'r')
    lines = (line.decode('utf-8') for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    itokens = (token.lower() for token in itokens if token[0].isalpha())
    distribution = corpus.tf_distribution(itokens).items()
    distribution.sort(key=lambda item: -item[1])
    for token, val in distribution:
        print token.encode('utf-8'), '%.4f' % val
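Both tf variants lowercase the alphabetic tokens, hand them to corpus.tf_distribution, and print the terms by descending frequency; only the sorting of the (token, frequency) pairs differs between the Python 2 and Python 3 versions. A minimal sketch of that sort step on a stand-in mapping; the values below are invented purely for illustration:

# Stand-in for the mapping returned by corpus.tf_distribution; values are made up.
distribution = {'skopje': 0.0123, 'grad': 0.0098, 'reka': 0.0041}
for token, val in sorted(distribution.items(), key=lambda item: -item[1]):
    print(token, '%.4f' % val)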
Example #5
File: run.py  Project: MatejMecka/nlmk
def tag(source, tagger_name):
    """Tag a document using a pre-built tagger"""
    tagger_ = _load_tagger(tagger_name)
    fh = open(source, 'r', encoding='UTF-8')
    lines = (line for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    for token, tag in tagger.smart_tag(itokens, tagger_):
        tmp = token
        if tag is not None:
            tmp = tmp + ' {{%s}}' % tag
        print(tmp)
    fh.close()
Example #6
File: run.py  Project: petrushev/nlmk
def tag(source, tagger_name):
    """Tag a document using a pre-built tagger"""
    tagger_ = _load_tagger(tagger_name)
    fh = open(source, 'r')
    lines = (line.decode('utf-8') for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    for token, tag in tagger.smart_tag(itokens, tagger_):
        tmp = token.encode('utf-8')
        if tag is not None:
            tmp = tmp + ' {{%s}}' % tag
        print tmp,
    fh.close()
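tagger.smart_tag yields (token, tag) pairs, and the tag can be None for tokens the tagger leaves unannotated, which is why both versions append the {{tag}} marker only conditionally. A hedged sketch that collects the pairs instead of printing them; tag_pairs is a made-up helper name, not part of nlmk:

# Hypothetical helper: gather (token, tag) pairs into a list (Python 3).
def tag_pairs(source, tagger_name):
    tagger_ = _load_tagger(tagger_name)   # loader used in the examples above
    with open(source, 'r', encoding='UTF-8') as fh:
        itokens = tokenizer.iter_tokenize(line for line in fh)
        return list(tagger.smart_tag(itokens, tagger_))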
Example #7
File: run.py  Project: MatejMecka/nlmk
def concordance(source, word, window=4):
    """Concordance, finds word in a document along with context"""
    try:
        fh = open(source, 'r', encoding='UTF-8')
    except Exception:
        print(f'File not found: {source}')
        return

    window = int(window)

    lines = (line for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    for window in text.concordance(word, itokens, window):
        print(' '.join(window))
    fh.close()
Example #8
File: run.py  Project: petrushev/nlmk
def concordance(source, word, window=4):
    """Concordance, finds word in a document along with context"""
    try:
        fh = open(source, 'r')
    except Exception:
        print 'File not found:', source
        return

    word = word.decode('utf-8')
    window = int(window)

    lines = (line.decode('utf-8') for line in fh)
    itokens = tokenizer.iter_tokenize(lines)
    for window in text.concordance(word, itokens, window):
        print ' '.join(window).encode('utf-8')
    fh.close()
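text.concordance walks the token stream and, for every occurrence of the target word, yields a list of surrounding tokens whose size is controlled by the window argument. A small sketch that gathers the contexts instead of printing them; concordance_windows is an illustrative name only (Python 3):

# Hypothetical helper: return the concordance contexts as joined strings.
def concordance_windows(source, word, window=4):
    with open(source, 'r', encoding='UTF-8') as fh:
        itokens = tokenizer.iter_tokenize(line for line in fh)
        return [' '.join(ctx) for ctx in text.concordance(word, itokens, int(window))]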
Example #9
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 22 01:44:50 2018

@author: Stefan Aleksik
"""
from nlmk import tokenizer, stopwords  #, corpus
#from nlmk import ngramgen as ngramgenmod
stopwords = stopwords()   # rebind the name to the loaded stopword collection
f = open('all_topics.txt', 'r')
linii = (line.decode('utf-8') for line in f)   # linii = lines, decoded from UTF-8 (Python 2)
tokens = tokenizer.iter_tokenize(linii)
zborovi = list(token.lower() for token in tokens if token[0].isalpha())   # zborovi = words

for zbor in zborovi:   # zbor = word
    if zbor not in stopwords and len(zbor) > 2:
        print zbor.encode('utf-8')
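The script keeps only lowercased alphabetic tokens that are not stopwords and are longer than two characters. A Python 3 rendering of the same filtering step, assuming nlmk.stopwords() returns a container that supports membership tests, as the snippet suggests:

# Hypothetical Python 3 sketch of the same stopword filtering.
from nlmk import tokenizer, stopwords

stops = stopwords()
with open('all_topics.txt', 'r', encoding='utf-8') as f:
    tokens = tokenizer.iter_tokenize(line for line in f)
    words = [token.lower() for token in tokens if token[0].isalpha()]

for word in words:
    if word not in stops and len(word) > 2:
        print(word)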