def __init__(self, tokenizer: Optional[TokenizerI] = SimpleTokenizer(),
             detokenizer: Optional[TokenizerI] = TreebankWordDetokenizer(),
             stopwords: LazyCorpusLoader = stopw):
    self.tokenizer = tokenizer
    self.detokenizer = detokenizer
    self.stopwords = stopwords.words(tokenizer.language)
# output = word
# if len(word) > 0 and (not dic.check(word)):
#     sugestoes = dic.suggest(word)
#     if len(sugestoes) > 0:
#         output = sugestoes[0]
# return output

## Training start
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print("Preparing documents for training...")
sys.stdout.flush()
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]
print("Finished preparing the training documents.")
sys.stdout.flush()

## Pre-processing
corpus_words = [w.lower() for w in catho_treinamento.words()
                if w not in string.punctuation]
                #if w not in string.punctuation and
                #w not in stopwords]
#random.shuffle(documents)
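The `(words, category)` pairs built above still need to be mapped to feature dicts before they can train a classifier. A minimal self-contained sketch of that step, using a toy dataset and a hypothetical `bag_of_words` helper in place of the `catho_treinamento` corpus:

```python
import nltk

# Toy stand-in for the (words, category) pairs built above.
documents = [
    (['otimo', 'produto', 'recomendo'], 'positivo'),
    (['pessimo', 'atendimento', 'horrivel'], 'negativo'),
    (['entrega', 'dentro', 'prazo'], 'neutro'),
]

vocabulary = sorted({w for words, _ in documents for w in words})

def bag_of_words(words):
    # One boolean "contains(...)" feature per vocabulary word.
    words = set(words)
    return {f'contains({w})': (w in words) for w in vocabulary}

train_set = [(bag_of_words(words), cat) for words, cat in documents]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(bag_of_words(['produto', 'otimo'])))
```

With real data, `documents` would be `documents_treinamento` and the vocabulary would come from `corpus_words` rather than the training pairs themselves.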
    encoding='utf8',
)

# returns all non-stop words from corpus
def get_top_words():
    all_words = nltk.FreqDist(
        w.lower() for w in decisions.words()
        if not w.lower() in stopwords.words('english')
    )
    word_features = list(all_words)[:500]
    all_words.pprint(500)
    return word_features

print(decisions.categories())
documents = [(list(decisions.words(fileid)), category)
             for category in decisions.categories()
             for fileid in decisions.fileids(category)]
#random.shuffle(documents)
print(documents)

pos_features = [(features.tweet_to_words(d), c)
                for (d, c) in documents if c == 'pos']
neg_features = [(features.tweet_to_words(d), c)
                for (d, c) in documents if c == 'neg']
random.shuffle(pos_features)
random.shuffle(neg_features)
chosen_features_200 = pos_features[:100] + neg_features[:100]
random.shuffle(chosen_features_200)
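A balanced, shuffled set like `chosen_features_200` is usually split into train/test portions and scored with `nltk.classify.accuracy`. A hedged sketch of that evaluation step, with synthetic feature dicts standing in for `pos_features` / `neg_features`:

```python
import random
import nltk

random.seed(0)

# Synthetic labeled feature dicts: 'pos' items nearly always contain "good",
# 'neg' items nearly always contain "bad".
pos = [({'contains(good)': True, 'contains(bad)': i % 5 == 0}, 'pos')
       for i in range(100)]
neg = [({'contains(good)': i % 5 == 0, 'contains(bad)': True}, 'neg')
       for i in range(100)]

chosen = pos + neg
random.shuffle(chosen)

# Hold out a quarter of the balanced set for evaluation.
train_set, test_set = chosen[:150], chosen[150:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
acc = nltk.classify.accuracy(classifier, test_set)
print(round(acc, 2))
```

The split sizes (150/50) are illustrative; the original code stops before choosing them.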
import nltk
nltk.download()
from nltk.book import *
print(text1)

from nltk.text import Text
from nltk.corpus import LazyCorpusLoader, PlaintextCorpusReader

mytest = LazyCorpusLoader('mytest', PlaintextCorpusReader, r'(?!\.).*\.txt')
tresh = Text(mytest.words('tresh.txt'))
tresh.collocations()
tresh.concordance('esto')
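`nltk.text.Text` also works on an arbitrary token list, so the same exploration can be tried without loading a corpus file. A minimal sketch:

```python
from nltk.text import Text

# Any list of tokens can back a Text object.
tokens = ("the cat sat on the mat and the cat ate " * 20).split()
t = Text(tokens)
t.concordance('cat', lines=3)  # prints up to 3 concordance lines
```

Unlike `collocations()`, `concordance()` needs no downloaded NLTK data, which makes it handy for quick checks.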
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk import FreqDist, BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.classify.naivebayes import NaiveBayesClassifier
from logger import logger
from nltk.corpus import LazyCorpusLoader
from nltk.text import TextCollection
import nltk, random, operator, itertools

my_corpus = LazyCorpusLoader(
    'my_corpus', CategorizedPlaintextCorpusReader, '(data).*',
    cat_file='cats.txt')

stopwords = nltk.corpus.stopwords.words()
all_words = FreqDist(w.lower() for w in my_corpus.words()
                     if w not in stopwords and len(w) > 2)

#all_words_inf = {}
#textCollection = TextCollection(my_corpus)
#for word in all_words.keys()[:1000]:
#    score = 0
#    for fileid in my_corpus.fileids():
#        text = my_corpus.raw(fileid)
#        score += textCollection.tf_idf(word, text)
#    all_words_inf[word] = score
#all_words = sorted(all_words_inf.items(), key=operator.itemgetter(1), reverse=False)

word_features = [word for word in all_words.keys() if len(word) > 2][:2000]

def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)
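The `document_features` function above is cut off after ranking the trigrams. A self-contained sketch of the usual completion of this pattern (the boolean feature dict is a hypothetical ending, not the original's):

```python
from nltk import TrigramCollocationFinder, TrigramAssocMeasures

def trigram_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    # Rank the document's trigrams by the chosen association measure
    # and expose the top n as boolean features.
    finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = finder.nbest(score_fn, n)
    return {f'contains({t})': True for t in trigrams}

words = "the quick brown fox jumps over the lazy dog the quick brown fox".split()
feats = trigram_features(words, n=5)
print(sorted(feats))
```

`nbest` returns at most `n` trigrams, so short documents simply yield fewer features.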