Example #1
from typing import Optional
from nltk.corpus import LazyCorpusLoader, stopwords as stopw
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.treebank import TreebankWordDetokenizer
# SimpleTokenizer: assumed project-local TokenizerI with a `language` attribute.

def __init__(self,
             tokenizer: Optional[TokenizerI] = SimpleTokenizer(),
             detokenizer: Optional[TokenizerI] = TreebankWordDetokenizer(),
             stopwords: LazyCorpusLoader = stopw):
    self.tokenizer = tokenizer
    self.detokenizer = detokenizer
    # Resolve the stopword list for the tokenizer's language up front.
    self.stopwords = stopwords.words(tokenizer.language)
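For context, a hypothetical instantiation of the class that owns this constructor (the class name Preprocessor and the sample sentence are assumptions; TokenizerI.tokenize() and TreebankWordDetokenizer.detokenize() are real NLTK calls):

pre = Preprocessor()  # hypothetical name for the enclosing class
tokens = pre.tokenizer.tokenize("The quick brown fox jumps over the lazy dog")
content = [t for t in tokens if t.lower() not in pre.stopwords]
print(pre.detokenizer.detokenize(content))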
Example #2
#     output = word
#     if len(word) > 0 and (not dic.check(word)):
#         sugestoes = dic.suggest(word)
#         if len(sugestoes) > 0:
#             output = sugestoes[0]
#     return output
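The commented-out block above is a first-suggestion spell corrector. Its check/suggest calls match the pyenchant Dict API, so a minimal runnable version might look like this (the function name and the pt_BR dictionary are assumptions):

import enchant

dic = enchant.Dict("pt_BR")  # assumed locale; requires an installed dictionary

def corrige(word):
    # Replace an unknown word with its first spelling suggestion, if any.
    output = word
    if len(word) > 0 and not dic.check(word):
        sugestoes = dic.suggest(word)
        if len(sugestoes) > 0:
            output = sugestoes[0]
    return output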


## Start of training
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print "Preparando documentos para treinamento..."
sys.stdout.flush()
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]
print "fim da preparacao dos documentos de treinamento."
sys.stdout.flush()
## Preprocessing

corpus_words = [w.lower()
                for w in catho_treinamento.words()
                if w not in string.punctuation]
                #if w not in string.punctuation and
                #w not in stopwords]
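The commented-out condition above hints at a stopword filter; a sketch of that variant, assuming NLTK's Portuguese stopword list (the language is a guess from the corpus names):

import string
from nltk.corpus import stopwords

stop_pt = set(stopwords.words('portuguese'))  # assumed language
corpus_words = [w.lower()
                for w in catho_treinamento.words()
                if w not in string.punctuation
                and w.lower() not in stop_pt]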


Example #3

import random
import nltk
from nltk.corpus import LazyCorpusLoader, stopwords
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
import features  # project-local module providing tweet_to_words()

# Only the tail of this loader call (`encoding='utf8', )`) survived
# extraction; the leading arguments are an assumed reconstruction based on
# the 'pos'/'neg' categories used below.
decisions = LazyCorpusLoader(
    'decisions', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(pos|neg)/.*',
    encoding='utf8',
)

# Returns the 500 most frequent non-stopword types in the corpus.
def get_top_words():
    stop = set(stopwords.words('english'))  # build the set once, not per token
    all_words = nltk.FreqDist(
        w.lower() for w in decisions.words()
        if w.lower() not in stop
    )
    # most_common() keeps frequency order; plain list(all_words) would not.
    word_features = [w for w, _ in all_words.most_common(500)]
    all_words.pprint(500)
    return word_features
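Downstream, word_features is typically consumed with the NLTK book's contains(word) pattern; a sketch (the function name is assumed, not part of the original):

word_features = get_top_words()

def bag_of_words_features(document):
    # One boolean feature per top word: does the document contain it?
    words = set(document)
    return {'contains({})'.format(w): (w in words) for w in word_features}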


print(decisions.categories())
documents = [(list(decisions.words(fileid)), category)
             for category in decisions.categories()
             for fileid in decisions.fileids(category)]

#random.shuffle(documents)
print(documents)

# features.tweet_to_words() is a project-local helper (presumably mapping a
# token list to the feature dict a classifier expects).
pos_features = [(features.tweet_to_words(d), c) for (d, c) in documents if c == 'pos']
neg_features = [(features.tweet_to_words(d), c) for (d, c) in documents if c == 'neg']

random.shuffle(pos_features)
random.shuffle(neg_features)

chosen_features_200 = pos_features[:100] + neg_features[:100]
random.shuffle(chosen_features_200)
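A balanced sample like chosen_features_200 is normally split and handed to a classifier. A minimal sketch with NLTK's Naive Bayes (the 150/50 split is an assumption):

train_set, test_set = chosen_features_200[:150], chosen_features_200[150:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)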
Example #4

import nltk
nltk.download()  # opens the interactive downloader; the 'book' collection is required below
from nltk.book import *
print(text1)

from nltk.text import Text
from nltk.corpus import LazyCorpusLoader, PlaintextCorpusReader
mytest = LazyCorpusLoader('mytest', PlaintextCorpusReader, r'(?!\.).*\.txt')
tresh = Text(mytest.words('tresh.txt'))
tresh.collocations()
tresh.concordance('esto')
Example #5
File: nbc.py  Project: soldierkam/pynews

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk import FreqDist, BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.classify.naivebayes import NaiveBayesClassifier
from logger import logger
from nltk.corpus import LazyCorpusLoader
from nltk.text import TextCollection
import nltk, random, operator, itertools

my_corpus = LazyCorpusLoader(
    'my_corpus', CategorizedPlaintextCorpusReader, '(data).*',
    cat_file='cats.txt')
stopwords = set(nltk.corpus.stopwords.words())  # all languages; a set gives O(1) lookups

# Count lowercased types, skipping stopwords and very short tokens.
all_words = FreqDist(w.lower() for w in my_corpus.words() if w.lower() not in stopwords and len(w) > 2)
#all_words_inf = {}
#textCollection = TextCollection(my_corpus)
#for word in all_words.keys()[:1000]:
#    score = 0
#    for fileid in my_corpus.fileids():
#        text = my_corpus.raw(fileid)
#        score += textCollection.tf_idf(word, text)
#    all_words_inf[word] = score
#all_words = sorted(all_words_inf.items(), key=operator.itemgetter(1), reverse=False)
# The len(word) > 2 filter is already applied above; take the 2000 most
# frequent words in frequency order rather than arbitrary dict order.
word_features = [w for w, _ in all_words.most_common(2000)]


def document_features(words_in_document, score_fn=TrigramAssocMeasures.chi_sq, n=300):
    # Score trigram collocations within the document and keep the n best.
    trigram_finder = TrigramCollocationFinder.from_words(words_in_document)
    trigrams = trigram_finder.nbest(score_fn, n)
    # The original is cut off here; a typical continuation returns the
    # selected trigrams as boolean features:
    return dict((trigram, True) for trigram in trigrams)
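To show how this example's pieces fit together, a hypothetical end-to-end run using only the names defined above (the featureset construction and the 50/50 split are assumptions):

featuresets = [(document_features(my_corpus.words(fileid)), category)
               for category in my_corpus.categories()
               for fileid in my_corpus.fileids(category)]
random.shuffle(featuresets)
cutoff = len(featuresets) // 2
classifier = NaiveBayesClassifier.train(featuresets[:cutoff])
print(nltk.classify.accuracy(classifier, featuresets[cutoff:]))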