Example #1
	def load_documents(self, path):
		# The corpus lives under path as <category>/<file>; the first path component is the category label.
		docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
		for cat in docs.categories():
			self.cat_gram_freq[cat] = {}
			self.cat_word_freq[cat] = {}
		# Lazily yield one (category, word list) pair per file.
		return ((category, list(docs.words(fileid)))
			for category in docs.categories()
			for fileid in docs.fileids(category))
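A minimal consumer sketch for the generator returned above, assuming a hypothetical DocumentLoader class that owns load_documents and a corpus/ directory laid out as category/file:

from collections import Counter

loader = DocumentLoader()          # hypothetical class owning load_documents
cat_word_freq = {}
for category, words in loader.load_documents('corpus/'):
    # Accumulate lower-cased word counts per category.
    cat_word_freq.setdefault(category, Counter()).update(w.lower() for w in words)
print({cat: freq.most_common(5) for cat, freq in cat_word_freq.items()})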
Example #2
	def load_documents(self, path):
		# Same <category>/<file> layout as Example #1.
		docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
		print(docs.categories())
		documents = [(list(docs.words(fileid)), category)
				for category in docs.categories()
				for fileid in docs.fileids(category)
		]
		# Shuffle so later train/test splits are not grouped by category.
		random.shuffle(documents)
		return documents
Example #3
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# File ids look like "<category dir>/<file name>.TXT"; the category is the top-level directory name.
DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH',
                                          DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
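A sketch of the on-disk layout those patterns assume; the directory and file names below are illustrative only:

# ENGLISH/
#   World News/
#     article 001.TXT
#     article 002.TXT
#   Sports/
#     match report 12.TXT
#
# With that layout:
#   corpus.categories()       -> ['Sports', 'World News']
#   corpus.fileids('Sports')  -> ['Sports/match report 12.TXT']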
Example #4
File: bill_bayes.py Project: PamelaM/billy
    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(
        root=args.directory,
        fileids=".*/.*\.txt",
        cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    # Integer division so the 5/6 train split can be used as a slice index.
    dem_cutoff = len(dem_feats) * 5 // 6
    rep_cutoff = len(rep_feats) * 5 // 6

    train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
    test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
    print('training on %d instances, testing on %d instances' % (
        len(train_feats), len(test_feats)))

    classifier = NaiveBayesClassifier.train(train_feats)
    print('accuracy:', nltk.classify.util.accuracy(classifier, test_feats))
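word_feats and bigram_feats come from elsewhere in the project; a plausible sketch of both featurizers, following the common NLTK bag-of-words pattern (the chi-squared scorer and the 200-bigram cutoff are assumptions):

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def word_feats(words):
    # Bag-of-words featureset: every token is a feature set to True.
    return dict((word, True) for word in words)

def bigram_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Unigram features plus the n highest-scoring bigram collocations.
    words = list(words)
    finder = BigramCollocationFinder.from_words(words)
    bigrams = finder.nbest(score_fn, n)
    return dict((feat, True) for feat in words + bigrams)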
Example #5
    if args.bigrams:
        featurizer = bigram_feats
    else:
        featurizer = word_feats

    corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                              fileids=".*/.*\.txt",
                                              cat_pattern=r'(dem|rep)/')

    best_words = most_informative_words(corpus)

    dem_ids = corpus.fileids(categories=['dem'])
    rep_ids = corpus.fileids(categories=['rep'])

    dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem')
                 for f in dem_ids]
    rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep')
                 for f in rep_ids]

    dem_cutoff = len(dem_feats) * 5 // 6
    rep_cutoff = len(rep_feats) * 5 // 6

    train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
    test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
    print('training on %d instances, testing on %d instances' % (
        len(train_feats), len(test_feats)))

    classifier = NaiveBayesClassifier.train(train_feats)
    print('accuracy:', nltk.classify.util.accuracy(classifier, test_feats))
Example #6
# Document Classification

# Load Libraries

import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into the categorized corpus

# Directory of the corpus
corpusdir = 'corpus/' 
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt', cat_pattern=r'\d+_(\w+)\.txt')

# Build a list of (word list, category) pairs, one per file id; category is pos or neg
documents = [(list(review_corpus.words(fileid)), category)
              for category in review_corpus.categories()
              for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)

print(type(review_corpus))

print(len(documents))

# Compute word frequency

import nltk
all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
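A hedged continuation, following the standard NLTK book pattern; the 2,000-word feature cutoff and the 100-document test split are assumptions, not part of the original:

# Use the 2,000 most frequent words as candidate features.
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document):
    document_words = set(document)
    return {'contains({})'.format(w): (w in document_words) for w in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))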
Example #7
import os

from nltk import NaiveBayesClassifier
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# fileids_ = corpus_dir + '/rt-polarity*'

corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'

cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)

encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])

# NOTE: the paras() views are not working; to be looked into later

# classification
# NaiveBayesClassifier.train expects (featureset, label) pairs, so build a
# bag-of-words featureset per sentence rather than training on a raw word list.
train = ([(dict((w, True) for w in sent), 'pos') for sent in pos_sents] +
         [(dict((w, True) for w in sent), 'neg') for sent in neg_sents])
classifier = NaiveBayesClassifier.train(train)
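A short usage sketch for the classifier trained above; the example sentence and the single-sentence featureset format are illustrative:

sentence = "a thoughtful and well crafted film".split()
print(classifier.classify(dict((w, True) for w in sentence)))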
Example #8
t = time() - t
print(str(t) + 's')

# Test generation of CFD
print('Creating CFD...', end='')
sys.stdout.flush()
t = time()

cat = cr.categories()[0]

n = 3

cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)

# ingrams comes from older NLTK; in NLTK 3 the equivalent is nltk.util.ngrams.
for ngram in ingrams(chain(prefix, cr.words(categories=[cat])), n):
    context = tuple(ngram[:-1])
    token = ngram[-1]
    cfd[context][token] += 1  # FreqDist.inc() was removed in NLTK 3

t = time() - t
print(str(t) + 's')


t = time()
print('Pickling CFD...', end='')
sys.stdout.flush()

pickle.dump(cfd, open('cfd.p', 'wb'), protocol=1)  # pickles need a binary file handle

t = time() - t
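A hedged sketch of loading the pickled CFD back and greedily generating text from it; the file name matches the dump above, while the generation loop itself is an assumption:

import pickle

with open('cfd.p', 'rb') as f:
    cfd = pickle.load(f)

context = ('', '')              # the initial padding used when the trigram CFD was built
generated = []
for _ in range(20):
    if context not in cfd:
        break
    token = cfd[context].max()  # most frequent continuation for this context
    generated.append(token)
    context = context[1:] + (token,)
print(' '.join(generated))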
Example #9
        return any(c in str for c in set)

    def is_number(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    doc_lowercase = [w.lower() for w in doc]
    # lemmatize() takes a single token, so lemmatize word by word.
    return [lemma.lemmatize(w) for w in doc_lowercase
            if not is_number(w) and len(w) > 1 and contains_any(w, wordchars)
            and not contains_any(w, exclude) and w not in stop]


doc_dict = {fid: clean(corpus.words(fid)) for cat in corpus.categories() for fid in corpus.fileids(cat)}  # XXX

docs = doc_dict.values()
dictionary = gensim.corpora.Dictionary(docs)

doc_ids = [k for k in doc_dict.keys()]
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]
bow_array = np.array(doc_term_matrix)


def find_best_lda_model(texts, bow, id2word, min_n=min_topics, max_n=max_topics):
    best_model = None
    max_coherence = -1
    for n in range(min_n, max_n + 1):
        ctm = CtmModel(
            bow, id2word=id2word, num_topics=n)
Example #10
cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern=r"(\w*)")

# Get categories
print "%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories()))

for c in [cr.categories()[0]]:
    print c + "..."
    sys.stdout.flush()

    ngrams = {}
    for i in range(n, 0, -1):
        print str(i) + "-grams..."
        ngrams[i] = {}
        prefix = ("",) * (i - 1)
        for ngram in ingrams(chain(prefix, cr.words(categories=[c])), n):
            if not ngram in ngrams[i]:
                ngrams[i][ngram] = 0

            ngrams[i][ngram] += 1

    print(ngrams)


sys.exit()


total_words = len(cr.words())
cat_prob_dict = {}
ngrams = {}
Example #11
import re

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []
root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]


def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
    text = re.sub("[ùûüÜÛÙ]", "u", text)
    text = re.sub("[«»!/:;,\?•€%—\"\\^@\*\d\-\+\]\<>)\(\[]", " ", text)
    text = re.sub("[œŒ]", "oe", text)