def load_documents(self, path):
    # Category-per-directory layout: each subdirectory name is a category.
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Lazily yield one (category, token list) pair per file.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    # Materialise one (token list, category) pair per file, then shuffle
    # so later train/test splits are not ordered by category.
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
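Both variants produce per-file (tokens, category) records; a minimal usage sketch (assumed, not from the source; `loader` and `corpus_path` are hypothetical names) splitting the shuffled list from the second variant into train and test sets:

documents = loader.load_documents(corpus_path)   # hypothetical instance and path
cutoff = int(len(documents) * 0.8)               # 80/20 split after shuffling
train_docs, test_docs = documents[:cutoff], documents[cutoff:]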
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN, cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
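The reader derives each file's category by matching CAT_PATTERN against its fileid, so the first path component becomes the label. A small sketch of that lookup (the fileid 'News/report-01.TXT' is a made-up example):

import re

match = re.match(CAT_PATTERN, 'News/report-01.TXT')
print(match.group(1))  # 'News': the captured group is the category label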
if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(
    root=args.directory,
    fileids=r'.*/.*\.txt',
    cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Hold out 1/6 of each class for testing; integer division keeps the slice indices valid.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
print('training on %d instances, testing on %d instances' % (
    len(train_feats), len(test_feats)))

classifier = NaiveBayesClassifier.train(train_feats)
print('accuracy:', nltk.classify.util.accuracy(classifier, test_feats))
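As a follow-up (not in the source), NLTK's Naive Bayes classifier can report which features carry the most weight; show_most_informative_features is standard NLTK API:

# Print the 10 features whose dem/rep likelihood ratios are most extreme.
classifier.show_most_informative_features(10)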
# Document classification

# Load libraries
import os
import random

import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into a categorized corpus.
# Files are named like "123_pos.txt"; the suffix after the underscore is the category.
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# Build one (token list, category) pair per fileid, then shuffle them.
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)
print(type(review_corpus))
print(len(documents))

# Compute word frequency over the whole corpus.
all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
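A typical continuation (assumed here; it mirrors the NLTK book's movie-review example rather than anything shown above) keeps the most frequent words as the feature vocabulary and marks their presence per document:

# Use the 2,000 most common words as binary presence features.
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document):
    document_words = set(document)
    return {'contains(%s)' % w: (w in document_words) for w in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]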
# fileids_ = corpus_dir + '/rt-polarity*'
corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'
cat_map_ = {'rt-polarity.pos': ['pos'], 'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)
encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'
categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])
neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])
# NOTE: para views are not working; to be looked into later.

# Classification: NaiveBayesClassifier.train expects (featureset, label)
# pairs, not bare word lists, so label each word before training.
train = ([({'word': w}, 'pos') for w in pos_words] +
         [({'word': w}, 'neg') for w in neg_words])
classifier = NaiveBayesClassifier.train(train)
t = time() - t
print(str(t) + 's')

# Test generation of a conditional frequency distribution (CFD) of n-grams.
print('Creating CFD...', end='')
sys.stdout.flush()
t = time()
cat = cr.categories()[0]
n = 3
cfd = ConditionalFreqDist()
# Pad the token stream so the first tokens still get (n-1)-word contexts.
prefix = ('',) * (n - 1)
for ngram in ngrams(chain(prefix, cr.words(categories=[cat])), n):  # nltk.util.ngrams; the old ingrams helper was removed
    context = tuple(ngram[:-1])
    token = ngram[-1]
    cfd[context][token] += 1  # FreqDist.inc() was removed; increment directly
t = time() - t
print(str(t) + 's')

t = time()
print('Pickling CFD...', end='')
sys.stdout.flush()
with open('cfd.p', 'wb') as f:  # pickle needs a binary-mode file
    pickle.dump(cfd, f, protocol=1)
t = time() - t
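Once populated, the CFD maps each (n-1)-word context to a FreqDist over the tokens that follow it; a small query sketch (the context words are made up and may be absent from a given corpus):

context = ('of', 'the')
if context in cfd:
    print(cfd[context].max())          # most frequent continuation
    print(cfd[context].freq('world'))  # relative frequency of one continuation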
    return 1 in [c in str for c in set]

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def clean(doc):
    doc_lowercase = [w.lower() for w in doc]
    # Lemmatize word by word: WordNetLemmatizer.lemmatize takes a single
    # string, not a list.
    return [lemma.lemmatize(w) for w in doc_lowercase
            if not is_number(w) and len(w) > 1
            and contains_any(w, wordchars)
            and not contains_any(w, exclude)
            and w not in stop]

# Clean and tokenize every file in the corpus, keyed by fileid.
doc_dict = {fid: clean(corpus.words(fid))
            for cat in corpus.categories()
            for fid in corpus.fileids(cat)}  # XXX
docs = list(doc_dict.values())

dictionary = gensim.corpora.Dictionary(docs)
doc_ids = list(doc_dict.keys())
doc_term_matrix = [dictionary.doc2bow(doc) for doc in docs]
bow_array = np.array(doc_term_matrix, dtype=object)  # ragged rows: one bag-of-words per document

def find_best_lda_model(texts, bow, id2word, min_n=min_topics, max_n=max_topics):
    best_model = None
    max_coherence = -1
    for n in range(min_n, max_n + 1):
        ctm = CtmModel(bow, id2word=id2word, num_topics=n)
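The loop body is truncated; one common way to score each candidate model (an assumption, not shown in the source) is gensim's CoherenceModel, where texts are the tokenized documents and id2word is the Dictionary built above:

from gensim.models import CoherenceModel

# c_v coherence for the candidate model; higher is better.
coherence = CoherenceModel(model=ctm, texts=texts,
                           dictionary=id2word, coherence='c_v').get_coherence()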
cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern=r"(\w*)")

# Get categories
print("%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories())))

for c in [cr.categories()[0]]:
    print(c + "...")
    sys.stdout.flush()
    ngram_counts = {}
    for i in range(n, 0, -1):
        print(str(i) + "-grams...")
        ngram_counts[i] = {}
        # Pad the stream so the first tokens still form full i-grams, then
        # count i-grams (not n-grams) for this order.
        prefix = ("",) * (i - 1)
        for ngram in ngrams(chain(prefix, cr.words(categories=[c])), i):
            if ngram not in ngram_counts[i]:
                ngram_counts[i][ngram] = 0
            ngram_counts[i][ngram] += 1
    print(ngram_counts)
    sys.exit()

total_words = len(cr.words())
cat_prob_dict = {}
ngram_counts = {}
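With the counts in place, a maximum-likelihood probability for an i-gram is its count over the total for that order; a sketch of the likely intent behind cat_prob_dict (not code from the source):

def ngram_prob(ngram_counts, i, ngram):
    # MLE: count of this i-gram divided by all i-gram tokens of the same order.
    total = sum(ngram_counts[i].values())
    return ngram_counts[i].get(ngram, 0) / total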
import re

from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()

stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []

root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]

def remove_accents(text):
    # Map accented vowels to their plain ASCII equivalents.
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
    text = re.sub("[ùûüÜÛÙ]", "u", text)
    # Strip punctuation, digits, and other symbols.
    text = re.sub("[«»!/:;,\?•€%—\"\\^@\*\d\-\+\]\<>)\(\[]", " ", text)
    text = re.sub("[œŒ]", "oe", text)
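A short usage sketch (assumed; it presumes remove_accents ends by returning the normalised string): clean one category's text and stem it with the FrenchStemmer created above.

raw = ' '.join(reader.words(categories='cine'))
cleaned = remove_accents(raw)                      # assumes a final "return text"
stems = [stemmer.stem(w) for w in cleaned.split()]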