import random

from collections import Counter
from operator import itemgetter

from nltk.corpus import stopwords
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize

dialect = LazyCorpusLoader(
    'dialects1', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(egyptian|gulf|levantine|standardArabic)/.*',
    encoding="utf-8")

x = TextCollection(dialect)

# Split every file into consecutive 40-word chunks, each labelled with the
# dialect of the file it came from, then shuffle the labelled chunks.
sentences = [(list(dialect.words(fileid)[i:i + 40]), category)
             for category in dialect.categories()
             for fileid in dialect.fileids(category)
             for i in range(0, len(dialect.words(fileid)), 40)]
shuffled_sentences = random.sample(sentences, len(sentences))
print('sentences count', len(sentences))

text = dialect.words()
print('words count', len(text))

#################### Test with getting topN ####################
# all_words = nltk.FreqDist(w for w in dialect.words())
# Mcommon = all_words.most_common(4000)
# topN = [i[0] for i in Mcommon]
# print('finished topN')
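# The TextCollection above (x) is never used in this snippet; it was
# presumably built for tf-idf scoring over the corpus. A hedged usage sketch
# of nltk.text.TextCollection's tf_idf method follows; sample_fileid and
# sample_words are illustrative names, not part of the original.
sample_fileid = dialect.fileids()[0]
sample_words = list(dialect.words(sample_fileid))
print('tf-idf of first word in first file:',
      x.tf_idf(sample_words[0], sample_words))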
# Fragment from a corpus-training script: decide how fileids map to
# categories, then load the corpus and sanity-check its labels.
if args.cat_file:
    reader_kwargs['cat_file'] = args.cat_file

    if args.delimiter and args.delimiter != ' ':
        reader_kwargs['delimiter'] = args.delimiter

    if args.cat_pattern:
        reader_args.append(args.cat_pattern)
    else:
        reader_args.append('.+/.+')
elif args.cat_pattern:
    reader_args.append(args.cat_pattern)
    reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

categorized_corpus = LazyCorpusLoader(args.corpus, reader_class[args.reader],
                                      *reader_args, **reader_kwargs)
labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
    print('%d labels: %s' % (nlabels, labels))

if not nlabels:
    raise ValueError('corpus does not have any categories')
elif nlabels == 1:
    raise ValueError('corpus must have more than 1 category')
elif nlabels == 2 and args.multi:
    raise ValueError('corpus must have more than 2 categories if --multi is specified')

########################
## text normalization ##
########################
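# The fragment above assumes an argparse namespace plus reader bookkeeping
# created earlier in the script. The setup below is a minimal sketch; every
# name and default is an assumption inferred from the fragment, not the
# original script's actual argument parser.
import re
import argparse
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

parser = argparse.ArgumentParser(description='train a classifier on a categorized corpus')
parser.add_argument('corpus', help='corpus name/path recognized by LazyCorpusLoader')
parser.add_argument('--reader', default='plaintext', help='corpus reader to use')
parser.add_argument('--cat_file', help='filename of the category mapping, e.g. cats.txt')
parser.add_argument('--cat_pattern', help='regex mapping fileids to categories')
parser.add_argument('--delimiter', default=' ', help='delimiter used in cat_file')
parser.add_argument('--multi', action='store_true', help='allow multi-label classification')
parser.add_argument('--trace', type=int, default=1)
args = parser.parse_args()

reader_class = {'plaintext': CategorizedPlaintextCorpusReader}
reader_args = []
reader_kwargs = {}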
import random

import nltk
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro', CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=',')

print("computing FreqDist over all words")
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
# FreqDist.keys() is no longer frequency-sorted in NLTK 3, so take the
# 2000 most frequent words explicitly.
top_words = [w for w, _ in all_words.most_common(2000)]

print("generating list of documents for each category")
documents = [(list(interrogazioni.words(fileid)), category)
             for category in interrogazioni.categories()
             for fileid in interrogazioni.fileids(category)]
random.shuffle(documents)

print("building the classifier")
featuresets = [(document_features(d, top_words), c) for (d, c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("classifier accuracy:", nltk.classify.accuracy(classifier, test_set))
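# document_features is not defined in this snippet. A minimal sketch in the
# style of the NLTK book's bag-of-words feature extractor; the helper used by
# the original may differ.
def document_features(document, top_words):
    document_words = set(document)  # set membership is much faster than list
    features = {}
    for word in top_words:
        features['contains(%s)' % word] = (word in document_words)
    return features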
import os
import random

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC, NuSVC


def loadClassifier(outputdir):
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle")
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")

    # If a trained ensemble is already pickled, reuse it instead of retraining.
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
        # Earlier approach, left disabled in the original: load each pickled
        # classifier individually and rebuild the VoteClassifier from them.
        voted_classifier = pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
            'criticas_cine', CategorizedPlaintextCorpusReader,
            r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
            encoding='utf-8')
        # Alternative three-class corpus:
        # criticas_cine = LazyCorpusLoader(
        #     'criticas_cine_neu', CategorizedPlaintextCorpusReader,
        #     r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
        #     encoding='utf-8')

        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
        random.shuffle(documents)

        all_words = []
        for w in criticas_cine.words():
            all_words.append(w.lower())
        all_words = nltk.FreqDist(all_words)
        # print(all_words.most_common(50))

        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")

        featuresets = [(find_features(rev, word_features), category)
                       for (rev, category) in documents]
        training_set = featuresets[:2000]
        testing_set = featuresets[2000:]

        classifier = nltk.NaiveBayesClassifier.train(training_set)
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        print("Original Naive Bayes Algo accuracy percent:", NaiveBayesClassifierAccuracy * 100)
        accuracy = Accuracy(classifier, testing_set)
        print(accuracy)
        classifier.show_most_informative_features(15)

        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", MNB_classifierAccuracy * 100)

        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", BernoulliNB_classifierAccuracy * 100)

        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", LogisticRegression_classifierAccuracy * 100)

        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", SGDClassifier_classifierAccuracy * 100)

        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", LinearSVC_classifierAccuracy * 100)

        # SVC_classifier = SklearnClassifier(SVC())
        # SVC_classifier.train(training_set)
        # print("SVC_classifier accuracy percent:", nltk.classify.accuracy(SVC_classifier, testing_set) * 100)

        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", NuSVC_classifierAccuracy * 100)

        # Each voter is passed as a [classifier, accuracy] pair, presumably so
        # the ensemble can weight votes by held-out accuracy.
        voted_classifier = VoteClassifier(
            [classifier, NaiveBayesClassifierAccuracy],
            [NuSVC_classifier, NuSVC_classifierAccuracy],
            [LinearSVC_classifier, LinearSVC_classifierAccuracy],
            [SGDClassifier_classifier, SGDClassifier_classifierAccuracy],
            [MNB_classifier, MNB_classifierAccuracy],
            [BernoulliNB_classifier, BernoulliNB_classifierAccuracy],
            [LogisticRegression_classifier, LogisticRegression_classifierAccuracy])

        accuracy = Accuracy(voted_classifier, testing_set)
        print(accuracy)
        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", VoteClassifierAccuracy * 100)

        # Order of the per-class tuples: neu, neg, pos
        print("Accuracy: ", (accuracy["neg"][0] + accuracy["pos"][2]) / 2)
        print("Discarded: ", (accuracy["neu"][1] + accuracy["neg"][1] + accuracy["pos"][1]) / 2)
        print("Failed: ", (accuracy["neu"][0] + accuracy["neu"][2] + accuracy["neg"][2] + accuracy["pos"][0]) / 2)
        print("------------------------------------------")

        pickleDump(voted_classifier, "voted_classifier.pickle")
        return voted_classifier, word_features
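# loadClassifier relies on helpers defined elsewhere in the project:
# pickleLoad/pickleDump, find_features, plus the VoteClassifier and Accuracy
# utilities. A minimal sketch of the first three, assuming pickles live under
# pickled_algos/ (VoteClassifier and Accuracy are project-specific and not
# reconstructed here).
import os
import pickle


def pickleLoad(filename):
    # Unpickle an object stored in the pickled_algos directory.
    with open(os.path.join("pickled_algos", filename), "rb") as f:
        return pickle.load(f)


def pickleDump(obj, filename):
    # Pickle an object into the pickled_algos directory, creating it if needed.
    os.makedirs("pickled_algos", exist_ok=True)
    with open(os.path.join("pickled_algos", filename), "wb") as f:
        pickle.dump(obj, f)


def find_features(document, word_features):
    # Standard bag-of-words features: presence/absence of each top word.
    words = set(document)
    return {w: (w in words) for w in word_features}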
def train_test_split(min=25):
    '''
    Gets the train/test split for testing from the Reuters corpus.
    Keeps only the documents that have a category in both the test and train
    sets, with a user-definable minimum number of tokens per document.

    Parameters:
    ------------
    min: (default = 25)
        The minimum number of tokens a document must have to be kept.

    Returns:
    ------------
    train_set, train_target, test_set, test_target as lists
    '''
    # imports
    import re
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import CategorizedPlaintextCorpusReader

    # reading the corpus
    reuters = LazyCorpusLoader('reuters', CategorizedPlaintextCorpusReader,
                               '(training|test).*', cat_file='cats.txt',
                               encoding='ISO-8859-2')
    documents = reuters.fileids()

    # splitting into train and test sets
    train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

    # getting documents and their categories
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    train_cat = [reuters.categories(doc_id) for doc_id in train_docs_id]
    test_cat = [reuters.categories(doc_id) for doc_id in test_docs_id]

    # formatting the train set: tokenizing and gathering stats for processing
    train_token_docs = []
    train_token_docs_length = []
    train_token_docs_unique = []
    for i in train_docs:
        tempy_tokens = tokenize(i)
        train_token_docs.append(" ".join(tempy_tokens))
        train_token_docs_length.append(len(tempy_tokens))
        train_token_docs_unique.append(len(set(tempy_tokens)))

    # formatting the test set: tokenizing and gathering stats for processing
    test_token_docs = []
    test_token_docs_length = []
    test_token_docs_unique = []
    for i in test_docs:
        tempy_tokens = tokenize(i)
        test_token_docs.append(" ".join(tempy_tokens))
        test_token_docs_length.append(len(tempy_tokens))
        test_token_docs_unique.append(len(set(tempy_tokens)))

    # remove any documents that do not meet the minimum-token setting
    train_less_than_min = [n for n, i in enumerate(train_token_docs_length) if i < min]
    test_less_than_min = [n for n, i in enumerate(test_token_docs_length) if i < min]
    train_token_docs_more_than_min = [i for n, i in enumerate(train_token_docs)
                                      if n not in train_less_than_min]
    test_token_docs_more_than_min = [i for n, i in enumerate(test_token_docs)
                                     if n not in test_less_than_min]
    train_cat_more_than_min = [i for n, i in enumerate(train_cat)
                               if n not in train_less_than_min]
    test_cat_more_than_min = [i for n, i in enumerate(test_cat)
                              if n not in test_less_than_min]

    # keeping documents with a single category
    # (the corpus has some with multiple categories)
    cat_count_train = [len(i) for i in train_cat_more_than_min]
    cat_count_test = [len(i) for i in test_cat_more_than_min]
    single_cat_train = [n for n, i in enumerate(cat_count_train) if i == 1]
    single_cat_test = [n for n, i in enumerate(cat_count_test) if i == 1]
    train_single = [i for n, i in enumerate(train_token_docs_more_than_min)
                    if n in single_cat_train]
    test_single = [i for n, i in enumerate(test_token_docs_more_than_min)
                   if n in single_cat_test]
    train_single_cat = [i for n, i in enumerate(train_cat_more_than_min)
                        if n in single_cat_train]
    test_single_cat = [i for n, i in enumerate(test_cat_more_than_min)
                       if n in single_cat_test]

    # keeping only categories present in both the train and test sets
    train_cat_set = set([i[0] for i in train_single_cat])
    test_cat_set = set([i[0] for i in test_single_cat])
    mutual_cat = train_cat_set.intersection(test_cat_set)
    member_of_mutual_test = [n for n, i in enumerate(test_single_cat)
                             if i[0] in mutual_cat]
    member_of_mutual_train = [n for n, i in enumerate(train_single_cat)
                              if i[0] in mutual_cat]
    train_single2 = [i for n, i in enumerate(train_single)
                     if n in member_of_mutual_train]
    test_single2 = [i for n, i in enumerate(test_single)
                    if n in member_of_mutual_test]
    train_single_cat2 = [i for n, i in enumerate(train_single_cat)
                         if n in member_of_mutual_train]
    test_single_cat2 = [i for n, i in enumerate(test_single_cat)
                        if n in member_of_mutual_test]

    return train_single2, train_single_cat2, test_single2, test_single_cat2
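# tokenize is not defined in this snippet. Given the RegexpTokenizer import
# inside train_test_split, a plausible minimal version (an assumption, not
# the original helper):
def tokenize(text):
    from nltk.tokenize import RegexpTokenizer
    # lowercase, then keep alphanumeric word tokens only
    return RegexpTokenizer(r'\w+').tokenize(text.lower())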