import random
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.text import TextCollection
from collections import Counter
from nltk.tokenize import word_tokenize
from operator import itemgetter



dialect = LazyCorpusLoader(
    'dialects1', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(egyptian|gulf|levantine|standardArabic)/.*',
    encoding="utf-8")
x = TextCollection(dialect)

# Split each file's word stream into 40-word chunks, each labelled with its
# dialect category.
sentences = [(list(dialect.words(fileid)[i:i+40]), category)
             for category in dialect.categories()
             for fileid in dialect.fileids(category)
             for i in range(0, len(dialect.words(fileid)), 40)]


# random.sample over the full length returns a shuffled copy
# without mutating the original list
shuffled_sentences = random.sample(sentences, len(sentences))
print('sentences count', len(sentences))

text = dialect.words()
print('words count',len(text))

#################### Test with getting topN ############################################################################
# all_words = nltk.FreqDist(w for w in dialect.words())
# Mcommon = all_words.most_common(4000)
# topN = [i[0] for i in Mcommon]
# print('finished topN')
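# The commented-out block above hints at a top-N bag-of-words approach. A
# minimal sketch (not part of the original script) of how that list could feed
# a feature extractor for the 40-word chunks; `chunk_features` is a
# hypothetical helper name.
import nltk

all_words = nltk.FreqDist(w for w in dialect.words())
topN = [w for (w, _) in all_words.most_common(4000)]

def chunk_features(chunk_words, top_words):
    # mark which of the top-N corpus words occur in this chunk
    chunk_set = set(chunk_words)
    return {'contains({})'.format(w): (w in chunk_set) for w in top_words}

featuresets = [(chunk_features(words, topN), category)
               for (words, category) in shuffled_sentences]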
Example #2
# Excerpt from an argparse-driven corpus-loading script: `args`, `reader_args`,
# `reader_kwargs`, `reader_class`, and `import re` come from earlier in the
# original file (see the sketch after this excerpt).
if args.cat_file:
	reader_kwargs['cat_file'] = args.cat_file
	
	if args.delimiter and args.delimiter != ' ':
		reader_kwargs['delimiter'] = args.delimiter
	
	if args.cat_pattern:
		reader_args.append(args.cat_pattern)
	else:
		reader_args.append('.+/.+')
elif args.cat_pattern:
	reader_args.append(args.cat_pattern)
	reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

categorized_corpus = LazyCorpusLoader(args.corpus, reader_class[args.reader],
	*reader_args, **reader_kwargs)
labels = categorized_corpus.categories()
nlabels = len(labels)

if args.trace:
	print('%d labels: %s' % (nlabels, labels))

if not nlabels:
	raise ValueError('corpus does not have any categories')
elif nlabels == 1:
	raise ValueError('corpus must have more than 1 category')
elif nlabels == 2 and args.multi:
	raise ValueError('corpus must have more than 2 categories if --multi is specified')
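# For context, a minimal sketch (assumed, not taken from the original script)
# of the argparse setup and `reader_class` lookup that the excerpt above relies
# on; only the options actually referenced above are wired up here.
import argparse
import re
from nltk.corpus.reader import (CategorizedPlaintextCorpusReader,
                                CategorizedTaggedCorpusReader)

reader_class = {
    'plaintext': CategorizedPlaintextCorpusReader,
    'tagged': CategorizedTaggedCorpusReader,
}

parser = argparse.ArgumentParser()
parser.add_argument('corpus')
parser.add_argument('--reader', default='plaintext')
parser.add_argument('--cat_file')
parser.add_argument('--cat_pattern')
parser.add_argument('--delimiter', default=' ')
parser.add_argument('--multi', action='store_true')
parser.add_argument('--trace', type=int, default=1)

args = parser.parse_args()
reader_args = []
reader_kwargs = {}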

########################
## text normalization ##
########################
interrogazioni = LazyCorpusLoader(
    'opp_interrogazioni_macro',
    CategorizedPlaintextCorpusReader,
    r'\d*', cat_file='cats.txt', cat_delimiter=','
)

print "computing FreqDist over all words"
all_words = nltk.FreqDist(w.lower() for w in interrogazioni.words())
top_words = all_words.keys()[:2000]


print "generating list of documents for each category"
documents = [
    (list(interrogazioni.words(fileid)), category)
    for category in interrogazioni.categories()
    for fileid in interrogazioni.fileids(category)
]
random.shuffle(documents)

print "building the classifier"
featuresets = [(document_features(d, top_words), c) for (d,c) in documents]
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

print "classifier accuracy: ", nltk.classify.accuracy(classifier, test_set)




Example #4
def loadClassifier(outputdir):
    # Assumed from the rest of the original project (not shown in this excerpt):
    # import os, random, nltk; SklearnClassifier from nltk.classify.scikitlearn;
    # the sklearn estimators MultinomialNB, BernoulliNB, LogisticRegression,
    # SGDClassifier, LinearSVC, NuSVC; and the helpers pickleLoad, pickleDump,
    # find_features, VoteClassifier and Accuracy.
    classifier_filename = os.path.join("pickled_algos", "voted_classifier.pickle") 
    word_features_filename = os.path.join("pickled_algos", "word_features.pickle")
    if os.path.exists(classifier_filename) and os.path.exists(word_features_filename):
        word_features = pickleLoad("word_features.pickle")
#        classifier = pickleLoad("originalnaivebayes.pickle")
#        MNB_classifier = pickleLoad("MNB_classifier.pickle")
#        BernoulliNB_classifier = pickleLoad("BernoulliNB_classifier.pickle")
#        LogisticRegression_classifier = pickleLoad("LogisticRegression_classifier.pickle")
#        SGDClassifier_classifier = pickleLoad("SGDClassifier_classifier.pickle")
#        LinearSVC_classifier = pickleLoad("LinearSVC_classifier.pickle")
#        
#        voted_classifier = VoteClassifier(classifier,
##                                  NuSVC_classifier,
#                                  LinearSVC_classifier,
#                                  SGDClassifier_classifier,
#                                  MNB_classifier,
#                                  BernoulliNB_classifier,
#                                  LogisticRegression_classifier)
        voted_classifier= pickleLoad("voted_classifier.pickle")
        return voted_classifier, word_features
    else:
        criticas_cine = LazyCorpusLoader(
                'criticas_cine', CategorizedPlaintextCorpusReader,
                r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*',
                encoding='utf-8')
#        criticas_cine = LazyCorpusLoader(
#                'criticas_cine_neu', CategorizedPlaintextCorpusReader,
#                r'(?!\.).*\.txt', cat_pattern=r'(neg|neu|pos)/.*',
#                encoding='utf-8')
            
        documents = [(list(criticas_cine.words(fileid)), category)
                     for category in criticas_cine.categories()
                     for fileid in criticas_cine.fileids(category)]
#            
#        document_pos = [(list(criticas_cine.words(fileid)), "pos")
#                        for fileid in criticas_cine.fileids("pos")]
#        document_neg = [(list(criticas_cine.words(fileid)), "neg")
#                        for fileid in criticas_cine.fileids("neg")]
#        document_neu = [(list(criticas_cine.words(fileid)), "neu")
#                        for fileid in criticas_cine.fileids("neu")]
        
        random.shuffle(documents)
        
#        random.shuffle(document_pos)
#        random.shuffle(document_neg)
#        random.shuffle(document_neu)
        
        all_words = []
        
        for w in criticas_cine.words():
            all_words.append(w.lower())
        
#        for w in criticas_cine.words():
#            if not is_filtered(w.lower()):
#                all_words.append(w.lower())
#        
        all_words = nltk.FreqDist(all_words)
        
        #print (all_words.most_common(50))
        
        # Filtering by type of word
        
#        for sample in all_words:
                    
        
        word_features = list(all_words.keys())[:3000]
        pickleDump(word_features, "word_features.pickle")
        
        featuresets = [(find_features(rev, word_features), category) for (rev, category) in documents]
        
#        featuresetpos = [(find_features(rev, word_features), category) for (rev, category) in document_pos]
#        featuresetneg = [(find_features(rev, word_features), category) for (rev, category) in document_neg]
#        featuresetneu = [(find_features(rev, word_features), category) for (rev, category) in document_neu]
        
#        training_set = featuresetpos[:1000]
#        training_set.extend(featuresetneg[:1000])
#        training_set.extend(featuresetneu[:1000])
#        testing_set = featuresetpos[1000:1273]
#        testing_set.extend(featuresetneg[1000:])
#        testing_set.extend(featuresetneu[1000:])

#        pos_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "pos"]
#        neu_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neu"]
#        neg_feat = [(featuresSet, category) for (featuresSet, category) in featuresets if category == "neg"]
                
        training_set = featuresets[:2000]
        testing_set =  featuresets[2000:]
        classifier = nltk.NaiveBayesClassifier.train(training_set)
#        pickleDump(classifier, "originalnaivebayes.pickle")
    
        NaiveBayesClassifierAccuracy = nltk.classify.accuracy(classifier, testing_set)
        
        print("Original Naive Bayes Algo accuracy percent:", (NaiveBayesClassifierAccuracy)*100)
        
        accuracy = Accuracy(classifier,testing_set)
        print(accuracy)
        # order: neu, neg, pos
#        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/3)
#        print("Discarded: ", (accuracy["neu"][0]+accuracy["neg"][1]+accuracy["pos"][0])/3)
#        print("Failed: ", (accuracy["neu"][1]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][1])/3)
#        print ("Pos:", nltk.classify.accuracy(classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(classifier, neg_feat)*100)
        classifier.show_most_informative_features(15)
        
        MNB_classifier = SklearnClassifier(MultinomialNB())
        MNB_classifier.train(training_set)
        MNB_classifierAccuracy = nltk.classify.accuracy(MNB_classifier, testing_set)
        print("MNB_classifier accuracy percent:", (MNB_classifierAccuracy)*100)
#        pickleDump(MNB_classifier, "MNB_classifier.pickle")
        
        BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
        BernoulliNB_classifier.train(training_set)
        BernoulliNB_classifierAccuracy = nltk.classify.accuracy(BernoulliNB_classifier, testing_set)
        print("BernoulliNB_classifier accuracy percent:", (BernoulliNB_classifierAccuracy)*100)
#        pickleDump(BernoulliNB_classifier, "BernoulliNB_classifier.pickle")
        
        LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
        LogisticRegression_classifier.train(training_set)
        LogisticRegression_classifierAccuracy = nltk.classify.accuracy(LogisticRegression_classifier, testing_set)
        print("LogisticRegression_classifier accuracy percent:", (LogisticRegression_classifierAccuracy)*100)
#        pickleDump(LogisticRegression_classifier, "LogisticRegression_classifier.pickle")
        
        SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
        SGDClassifier_classifier.train(training_set)
        SGDClassifier_classifierAccuracy = nltk.classify.accuracy(SGDClassifier_classifier, testing_set)
        print("SGDClassifier_classifier accuracy percent:", (SGDClassifier_classifierAccuracy)*100)
#        pickleDump(SGDClassifier_classifier, "SGDClassifier_classifier.pickle")
        
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(training_set)
        LinearSVC_classifierAccuracy = nltk.classify.accuracy(LinearSVC_classifier, testing_set)
        print("LinearSVC_classifier accuracy percent:", (LinearSVC_classifierAccuracy)*100)
#        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
#        SVC_classifier = SklearnClassifier(SVC())
#        SVC_classifier.train(training_set)
#        print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)
        
        NuSVC_classifier = SklearnClassifier(NuSVC())
        NuSVC_classifier.train(training_set)
        NuSVC_classifierAccuracy = nltk.classify.accuracy(NuSVC_classifier, testing_set)
        print("NuSVC_classifier accuracy percent:", (NuSVC_classifierAccuracy)*100)
        #        pickleDump(LinearSVC_classifier, "LinearSVC_classifier.pickle")
        
        
#        pickleDump([NaiveBayesClassifierAccuracy, 
#                    LinearSVC_classifierAccuracy,
#                    SGDClassifier_classifierAccuracy,
#                    MNB_classifierAccuracy,
#                    BernoulliNB_classifierAccuracy,
#                    LogisticRegression_classifierAccuracy], "accuracies.pickle")
        
        voted_classifier = VoteClassifier([classifier,NaiveBayesClassifierAccuracy],
                                          [NuSVC_classifier,NuSVC_classifierAccuracy],
                                          [LinearSVC_classifier,LinearSVC_classifierAccuracy],
                                          [SGDClassifier_classifier,SGDClassifier_classifierAccuracy],
                                          [MNB_classifier,MNB_classifierAccuracy],
                                          [BernoulliNB_classifier,BernoulliNB_classifierAccuracy],
                                          [LogisticRegression_classifier,LogisticRegression_classifierAccuracy])

        accuracy = Accuracy(voted_classifier,testing_set)
        print(accuracy)
        VoteClassifierAccuracy = nltk.classify.accuracy(voted_classifier, testing_set)
        print("VoteClassifier accuracy percent:", (VoteClassifierAccuracy)*100)
#        print ("Pos:", nltk.classify.accuracy(voted_classifier, pos_feat)*100)
#        print ("Neu:", nltk.classify.accuracy(voted_classifier, neu_feat)*100)
#        print ("Neg:", nltk.classify.accuracy(voted_classifier, neg_feat)*100)
        print("Accuracy: ", (accuracy["neg"][0]+accuracy["pos"][2])/2)
        print("Discarded: ", (accuracy["neu"][1]+accuracy["neg"][1]+accuracy["pos"][1])/2)
        print("Failed: ", (accuracy["neu"][0]+accuracy["neu"][2]+accuracy["neg"][2]+accuracy["pos"][0])/2)
        print("------------------------------------------");
                                          
        pickleDump(voted_classifier, "voted_classifier.pickle")

        return voted_classifier, word_features
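# The helpers used above (pickleDump, pickleLoad, find_features) are not part
# of this excerpt. A minimal sketch of what they might look like, assuming the
# pickles live under the "pickled_algos" directory that the path checks above
# point at.
import os
import pickle

def pickleDump(obj, filename):
    with open(os.path.join("pickled_algos", filename), "wb") as f:
        pickle.dump(obj, f)

def pickleLoad(filename):
    with open(os.path.join("pickled_algos", filename), "rb") as f:
        return pickle.load(f)

def find_features(document, word_features):
    # word-presence features, mirroring the document_features sketch above
    words = set(document)
    return {w: (w in words) for w in word_features}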
Example #5
def train_test_split(min=25):
    '''
    Builds a train/test split from the Reuters corpus. Keeps only documents
    whose single category appears in both the train and test sets, with an
    optional user-defined minimum number of tokens per document.

    Parameters:
    ------------
    min: (default = 25) The minimum number of tokens a document must contain
        to be kept.

    Returns:
    ------------
    train_set, train_target, test_set, test_target as lists
    '''
    #imports
    import re
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import CategorizedPlaintextCorpusReader

    #reading corpus
    reuters = LazyCorpusLoader('reuters',
                               CategorizedPlaintextCorpusReader,
                               '(training|test).*',
                               cat_file='cats.txt',
                               encoding='ISO-8859-2')

    documents = reuters.fileids()
    # splitting into train and test sets
    train_docs_id = list(filter(lambda doc: doc.startswith("train"),
                                documents))
    test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    #getting documents and their categories
    train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
    test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
    train_cat = [reuters.categories(doc_id) for doc_id in train_docs_id]
    test_cat = [reuters.categories(doc_id) for doc_id in test_docs_id]

    # formatting the train set: tokenizing and gathering stats for processing
    train_token_docs = []
    train_token_docs_length = []
    train_token_docs_unique = []
    for i in train_docs:
        tempy_tokens = tokenize(i)
        train_token_docs.append(" ".join(tempy_tokens))
        train_token_docs_length.append(len(tempy_tokens))
        train_token_docs_unique.append(len(set(tempy_tokens)))

    # formatting the test set: tokenizing and gathering stats for processing
    test_token_docs = []
    test_token_docs_length = []
    test_token_docs_unique = []
    for i in test_docs:
        tempy_tokens = tokenize(i)
        test_token_docs.append(" ".join(tempy_tokens))
        test_token_docs_length.append(len(tempy_tokens))
        test_token_docs_unique.append(len(set(tempy_tokens)))

    #removes any documents that do not meet the minimum tokens setting
    train_less_than_min = [
        n for n, i in enumerate(train_token_docs_length) if i < min
    ]
    test_less_than_min = [
        n for n, i in enumerate(test_token_docs_length) if i < min
    ]

    train_token_docs_more_than_min = [
        i for n, i in enumerate(train_token_docs)
        if n not in train_less_than_min
    ]

    test_token_docs_more_than_min = [
        i for n, i in enumerate(test_token_docs) if n not in test_less_than_min
    ]
    train_cat_more_than_min = [
        i for n, i in enumerate(train_cat) if n not in train_less_than_min
    ]
    test_cat_more_than_min = [
        i for n, i in enumerate(test_cat) if n not in test_less_than_min
    ]

    #getting documents with single categories
    #(corpus has some with multiple categories)
    cat_count_train = [len(i) for i in train_cat_more_than_min]
    cat_count_test = [len(i) for i in test_cat_more_than_min]

    single_cat_train = [n for n, i in enumerate(cat_count_train) if i == 1]
    single_cat_test = [n for n, i in enumerate(cat_count_test) if i == 1]

    train_single = [
        i for n, i in enumerate(train_token_docs_more_than_min)
        if n in single_cat_train
    ]
    test_single = [
        i for n, i in enumerate(test_token_docs_more_than_min)
        if n in single_cat_test
    ]
    train_single_cat = [
        i for n, i in enumerate(train_cat_more_than_min)
        if n in single_cat_train
    ]
    test_single_cat = [
        i for n, i in enumerate(test_cat_more_than_min) if n in single_cat_test
    ]

    train_cat_set = set([i[0] for i in train_single_cat])
    test_cat_set = set([i[0] for i in test_single_cat])

    mutual_cat = train_cat_set.intersection(test_cat_set)

    member_of_mutual_test = [
        n for n, i in enumerate(test_single_cat) if i[0] in mutual_cat
    ]
    member_of_mutual_train = [
        n for n, i in enumerate(train_single_cat) if i[0] in mutual_cat
    ]

    train_single2 = [
        i for n, i in enumerate(train_single) if n in member_of_mutual_train
    ]
    test_single2 = [
        i for n, i in enumerate(test_single) if n in member_of_mutual_test
    ]
    train_single_cat2 = [
        i for n, i in enumerate(train_single_cat)
        if n in member_of_mutual_train
    ]
    test_single_cat2 = [
        i for n, i in enumerate(test_single_cat) if n in member_of_mutual_test
    ]

    return train_single2, train_single_cat2, test_single2, test_single_cat2
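# `tokenize` is not defined in this excerpt; a minimal sketch consistent with
# the RegexpTokenizer import inside the function (lower-cased word tokens
# only), plus a usage example. Both are assumptions, not the original code.
from nltk.tokenize import RegexpTokenizer

_tokenizer = RegexpTokenizer(r'\w+')

def tokenize(text):
    return _tokenizer.tokenize(text.lower())


if __name__ == '__main__':
    train_set, train_target, test_set, test_target = train_test_split(min=25)
    print(len(train_set), 'training docs,', len(test_set), 'test docs')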