Exemplo n.º 1
0
def cross_validate(content_set, times, words, amount_of_words):
    """Run a `times`-fold cross-validation over `content_set` and print metrics.

    `content_set` is cut into `times` consecutive folds of equal size
    (``len(content_set) // times``).  For each fold a classifier is trained on
    everything outside the fold and evaluated on the fold itself; accuracy,
    precision, recall and F-measure are printed.  Nothing is returned.
    """
    total = len(content_set)
    fold_size = total // times
    extractor = Doc_extract(words, amount_of_words)
    for fold in xrange(times):
        lo = fold * fold_size
        hi = lo + fold_size
        # Train on everything outside [lo, hi); evaluate on the fold itself.
        outside_fold = content_set[:lo] + content_set[hi:]
        inside_fold = content_set[lo:min(hi, total)]
        train_set = apply_features(extractor, outside_fold)
        test_set = apply_features(extractor, inside_fold)
        classifier = get_trained_classifier(train_set)
        fold_accuracy = accuracy(classifier, test_set)
        print('\n{0} classifier\n\tAccuracy:  {1:.6}'.format(fold + 1, fold_accuracy))
        scores = get_f_measure(classifier, test_set)
        print('\tPrecision: {0:.6}\n\tRecall:    {1:.6}\n\tF_measure: {2:.6}'.format(*scores))
Exemplo n.º 2
0
def runTests(the_names):
    feature_sets = [(genderFeatures(n), g) for (n,g) in the_names]
    
    if False:
        train_set, test_set = feature_sets[500:], feature_sets[:500]
    else:
        train_set = apply_features(genderFeatures, the_names[500:])
        test_set  = apply_features(genderFeatures, the_names[:500])

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print '-----------------------------'

    if True:
        def getGender(name):
            return classifier.classify(genderFeatures(name))
        def testClassifier(name):
           print name, 'is', getGender(name)

        test_names = ['Peter', 'Catherine', 'John', 'Madeline', 'Mark', 'Tom', 
                      'Matt', 'Matthew', 'David', 'Julie',
                      'Chris', 'Morgan', 'Riley']
        for name in test_names:
            testClassifier(name)
        print '-----------------------------'

    if False:
        for (n,g) in the_names[:500]:
            predicted = getGender(n)
            print '%10s is' % n, '%6s' % predicted, '(%6s)' % g, '***' if predicted != g else ''

    if False:
        def getDesc(a_list):
            return (len(a_list), sorted(a_list))
        print 'Correct (male) =', getDesc([n for (n,g) in the_names[:500] if getGender(n)==g and g =='male'])
        print '---------------------------------------------------'
        print 'Correct (female) =', getDesc([n for (n,g) in the_names[:500] if getGender(n)==g and g =='female'])
        print '===================================================='
        print 'Incorrect (male) =', getDesc([n for (n,g) in the_names[:500] if getGender(n)!=g and g =='male'])
        print '---------------------------------------------------'
        print 'Incorrect (female) =', getDesc([n for (n,g) in the_names[:500] if getGender(n)!=g and g =='female'])

    num_correct = len([1 for (n,g) in the_names[:500] if getGender(n)==g])
    total = len(the_names[:500])
    print num_correct, total
    accuracy = num_correct/total
    print 'accuracy = %.02f' % accuracy
    print 'accuracy = %.02f' % nltk.classify.accuracy(classifier, test_set)  

    classifier.show_most_informative_features(10)
Exemplo n.º 3
0
def cross_validate(a_classiffier, content_set, number_folds, all_feats, amount_feats, type_feats):
    """Cross-validate a classifier and append a report per fold to result.txt.

    `a_classiffier` is indexed as (trainer, display_name): element 0 must
    expose ``train`` and element 1 is written into the report header.
    `content_set` is split into `number_folds` consecutive folds of size
    ``len(content_set) // number_folds``; each fold is used once as the test
    set while the rest is used for training.
    """
    with open('result.txt', 'a') as report:
        header = '\n\n\nClassiffier {0}, features is {1}'.format(a_classiffier[1], type_feats[8:])
        # Written to the file (not stdout) through file=report.
        print(header, file=report)
        extractor = Doc_feat_extractor(all_feats, amount_feats, type_feats)
        total = len(content_set)
        fold_size = total // number_folds
        for fold in xrange(number_folds):
            lo = fold * fold_size
            hi = lo + fold_size
            train_set = apply_features(extractor, content_set[:lo] + content_set[hi:])
            test_set = apply_features(extractor, content_set[lo:min(hi, total)])
            classifier = a_classiffier[0].train(train_set)
            y_true = [label for (_feats, label) in test_set]
            unlabeled = [feats for (feats, _label) in test_set]
            y_pred = classifier.batch_classify(unlabeled)
            print('\n{0} attempt\n'.format(fold + 1), classification_report(y_true, y_pred,), file=report)
            # Flush after every fold so partial results survive an interruption.
            report.flush()
Exemplo n.º 4
0
def naive_bayes_gender_classifier():
  from nltk.corpus import names
  names = ([(name, "male") for name in names.words("male.txt")] +
           [(name, "female") for name in names.words("female.txt")])
  random.shuffle(names)
#  featuresets = [(_gender_features(n), g) for (n,g) in names]
#  train_set, test_set = featuresets[500:], featuresets[:500]
  # advisable to stream the sets in for large data set.
  train_set = apply_features(_gender_features, names[500:])
  test_set = apply_features(_gender_features, names[:500])
  classifier = nltk.NaiveBayesClassifier.train(train_set)
  print "Neo is ", classifier.classify(_gender_features("Neo"))
  print "Trinity is", classifier.classify(_gender_features("Trinity"))
  # calculate the accuracy of the classifier
  print nltk.classify.accuracy(classifier, test_set)
  classifier.show_most_informative_features(5)
Exemplo n.º 5
0
def analyzeErrors(the_names):
    train_names = the_names[1500:]
    validation_names = the_names[500:1500]
    test_names = the_names[:500]

    train_set, validation_set, test_set = [apply_features(genderFeatures, n) 
                                           for n in [train_names, validation_names, test_names]]
    
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print 'validation accuracy = %.02f' % nltk.classify.accuracy(classifier, validation_set) 
    print '      test accuracy = %.02f' % nltk.classify.accuracy(classifier, test_set) 
    
    def getPrediction(name):
         return classifier.classify(genderFeatures(name))
    
    validation_results = [(n,g,getPrediction(n)) for (n,g) in validation_names]
    validation_results_incorrect = [(n,g,p) for (n,g,p) in validation_results if g!=p]
    validation_results_incorrect_male = [n for (n,g,p) in validation_results_incorrect if g=='male']
    validation_results_incorrect_female = [n for (n,g,p) in validation_results_incorrect if g=='female']
    print '--- %d males classified incorrectly ----------------------------------------' % len(validation_results_incorrect_male)
    print sorted(validation_results_incorrect_male)
    print '--- %d females classified incorrectly --------------------------------------' % len(validation_results_incorrect_female)
    print sorted(validation_results_incorrect_female)
    print len(validation_results_incorrect), 'incorrect in', len(validation_results)
    
    classifier.show_most_informative_features(20)

    def showPrediction(name):
        print name, 'is',  getPrediction(name)
        
        showPrediction('madeline')
 def initTrainingSet(self):
     """Populate self.trainingSet from the labeled tweets.

     Calls getTweetText() and getTerms() first, then maps
     self.extractFeatures over self.labeledTweets with apply_features.
     """
     self.getTweetText()
     self.getTerms()
     extractor = self.extractFeatures
     labeled = self.labeledTweets
     self.trainingSet = apply_features(extractor, labeled)
Exemplo n.º 7
0
    def train(self, num_samples):
        """uses connection to mongodb to read in tweets that contain :) and :(
            the size of the training data is 2 * num_samples"""
        
        print "Querying DB"
        # read samples from DB
        posTweets = self.db.get_N_results("\\:\\)",num_samples)
        negTweets = self.db.get_N_results("\\:\\(",num_samples)
        
        print "Query Returned"
        # read query results into memory
        posTweets = [(t["text"].split(" "),'positive') for t in posTweets]
        negTweets = [(t["text"].split(" "),'negative') for t in negTweets]

        labeled_tweets = posTweets + negTweets

        random.shuffle(labeled_tweets)

        print "compiling all words list"
        # extract all unique words from the tweets
        # these will be used as features
        self.all_words = self.get_all_words(labeled_tweets)

        # remove stop words
        self.all_words = self.all_words.difference(self.stop_words)
        print "num words: %d"%len(self.all_words)
        
        test_size = int(len(labeled_tweets) * TEST_SET_PROPORTION)

        # apply_features is a lazy loader, so that features are
        # computed as necessary, instead of being loaded into memory
        # all at once
        train_set = apply_features(self.document_features,labeled_tweets[:len(labeled_tweets) - test_size])
        test_set = apply_features(self.document_features,labeled_tweets[len(labeled_tweets) - test_size:])
        
        
        print "training"
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
        
        #print accuracy on test set
        print "Accuracy on "
        print(nltk.classify.accuracy(self.classifier, test_set))
Exemplo n.º 8
0
def category_by_name():
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    names = ([(name, 'male') for name in names.words('male.txt')] +[(name, 'female') for name in
    names.words('female.txt')])

    random.shuffle(names)

    def gender_features(word):
        return {'last_letter':word[-1]}

    train_set = apply_features(gender_features, names[500:])
    test_set = apply_features(gender_features, names[:500])

    classifier = NaiveBayesClassifier.train(train_set)
    print classifier.classify(gender_features('Neo')) 
    print classify.accuracy(classifier, train_set)
Exemplo n.º 9
0
def main():
    global best_words
    tweets = get_tweets_from_db()
    tweet_list = tweets[1000:1599000]
    test_list = tweets[:1000]+ tweets[1599000:]
    word_scores = create_word_scores()
    best_words = find_best_words(word_scores, 500000)
    f = open('bestwords.pickle', 'wb')
    pickle.dump(best_words, f)
    f.close()
    training_set = classify.apply_features(best_word_features, tweet_list)
    print "extracted features"
    # train the classifier with the training set
    classifier = NaiveBayesClassifier.train(training_set)
    print "trained classifier"
    # create the pickle file
    f = open('NBclassifier_new.pickle', 'wb')
    pickle.dump(classifier, f)
    f.close()
    print "created pickle"
    # test for precision and recall
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    test_set = classify.apply_features(best_word_features, test_list)
 
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)
     
    print 'neg precision:', metrics.precision(refsets['0'], testsets['0'])
    print 'neg recall:', metrics.recall(refsets['0'], testsets['0'])
    print 'pos precision:', metrics.precision(refsets['4'], testsets['4'])
    print 'pos recall:', metrics.recall(refsets['4'], testsets['4'])
    # test_set = classify.apply_features(extract_features, test_list)
    # print "extracted features"
    print classify.accuracy(classifier, test_set)
    print classifier.show_most_informative_features(30)
Exemplo n.º 10
0
    def get_train_set(self):
        """Documentar
        """
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando train_set."

        # Para não ocupar toda a memória RAM,
        # não armazena todos os documentos de uma vez nesta.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)

        return self._train_set
Exemplo n.º 11
0
def classify(training_set, test_set):
    training_set = apply_features(
        paragraph_features,
        training_set
    )
    
    test_set = apply_features(
        paragraph_features,
        test_set
    )

    print '\nTraining...'
    classifier = nltk.NaiveBayesClassifier.train(
        training_set
    )

    global count
    count = 0

    print '\nTesting...'

    results = nltk.classify.accuracy(classifier, test_set)

    print '\nAccuracy:', results
 def __init__(self, golden_data, feature_extractor):
     self.feature_extractor = feature_extractor
     train_set = []
     
     train_set = apply_features(feature_extractor, golden_data)
     
     """numtimes = 0
     for (data, bio_type) in golden_data:
         numtimes += 1
         if numtimes % 100 == 0:
             print numtimes
         featureset = self.feature_extractor(data)
         train_set.append( (featureset, bio_type) )
     print "about to train"""
         
     self.classifier = nltk.NaiveBayesClassifier.train(train_set)
     print "done training"
Exemplo n.º 13
0
    def get_featuresets(self):
        """Configura os featuresets que são construídos na
        seguinte estrutura:
            (features_do_documento, categoria)
        
        Retorna uma lista de featuresets
        """
        if self._featuresets is None:
            
            if self._documentos is None:
                self.get_documentos()

            print "-- Recuperando featuresets."

            self._featuresets = apply_features(Documento.get_features, self._documentos)
        
        return self._featuresets
Exemplo n.º 14
0
def get_words_in_tweets(tweet):
    """Flatten (words, sentiment) pairs into one list of words, in order."""
    return [word for words, _sentiment in tweet for word in words]

def get_words_features(wordlist):
    """Return the keys of a FreqDist built from `wordlist`."""
    frequencies = FreqDist(wordlist)
    return frequencies.keys()

# Vocabulary read by extract_features: the FreqDist keys of every word found
# in `tweet`.
word_features = get_words_features(get_words_in_tweets(tweet))

#print word_features

def extract_features(document):
    """Map every word in word_features to whether `document` contains it.

    Returns a dict keyed 'contains(<word>)' with boolean values.
    """
    present = set(document)
    return dict(('contains(%s)' % word, word in present) for word in word_features)

# Feature sets for `tweet`, produced by mapping extract_features over it.
training_set = classify.apply_features(extract_features, tweet) 
#print training_set

classifier = NaiveBayesClassifier.train(training_set)
#print classifier.show_most_informative_features(32)
# NOTE(review): accuracy is measured on the same data the classifier was
# trained on; no held-out set is built in this snippet.
print classify.accuracy(classifier, training_set)

# Classify one hand-written sentence, tokenised on whitespace.
t = 'Larry is not my friend'
print  classifier.classify(extract_features(t.split()))
Exemplo n.º 15
0
    def __init__(
        self,
        dir,
        testDir=None,
        doTest=True,
        ignoreKlass=[],
        includeKlass=None,
        extractor="ArticleExtractor",
        useHtml=False,
    ):
        """Train a Naive Bayes classifier on the reader's documents.

        `dir` and `testDir` are forwarded to RssDataReader.__init__.
        `ignoreKlass` and `includeKlass` are forwarded to self.klasses() to
        select the classes; the default list is never mutated here.
        When `useHtml` is true each document is passed through
        Extractor(extractor=extractor, html=...) to obtain its text.
        When `doTest` is true the classifier is run on the test documents;
        misclassifications, a confusion matrix and the 300 most informative
        features are reported.
        """
        RssDataReader.__init__(self, dir, testDir)
        logger.info("Start building " + self.__class__.__name__)
        self.__mutex = threading.Semaphore()

        freqDists = {}
        # A set makes the per-token stopword test O(1) instead of a list scan.
        ignore = set(stopwords.words("english"))
        features = set()
        klassSize = {}
        documentsWithLabel = []
        for klassId in self.klasses(ignoreKlass, includeKlass):
            freqDist = FreqDist()
            size = 0
            for url, document in self.documents(klassId, useHtml):
                try:
                    txt = document if not useHtml else Extractor(extractor=extractor, html=document).getText()
                    documentsWithLabel.append((txt, klassId))
                    txt = tokenize(txt)
                    size += 1
                    for part in txt:
                        if part.isalnum() and part not in ignore:
                            freqDist.inc(part)
                            features.add(part)
                except Exception:
                    # Best effort: log the failing document and carry on.
                    # `except Exception` (not a bare except) lets
                    # KeyboardInterrupt / SystemExit propagate.
                    logger.exception(u"Url: " + url)
            freqDists[klassId] = freqDist
            klassSize[klassId] = size

        random.shuffle(documentsWithLabel)

        self.__featuresGenerator = FeatureGenerator(freqDists, features, klassSize)
        trainset = apply_features(self.__featuresGenerator, documentsWithLabel)
        self.__classifier = NaiveBayesClassifier.train(trainset)
        logger.info(u"Classifier learned (set size=" + unicode(len(trainset)) + u")")
        if doTest:
            ref = []
            test = []
            testDocumentsWithLabel = [
                (
                    document if not useHtml else Extractor(extractor=extractor, html=document).getText(),
                    correctKlass,
                    url,
                )
                for correctKlass in self.klasses(ignoreKlass, includeKlass)
                for url, document in self._testDocuments(correctKlass, useHtml)
            ]
            for doc, cat, url in testDocumentsWithLabel:
                ans = self.__classifier.classify(self.__featuresGenerator(doc))
                ref.append(cat)
                test.append(ans)
                if ans != cat:
                    logger.info(u"Wrong " + ans + u"(" + cat + u"):\t" + url + u" " + doc.replace("\n", " "))
            logger.info("\n" + ConfusionMatrix(ref, test).pp())
            self.__classifier.show_most_informative_features(n=300)
Exemplo n.º 16
0
    ]
    return msg_words


def get_features(msg):
    """Return a dict mapping each word of `msg` to True.

    The words are those yielded by get_msg_words(msg, stopwords).
    """
    return dict.fromkeys(get_msg_words(msg, stopwords), True)


training = []
for file in glob.glob("data/lissa_train/*"):
    training.append((open(file, "r").read(), "lissa"))
for file in glob.glob("data/random_train/*"):
    training.append((open(file, "r").read(), "not_lissa"))

testing = []
for file in glob.glob("data/lissa_test/*"):
    testing.append((open(file, "r").read(), "lissa"))
for file in glob.glob("data/random_test/*"):
    testing.append((open(file, "r").read(), "not_lissa"))

train_set = apply_features(get_features, training)
test_set = apply_features(get_features, testing)

cl = NaiveBayesClassifier.train(train_set)

print cl.show_most_informative_features(100)
print nltk.classify.accuracy(cl, test_set)
Exemplo n.º 17
0
# Eagerly build every feature set, then hold out the first 2500 for testing.
featuresets = [(gender_features_better(n), g) for (n,g) in names]
train_set, test_set = featuresets[2500:], featuresets[:2500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# The two classify() results below are not stored or printed.
classifier.classify(gender_features_better('Neo'))

classifier.classify(gender_features_better('Trinity'))

# pg. 224

print nltk.classify.accuracy(classifier, test_set)

classifier.show_most_informative_features(5)

# Rebuild both sets with apply_features (and gender_features) instead of the
# eager list above; the classifier is not retrained on them in this snippet.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[2500:])
test_set = apply_features(gender_features, names[:2500])

# pg. 225

def gender_features2(name):
    """Return first/last letter plus per-letter count and presence features.

    Keys are "firstletter", "lastletter" and, for every letter a-z,
    "count(<letter>)" and "has(<letter>)"; everything is computed on the
    lower-cased name.
    """
    features = {
        "firstletter": name[0].lower(),
        "lastletter": name[-1].lower(),
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = letter in lowered
    return features

# Sample call; the returned feature dict is not stored or printed here.
gender_features2('John')
Exemplo n.º 18
0
import random
import mongo_db
from nltk.classify import apply_features
import re
from matplotlib import pyplot as plt

def gender_features(word):
    """Return the last one and last two characters of `word` as features."""
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features
             
# Label every corpus name with its gender.
names = ([(name, 'male') for name in names.words('male.txt')] +
          [(name, 'female') for name in names.words('female.txt')]) 
# User ids loaded from the "reciprocal_user_features" collection of "project3".
user_names = [user['id'] for user in mongo_db.load_from_mongo("project3", "reciprocal_user_features")]
random.shuffle(names)
# NOTE(review): featuresets is not read again in the visible part of this script.
featuresets = [(gender_features(n), g) for (n,g) in names]
# NOTE(review): the classifier is trained on ALL names and test_set2 is the
# first 500 of those same names, so the accuracy below is measured on data
# seen during training.
train_set = apply_features(gender_features, names)
test_set2 = apply_features(gender_features, names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set2)

count = 0
for username in user_names:
    if username.startswith('San') or username.startswith('Francisco') or username.startswith('SF') or username.endswith('Francisco') or username.endswith('SF') :
            count +=1            
            user_names.remove(username)
print count            

   
re.sub(ur'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', u'', username)
for username in user_names:
Exemplo n.º 19
0
#word_features = get_words_in_tweets(training)
# Build the vocabulary from the stored training data and persist it
# (third argument 3 is passed to joblib.dump as in the other dump calls).
training = joblib.load('models/training_compressed.pkl')
word_features = get_word_features(get_words_in_tweets(training))
joblib.dump(word_features, 'models/word_features_compressed.pkl', 3)
#print word_features

# NOTE(review): both objects are loaded again from the files read/written
# just above.
training = joblib.load('models/training_compressed.pkl')
word_features = joblib.load('models/word_features_compressed.pkl')


def extract_features(document):
    """Flag which words of the module-level `word_features` occur in `document`.

    `document` is an iterable of words.  Returns a dict keyed
    'contains(<word>)' with a boolean per word in `word_features`.
    """
    seen = set(document)
    flags = {}
    for candidate in word_features:
        key = 'contains(%s)' % candidate
        flags[key] = candidate in seen
    return flags


#print extract_features(training[0][0])

# Map extract_features over the training data, train a Naive Bayes
# classifier on the result and persist it with joblib.
training_set = classify.apply_features(extract_features, training)
#print training_set
classifier = NaiveBayesClassifier.train(training_set)
joblib.dump(classifier, 'models/classifier_compressed.pkl', 3)
Exemplo n.º 20
0
 def getAccuracy(self, documents):
     """Return the accuracy of self.classifier on `documents`.

     `documents` is materialised into a list and mapped through
     Classifier.extractFeatures before scoring.
     """
     from nltk.classify import apply_features
     from nltk.classify.util import accuracy as measure_accuracy
     materialized = list(documents)
     labeled_featuresets = apply_features(Classifier.extractFeatures, materialized)
     return measure_accuracy(self.classifier, labeled_featuresets)
def gender_features(word):
	"""Return the last character of `word` under the key 'last_letter'."""
	final_char = word[-1]
	return {'last_letter': final_char}
#end

 

 
import random
import nltk 


names = ([(name,'male') for name in ('D:\Project_files_notes\Gender-Identifier\male.txt')] + [(name,'female') for name in ('D:\Project_files_notes\Gender-Identifier\female.txt')])




random.shuffle(names)
featuresets = [(gender_features(n),g) for (n,g) in names]
from nltk.classify import apply_features 
train_set = apply_features(gender_features,names[86:])
test_set = apply_features(gender_features, names[:85])
classifier = nltk.NaiveBayesClassifier.train(train_set)

print classifier.classify(gender_features('ravi'))


print nltk.classify.accuracy(classifier, test_set)

classifier.show_most_informative_features(5)

Exemplo n.º 22
0
# Feature extractor: the last character of the word.
gender_features = lambda word:{'last_letter':word[-1]}
from nltk.corpus import names
# Label every corpus name with its gender (rebinds `names` to the list).
names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
from sklearn.model_selection import train_test_split
feature_sets = [(gender_features(name), gender) for name, gender in names]
# Hold out 33% of the feature sets for testing.
train_set, test_set = train_test_split(feature_sets, test_size=0.33)

clf = nltk.NaiveBayesClassifier.train(train_set)
# The result of this classify() call is not stored or printed.
clf.classify(gender_features("Neo"))
print(nltk.classify.accuracy(clf, test_set))
clf.show_most_informative_features(5)

from nltk.classify import apply_features
# Rebinds train_set to a mapping over the first 500 names; clf is not
# retrained on it in this snippet.
train_set = apply_features(gender_features, names[:500])

from nltk.corpus import movie_reviews
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NOTE(review): takes the first 2000 keys in the FreqDist's key order --
# confirm that this order is by frequency for the NLTK version in use.
words_features = list(all_words.keys())[:2000]
def document_features(document):
	"""Flag which words of the module-level `words_features` occur in `document`.

	Returns a dict keyed 'contains(<word>)' with boolean values.
	"""
	vocabulary = set(document)
	return {'contains(%s)' % word: (word in vocabulary) for word in words_features}
# Show the feature dict for one review.
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

from nltk.corpus import movie_reviews
# One (word_list, category) pair per review file.
documents = [(list(movie_reviews.words(fileid)), category)
			 for category in movie_reviews.categories()
			 for fileid in movie_reviews.fileids(category)]
# Eagerly computed feature sets for every document.
featuresets = [(document_features(d), c) for (d, c) in documents]
# Classify two sample names with the previously trained `classifier`.
x_Neo = classifier.classify(gender_features('Neo'))
print('x_Neo: ', x_Neo)

x_Trinity = classifier.classify(gender_features('Trinity'))
print('x_Trinity: ', x_Trinity)

# Evaluate the classifier systematically
ratio = nltk.classify.accuracy(classifier, X_test)
print('准确率: ', ratio)

# Most informative features
classifier.show_most_informative_features(5)

from nltk.classify import apply_features

# Map gender_features over the names from index 500 on and print the result
# next to X_train.
train_set = apply_features(gender_features, names[500:])
print('train_set: ', train_set)
print('X_train: ', X_train)

# 2. Choosing the right features
# Feature extractor that overfits the gender features
print('\n特征提取器过拟合性别特征:')


def gender_features2(name):
    """Return first/last letter plus per-letter count and presence features.

    Keys are 'firstletter', 'lastletter' and, for every letter a-z,
    'count(<letter>)' and 'has(<letter>)'; everything is computed on the
    lower-cased name.
    """
    features = {}
    features['firstletter'] = name[0].lower()
    features['lastletter'] = name[-1].lower()
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = (letter in lowered)
    # Without this return the function built the dict and then yielded None.
    return features
def classifyTestData(filename, classifier):
	"""Return the accuracy of `classifier` on the labeled data in `filename`.

	The data comes from buildLabelData(filename) and is mapped through
	charNgramfeatureDict with apply_features before scoring.
	"""
	labeled = buildLabelData(filename)
	featuresets = apply_features(charNgramfeatureDict, labeled)
	return nltk.classify.accuracy(classifier, featuresets)
Exemplo n.º 25
0
import nltk
# nltk.download()

from nltk.corpus import names
import random


def gender_features(word):
    """Return the last character of `word` under the key 'last_letter'."""
    features = dict()
    features['last_letter'] = word[-1]
    return features


# Label every corpus name with its gender (rebinds `names` to the list).
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
# print(names)

random.shuffle(names)
featureset = [(gender_features(n), g) for (n, g) in names]

# print(featureset)
# First 500 feature sets are held out for testing.
train_set, test_set = featureset[500:], featureset[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(classifier)

# print(classifier.classify(gender_features("Neo")))

# print(nltk.classify.accuracy(classifier, test_set))
# classifier.show_most_informative_features(5)

from nltk.classify import apply_features
# apply_features takes the extractor itself plus the raw (name, gender)
# pairs.  The original called gender_features() with no argument, which
# raises TypeError, and passed the already-extracted feature sets.
train_set = apply_features(gender_features, names[500:])
        text += sh.cell(i, 8).value
    if sh.cell(i, 9).value:
        text += sh.cell(i, 9).value
    if not text or type(text) is float:
        continue
    else:
        text = ''.join(ch for ch in text if ch not in exclude)
    if category:
        categorized.append((text, category))
    else:
        uncategorized.append((text, category))

# Shuffle the categorized documents so that the order in which they are in the excel file does not affect the results
random.shuffle(categorized)

all_words = []
for d in categorized:
    all_words += d[0].split()

# Fetch the stopword list once and keep it in a set.  The original called
# stopwords.words('dutch') again for every single word and scanned the
# resulting list each time.
dutch_stopwords = set(stopwords.words('dutch'))

# Take the 2000 most relevant words
# NOTE(review): the stopword test runs before lower-casing, so capitalised
# stopwords are not filtered out -- confirm whether that is intended.
all_words = nltk.FreqDist(w.lower() for w in all_words if (w not in dutch_stopwords and len(w) > 1))
# NOTE(review): relies on FreqDist.keys() returning a list that can be
# sliced -- confirm its ordering for the NLTK version in use.
word_features = all_words.keys()[:2000]

# Take the 500 first registers for testing and the rest for training
test_set = apply_features(document_features, categorized[:500])
train_set = apply_features(document_features, categorized[500:])

# Train the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

print (nltk.classify.accuracy(classifier, test_set) * 100)
Exemplo n.º 27
0
# Predictions for two sample names, each report section followed by a
# 40-dash separator line.
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print("-" * 40)

# Accuracy on the held-out test_set.
print(nltk.classify.accuracy(classifier, test_set))
print("-" * 40)

classifier.show_most_informative_features(5)
print("-" * 40)


def gender_features(word):
    """Return last letter, length and first letter of `word` as features."""
    features = {}
    features['last_letter'] = word[-1]
    features['length'] = len(word)
    features['first_letter'] = word[0]
    return features
from nltk.classify import apply_features
# Hold out the first 500 labeled names for testing; train on the rest.
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
print("-" * 40)


# Section banner for the next part of the script.
print("""
----------------------------------------------------------------------
1.2  Choosing The Right Features
----------------------------------------------------------------------
""")


def gender_features2(name):
    # NOTE(review): only this first statement is present here, so as written
    # the function creates an empty dict and implicitly returns None. The
    # rest of the body appears to be missing -- confirm against the source
    # this snippet came from.
    features = {}
Exemplo n.º 28
0
def Naive_Bayes_classify(input_df,
                         feature_column,
                         label_column,
                         process_text=False,
                         test_size=.2,
                         random_state=1,
                         feature_fct=None,
                         most_informative_features=10,
                         show_info=True,
                         return_fscore=True):
    """Train and evaluate an NLTK Naive Bayes classifier on a DataFrame.

    Args:
        input_df: source DataFrame; it is copied, not modified.
        feature_column: column holding the text to classify.
        label_column: column holding the labels.
        process_text: if true the text goes through process_sow (with a
            progress bar); otherwise through process_sow_quick.
        test_size, random_state: forwarded to train_test_split.
        feature_fct: optional feature extractor taking a text and returning
            a feature dict.  The default uses the token count and the three
            most common tokens.
        most_informative_features: how many informative features to show.
        show_info: if true, print the scores and informative features and
            draw a confusion-matrix heatmap.
        return_fscore: if true, return the scores.

    Returns:
        The (f, cf) pair from single_label_f_score when `return_fscore` is
        true, otherwise None.
    """
    df = input_df.copy()
    if process_text:
        df[feature_column] = df[feature_column].progress_apply(process_sow)
    else:
        df[feature_column] = df[feature_column].apply(process_sow_quick)
    data_set = list(zip(df[feature_column], df[label_column]))
    if feature_fct:
        get_features = feature_fct
    else:

        def get_features(text):
            tokens = text.split(' ')
            fd = nltk.FreqDist(tokens)
            top_words = [word for word, _count in fd.most_common(3)]
            # Pad with None so texts with fewer than three distinct tokens
            # no longer raise IndexError.
            top_words += [None] * (3 - len(top_words))
            return {
                "sow length": len(tokens),
                "1st word": top_words[0],
                "2nd word": top_words[1],
                "3rd word": top_words[2],
            }

    X_train, X_test = train_test_split(data_set,
                                       test_size=test_size,
                                       random_state=random_state)
    train_set = apply_features(get_features, X_train)
    test_set = apply_features(get_features, X_test)
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    y_pred = []
    y_test = []
    for featureset, label in test_set:
        y_pred.append(classifier.classify(featureset))
        y_test.append(label)
    f, cf = single_label_f_score(y_gold=y_test, y_pred=y_pred)
    if show_info:
        print('f-score:', f)
        print('label wise f-score', cf)
        # show_most_informative_features prints its own table; printing its
        # return value only added a stray "None".
        classifier.show_most_informative_features(most_informative_features)
        # Pass one explicit, sorted label list to both the matrix and the
        # heatmap so the tick labels are guaranteed to match the rows and
        # columns (list(set(...)) has arbitrary order and may include
        # labels that never occur in the test split).
        tick_labels = sorted(set(y_test) | set(y_pred))
        conf_mat = confusion_matrix(y_test, y_pred, labels=tick_labels)
        fig, ax = plt.subplots(figsize=(4, 4))
        sns.heatmap(conf_mat,
                    annot=True,
                    cmap="Blues",
                    fmt='d',
                    xticklabels=tick_labels,
                    yticklabels=tick_labels)
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title("NaiveBayes CONFUSION MATRIX", size=16)
    if return_fscore:
        return f, cf
Exemplo n.º 29
0
for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# Frequency distribution over every lower-cased word in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NOTE(review): takes the first 2000 words in the FreqDist's iteration
# order -- confirm that this order is by frequency for the NLTK version in use.
word_features = list(all_words)[:2000]

def document_features(document):
	"""Flag which words of the module-level `word_features` occur in `document`.

	Returns a dict keyed "contains(<word>)" with boolean values.
	"""
	present = set(document)
	return dict(
		("contains({})".format(candidate), candidate in present)
		for candidate in word_features
	)

def document_features2(document):
	"""Count how often each word of `word_features` occurs in `document`.

	Returns a dict keyed "count(<word>)"; words that do not occur map to 0.
	"""
	tally = Counter(document)
	return dict(
		("count({})".format(candidate), tally[candidate])
		for candidate in word_features
	)

# Hold out the first 20% of the documents for testing; train on the rest.
length = len(documents)
test_size = int(0.2 * length)
train_set = apply_features(document_features2, documents[test_size:])
test_set = apply_features(document_features2, documents[:test_size])

classifier = nltk.NaiveBayesClassifier.train(train_set)

print nltk.classify.accuracy(classifier, test_set)

classifier.show_most_informative_features(5)
# First 2000 keys of all_words.
# NOTE(review): slicing keys() requires it to return a list -- confirm the
# Python / NLTK version and the key ordering.
word_features = all_words.keys()[:2000]

# Get the top synsets in the document from the top 2000 words
synset_features = synsets(word_features)


def document_features(document):
    """Build boolean synset-presence features for one tokenised document.

    The document's synsets are collected from the ``synsets`` helper applied
    to its distinct words, extended with the stringified ``wordnet.synsets``
    of each of those words.  The returned dict maps every entry of the
    module-level ``synset_features`` to whether it is in that collection.
    """
    vocabulary = set(document)
    found = synsets(vocabulary)

    for token in vocabulary:
        found.update(str(sense) for sense in wordnet.synsets(token))

    # Word-level 'contains(...)' features are disabled in this variant; only
    # synset features are emitted.
    return {name: (name in found) for name in synset_features}

# The first 100 documents are held out for testing; the rest are for training.
train_set = apply_features(document_features, documents[100:])
test_set = apply_features(document_features, documents[:100])

# Parenthesised single-argument print works both as the Python 2 print
# statement and as the Python 3 print function.
print('training classifier')
classifier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)
def trainNaivebayes(trainFile):
    """Train and return a Naive Bayes classifier built from *trainFile*.

    *trainFile* is handed unchanged to ``buildLabelData``; the labelled data
    it returns is featurised with ``charNgramfeatureDict`` through
    ``apply_features`` and used to train ``nltk.NaiveBayesClassifier``.
    """
    labelled = buildLabelData(trainFile)
    featuresets = apply_features(charNgramfeatureDict, labelled)
    return nltk.NaiveBayesClassifier.train(featuresets)
Exemplo n.º 32
0
 def _getPredictedAndLabeled(self, documents):
     """Return ``(predicted, labeled)`` arrays for *documents*.

     *documents* is materialised into a list of two-element items whose
     second element is the label.  Features are extracted with
     ``Classifier.extractFeatures`` and classified in one batch by
     ``self.classifier``; the second array holds the labels in the same order.
     """
     from nltk.classify import apply_features
     labelled = list(documents)
     featuresets = apply_features(Classifier.extractFeatures, labelled)
     predicted = self.classifier.batch_classify(
         [features for (features, _) in featuresets])
     gold = [label for (_, label) in labelled]
     return (array(predicted), array(gold))
# 'is_female'
# Parenthesised single-argument print works both as the Python 2 print
# statement and as the Python 3 print function.
print(nltk.classify.accuracy(classifier, test_set))
# 0.758
classifier.show_most_informative_features(5)

# Most Informative Features
#              last_letter = 'a'            female : male   =     38.3 : 1.0
#              last_letter = 'k'              male : female =     31.4 : 1.0
#              last_letter = 'f'              male : female =     15.3 : 1.0
#              last_letter = 'p'              male : female =     10.6 : 1.0
#              last_letter = 'w'              male : female =     10.6 : 1.0



from nltk.classify import apply_features
# The first 500 names are held out for testing; the rest are for training.
train_set = apply_features(g_f, names[500:])
test_set = apply_features(g_f, names[:500])

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Parenthesised single-argument print works both as the Python 2 print
# statement and as the Python 3 print function.
print(classifier.classify(g_f('Neo')))
# 'is_male'
print(classifier.classify(g_f('Trinity')))
# 'is_female'
print(nltk.classify.accuracy(classifier, test_set))
# 0.758
classifier.show_most_informative_features(5)
# Most Informative Features
#              last_letter = 'a'            female : male   =     38.3 : 1.0
#              last_letter = 'k'              male : female =     31.4 : 1.0
#              last_letter = 'f'              male : female =     15.3 : 1.0
Exemplo n.º 34
0
 def trainClassifier(self, documents):
     """Train ``self.nltkClassifier`` on *documents*.

     *documents* is materialised into a list, featurised with
     ``Classifier.extractFeatures`` through ``apply_features``, and the
     trained model is stored on ``self.classifier``.  Nothing is returned.
     """
     from nltk.classify import apply_features
     self.classifier = self.nltkClassifier.train(
         apply_features(Classifier.extractFeatures, list(documents)))
Exemplo n.º 35
0
# Build (name, gender) pairs from the male.txt and female.txt word lists.
labeled_names = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')])

# Shuffle the order
random.shuffle(labeled_names)

# Baseline: gender_features, first 500 names for testing, the rest for training.
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classfier = nltk.NaiveBayesClassifier.train(train_set)

print(nltk.classify.accuracy(classfier, test_set))

# Three contiguous, non-overlapping slices that together cover the whole
# list: [0:500] test, [500:1500] dev-test, [1500:] training.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
train_set1 = apply_features(gender_features2, train_names)
devtest_set = apply_features(gender_features2, devtest_names)
test_set1 = apply_features(gender_features2, test_names)
classfier1 = nltk.NaiveBayesClassifier.train(train_set1)

print(nltk.classify.accuracy(classfier1, devtest_set))

# Collect every dev-test name the second classifier gets wrong, then list
# the mistakes sorted by (correct tag, guess, name).
error = []
for (name, tag) in devtest_names:
    guess = classfier1.classify(gender_features2(name))
    if guess != tag:
        error.append((tag, guess, name))
for (tag, guess, name) in sorted(error):
    print(
        'correct = {:<8} guess = {:<8s} name={:<30}'.format(tag, guess, name))
Exemplo n.º 36
0
Arquivo: chap6.py Projeto: jbbe/lang
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features


# Build (name, gender) pairs from the male.txt and female.txt word lists.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

# Shuffle in place before the positional train/test slicing below.
random.shuffle(labeled_names)
# Just last letter gave acc of 0.752
# featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# The first 500 shuffled names form the test split; the rest train the classifier.
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)

# print(classifier.classify(gender_features('Neo')))
# print(classifier.classify(gender_features('Trinity')))
# print(classifier.show_most_informative_features(5))

# Three-way split of the same shuffled list: [0:500] test, [500:1500]
# dev-test, [1500:] training.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]
# def gender_features(word):
# return {'suffix1': word[-1:], 'suffix2': word[-2:]}

# Rebinds train_set (previously the apply_features result over
# labeled_names[500:]) to an eagerly built list over train_names only.
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]