def cross_validate(content_set, times, words, amount_of_words):
    """Run a contiguous k-fold cross-validation and print per-fold metrics.

    ``content_set`` is cut into ``times`` consecutive folds of
    ``len(content_set) // times`` items each.  Every fold is used once as the
    test set while all items outside the fold form the training set.  Items
    left over by the integer division never end up in a test fold.

    For each fold the accuracy and the three values returned by
    ``get_f_measure`` (printed as precision, recall and F-measure) are
    written to stdout.  Nothing is returned.
    """
    total = len(content_set)
    fold_size = total // times
    extractor = Doc_extract(words, amount_of_words)

    for fold in xrange(times):
        lo = fold * fold_size
        hi = (fold + 1) * fold_size

        # Training data first, then the held-out fold.
        train_set = apply_features(extractor, content_set[:lo] + content_set[hi:])
        test_set = apply_features(extractor, content_set[lo:min(hi, total)])

        classifier = get_trained_classifier(train_set)
        acc = accuracy(classifier, test_set)
        print('\n{0} classifier\n\tAccuracy: {1:.6}'.format(fold + 1, acc))

        scores = get_f_measure(classifier, test_set)
        print('\tPrecision: {0:.6}\n\tRecall: {1:.6}\n\tF_measure: {2:.6}'.format(*scores))
def runTests(the_names):
    """Train a Naive Bayes gender classifier on ``the_names`` and report on it.

    ``the_names`` is a sequence of ``(name, gender)`` pairs.  The first 500
    pairs are held out for testing; the remaining pairs are used for training.

    Written to stdout: the predicted gender of a fixed list of sample names,
    the number of correct held-out predictions and the held-out size, the
    accuracy computed by hand and by ``nltk.classify.accuracy``, and the ten
    most informative features.  Nothing is returned.
    """
    # Diagnostic switches (both were hard-coded off in the original).
    show_each_prediction = False
    show_name_breakdown = False

    held_out = the_names[:500]
    train_set = apply_features(genderFeatures, the_names[500:])
    test_set = apply_features(genderFeatures, held_out)
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    def getGender(name):
        return classifier.classify(genderFeatures(name))

    print('-----------------------------')
    test_names = ['Peter', 'Catherine', 'John', 'Madeline', 'Mark', 'Tom',
                  'Matt', 'Matthew', 'David', 'Julie', 'Chris', 'Morgan',
                  'Riley']
    for name in test_names:
        print('%s is %s' % (name, getGender(name)))
    print('-----------------------------')

    # Classify every held-out name exactly once; all reports below reuse it.
    outcomes = [(n, g, getGender(n)) for (n, g) in held_out]

    if show_each_prediction:
        for (n, g, predicted) in outcomes:
            marker = '***' if predicted != g else ''
            print('%10s is %6s (%6s) %s' % (n, predicted, g, marker))

    if show_name_breakdown:
        def getDesc(a_list):
            return (len(a_list), sorted(a_list))

        def select(gender, want_correct):
            return getDesc([n for (n, g, p) in outcomes
                            if g == gender and (p == g) == want_correct])

        print('Correct (male) = %s' % (select('male', True),))
        print('---------------------------------------------------')
        print('Correct (female) = %s' % (select('female', True),))
        print('====================================================')
        print('Incorrect (male) = %s' % (select('male', False),))
        print('---------------------------------------------------')
        print('Incorrect (female) = %s' % (select('female', False),))

    num_correct = sum(1 for (_n, g, p) in outcomes if p == g)
    total = len(held_out)
    print('%d %d' % (num_correct, total))
    # float() forces true division: on Python 2 ``int / int`` truncates, which
    # made this figure 0.00 unless every single prediction was right.
    accuracy = float(num_correct) / total if total else 0.0
    print('accuracy = %.02f' % accuracy)
    print('accuracy = %.02f' % nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(10)
def cross_validate(a_classiffier, content_set, number_folds, all_feats, amount_feats, type_feats):
    """Cross-validate a classifier and append one report per fold to ``result.txt``.

    ``a_classiffier`` is indexed as a pair: element 0 must offer ``train``,
    element 1 is printed in the report header.  ``content_set`` is cut into
    ``number_folds`` consecutive folds of ``len(content_set) // number_folds``
    items; each fold is used once as the test set and everything outside it
    as the training set.

    The header and the ``classification_report`` of every fold are appended to
    ``result.txt``; the file is flushed after each fold.  Nothing is returned.
    """
    with open('result.txt', 'a') as result_file:
        # Everything is written to the file through print's ``file=`` argument.
        header = '\n\n\nClassiffier {0}, features is {1}'.format(a_classiffier[1], type_feats[8:])
        print(header, file=result_file)

        extractor = Doc_feat_extractor(all_feats, amount_feats, type_feats)
        total = len(content_set)
        fold_size = total // number_folds

        for fold in xrange(number_folds):
            lo = fold * fold_size
            hi = (fold + 1) * fold_size

            train_set = apply_features(extractor, content_set[:lo] + content_set[hi:])
            test_set = apply_features(extractor, content_set[lo:min(hi, total)])

            classifier = a_classiffier[0].train(train_set)

            y_true = [label for (_feats, label) in test_set]
            y_pred = classifier.batch_classify([feats for (feats, _label) in test_set])

            print('\n{0} attempt\n'.format(fold + 1),
                  classification_report(y_true, y_pred,),
                  file=result_file)
            result_file.flush()
def naive_bayes_gender_classifier():
    """Train and evaluate a Naive Bayes gender classifier on the names corpus.

    Names from ``male.txt`` and ``female.txt`` are labelled, shuffled, and
    split: the first 500 are the test set, the rest the training set.  Both
    sets are built with ``apply_features`` rather than as plain lists (the
    original author's note: advisable for large data sets).

    Prints the predicted label for "Neo" and "Trinity", the test-set accuracy,
    and the five most informative features.  Nothing is returned.
    """
    from nltk.corpus import names as names_corpus

    labeled = [(name, "male") for name in names_corpus.words("male.txt")]
    labeled = labeled + [(name, "female") for name in names_corpus.words("female.txt")]
    random.shuffle(labeled)

    holdout = 500
    train_set = apply_features(_gender_features, labeled[holdout:])
    test_set = apply_features(_gender_features, labeled[:holdout])

    classifier = nltk.NaiveBayesClassifier.train(train_set)

    neo_label = classifier.classify(_gender_features("Neo"))
    print("Neo is  %s" % (neo_label,))
    trinity_label = classifier.classify(_gender_features("Trinity"))
    print("Trinity is %s" % (trinity_label,))

    # Accuracy of the classifier on the held-out names.
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(5)
def analyzeErrors(the_names):
    """Train on a three-way split of ``the_names`` and list validation errors.

    ``the_names`` is a sequence of ``(name, gender)`` pairs, split by position:
    ``[:500]`` test, ``[500:1500]`` validation, ``[1500:]`` training.

    Prints validation and test accuracy, the misclassified validation names
    grouped by their true gender ('male' / 'female'), the error count, the 20
    most informative features, and finally the prediction for 'madeline'.
    Nothing is returned.
    """
    test_names = the_names[:500]
    validation_names = the_names[500:1500]
    train_names = the_names[1500:]

    train_set = apply_features(genderFeatures, train_names)
    validation_set = apply_features(genderFeatures, validation_names)
    test_set = apply_features(genderFeatures, test_names)

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print('validation accuracy = %.02f' % nltk.classify.accuracy(classifier, validation_set))
    print(' test accuracy = %.02f' % nltk.classify.accuracy(classifier, test_set))

    def getPrediction(name):
        return classifier.classify(genderFeatures(name))

    validation_results = [(n, g, getPrediction(n)) for (n, g) in validation_names]
    validation_results_incorrect = [row for row in validation_results if row[1] != row[2]]

    # Misclassified names, keyed by the gender they should have been.
    missed = {'male': [], 'female': []}
    for (n, g, _p) in validation_results_incorrect:
        if g in missed:
            missed[g].append(n)

    print('--- %d males classified incorrectly ----------------------------------------' % len(missed['male']))
    print(sorted(missed['male']))
    print('--- %d females classified incorrectly --------------------------------------' % len(missed['female']))
    print(sorted(missed['female']))
    print('%d incorrect in %d' % (len(validation_results_incorrect), len(validation_results)))

    classifier.show_most_informative_features(20)

    def showPrediction(name):
        print('%s is %s' % (name, getPrediction(name)))

    showPrediction('madeline')
def initTrainingSet(self):
    """Populate ``self.trainingSet`` from the labeled tweets.

    Calls ``getTweetText`` and ``getTerms`` first, then stores the result of
    ``apply_features(self.extractFeatures, self.labeledTweets)``, i.e. the
    labeled tweet strings processed with the ``extractFeatures`` method.
    Returns ``None``.
    """
    self.getTweetText()
    self.getTerms()

    extractor = self.extractFeatures
    labeled = self.labeledTweets
    self.trainingSet = apply_features(extractor, labeled)
def train(self, num_samples):
    """Train ``self.classifier`` on tweets labelled by their emoticon.

    Tweets matching ``:)`` are read through ``self.db`` and labelled
    'positive', tweets matching ``:(`` are labelled 'negative';
    ``num_samples`` is requested of each, so up to ``2 * num_samples`` tweets
    are used.  The shuffled tweets are split so that the last
    ``TEST_SET_PROPORTION`` share is held out for the accuracy report.

    Side effects: sets ``self.all_words`` (all words minus
    ``self.stop_words``) and ``self.classifier``; prints progress messages
    and the test-set accuracy.
    """
    print("Querying DB")
    # Read samples from the DB.
    positive_rows = self.db.get_N_results("\\:\\)", num_samples)
    negative_rows = self.db.get_N_results("\\:\\(", num_samples)
    print("Query Returned")

    # Pull the query results into memory as (word list, label) pairs.
    positive = [(row["text"].split(" "), 'positive') for row in positive_rows]
    negative = [(row["text"].split(" "), 'negative') for row in negative_rows]
    labeled_tweets = positive + negative
    random.shuffle(labeled_tweets)

    print("compiling all words list")
    # Every unique word in the tweets is a candidate feature ...
    vocabulary = self.get_all_words(labeled_tweets)
    self.all_words = vocabulary
    # ... except the stop words.
    self.all_words = self.all_words.difference(self.stop_words)
    print("num words: %d" % len(self.all_words))

    test_size = int(len(labeled_tweets) * TEST_SET_PROPORTION)
    split_at = len(labeled_tweets) - test_size

    # Original author's note: apply_features is a lazy loader, so features are
    # computed as needed instead of being held in memory all at once.
    train_set = apply_features(self.document_features, labeled_tweets[:split_at])
    test_set = apply_features(self.document_features, labeled_tweets[split_at:])

    print("training")
    self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    # Report accuracy on the held-out tweets.
    print("Accuracy on ")
    print(nltk.classify.accuracy(self.classifier, test_set))
def category_by_name():
    """Train a last-letter gender classifier and print its held-out accuracy.

    Names from the NLTK ``names`` corpus are labelled 'male' / 'female' and
    shuffled; the first 500 form the test set and the rest the training set.

    Prints the predicted label for 'Neo' followed by the accuracy on the
    held-out test set.  Nothing is returned.
    """
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    def gender_features(word):
        return {'last_letter': word[-1]}

    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])
    random.shuffle(labeled_names)

    train_set = apply_features(gender_features, labeled_names[500:])
    test_set = apply_features(gender_features, labeled_names[:500])

    classifier = NaiveBayesClassifier.train(train_set)
    print(classifier.classify(gender_features('Neo')))
    # Score on the held-out names.  The original scored on ``train_set``,
    # which measures fit to already-seen data and left ``test_set`` unused.
    print(classify.accuracy(classifier, test_set))
def main():
    """Train, pickle and evaluate a Naive Bayes tweet classifier.

    Tweets come from ``get_tweets_from_db()``.  Items ``[1000:1599000]`` are
    the training data; the first 1000 plus everything from index 1599000 on
    are the test data.  The global ``best_words`` is set from
    ``find_best_words(create_word_scores(), 500000)``.

    Files written: ``bestwords.pickle`` and ``NBclassifier_new.pickle``.
    Printed: progress messages, precision and recall for labels '0'
    (reported as "neg") and '4' (reported as "pos"), the test accuracy, and
    the 30 most informative features.
    """
    global best_words

    tweets = get_tweets_from_db()
    tweet_list = tweets[1000:1599000]
    test_list = tweets[:1000] + tweets[1599000:]

    word_scores = create_word_scores()
    best_words = find_best_words(word_scores, 500000)
    # ``with`` closes the file even when pickling fails.
    with open('bestwords.pickle', 'wb') as f:
        pickle.dump(best_words, f)

    training_set = classify.apply_features(best_word_features, tweet_list)
    print("extracted features")

    # Train the classifier with the training set.
    classifier = NaiveBayesClassifier.train(training_set)
    print("trained classifier")

    # Persist the trained classifier.
    with open('NBclassifier_new.pickle', 'wb') as f:
        pickle.dump(classifier, f)
    print("created pickle")

    # Precision / recall: collect, per label, the indices of the test items
    # that carry it (reference) and that were assigned it (observed).
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    test_set = classify.apply_features(best_word_features, test_list)
    for i, (feats, label) in enumerate(test_set):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('neg precision: %s' % (metrics.precision(refsets['0'], testsets['0']),))
    print('neg recall: %s' % (metrics.recall(refsets['0'], testsets['0']),))
    print('pos precision: %s' % (metrics.precision(refsets['4'], testsets['4']),))
    print('pos recall: %s' % (metrics.recall(refsets['4'], testsets['4']),))

    print(classify.accuracy(classifier, test_set))
    # show_most_informative_features prints its own table; wrapping it in
    # print (as the original did) only appended a stray "None".
    classifier.show_most_informative_features(30)
def get_train_set(self):
    """Build, store and return the training set.

    Ensures the feature sets exist (calling ``get_featuresets`` when
    ``self._featuresets`` is still ``None``), then assigns ``self._train_set``
    from ``apply_features`` over all of ``self._documentos``.

    NOTE(review): the original docstring was only a "to be documented"
    placeholder.
    """
    featuresets_missing = self._featuresets is None
    if featuresets_missing:
        self.get_featuresets()

    print("-- Recuperando train_set.")

    # Original author's note: to avoid filling up RAM, the documents are not
    # all stored in it at once.
    # (An earlier variant trained on self._documentos[100:] only.)
    documents = self._documentos
    self._train_set = apply_features(Documento.get_features, documents)
    return self._train_set
def classify(training_set, test_set):
    """Train a Naive Bayes classifier and print its accuracy on ``test_set``.

    Both inputs are passed through ``apply_features`` with
    ``paragraph_features``.  The module-level ``count`` is reset to 0 after
    training and before testing.  Nothing is returned.
    """
    global count

    train_features = apply_features(paragraph_features, training_set)
    test_features = apply_features(paragraph_features, test_set)

    print('\nTraining...')
    classifier = nltk.NaiveBayesClassifier.train(train_features)

    count = 0
    print('\nTesting...')
    results = nltk.classify.accuracy(classifier, test_features)
    print('\nAccuracy: %s' % (results,))
def __init__(self, golden_data, feature_extractor):
    """Train a Naive Bayes classifier on ``golden_data``.

    ``feature_extractor`` is stored on the instance and applied to
    ``golden_data`` through ``apply_features``; the trained model is kept in
    ``self.classifier``.  Prints "done training" when finished.
    """
    self.feature_extractor = feature_extractor
    labeled_features = apply_features(feature_extractor, golden_data)
    self.classifier = nltk.NaiveBayesClassifier.train(labeled_features)
    print("done training")
def get_featuresets(self):
    """Return the feature sets, building them on first use.

    Per the original documentation, feature sets are structured as
    ``(document_features, category)`` and the method returns a list of them.
    They are built with ``apply_features`` over ``self._documentos`` (loaded
    through ``get_documentos`` when still ``None``) and cached in
    ``self._featuresets``.
    """
    if self._featuresets is not None:
        return self._featuresets

    if self._documentos is None:
        self.get_documentos()

    print("-- Recuperando featuresets.")
    self._featuresets = apply_features(Documento.get_features, self._documentos)
    return self._featuresets
def get_words_in_tweets(tweet):
    """Return one flat list with the words of every ``(words, sentiment)`` pair."""
    return [word for (words, _sentiment) in tweet for word in words]


def get_words_features(wordlist):
    """Return the keys of a ``FreqDist`` built over ``wordlist``."""
    return FreqDist(wordlist).keys()


word_features = get_words_features(get_words_in_tweets(tweet))
# print word_features


def extract_features(document):
    """Map every known word to whether it occurs in ``document``.

    Returns a dict with one ``'contains(<word>)'`` boolean per entry of the
    module-level ``word_features``.
    """
    present = set(document)
    return dict(('contains(%s)' % word, word in present) for word in word_features)


training_set = classify.apply_features(extract_features, tweet)
# print training_set
classifier = NaiveBayesClassifier.train(training_set)
# print classifier.show_most_informative_features(32)

# Accuracy is measured on the data the classifier was trained on.
print(classify.accuracy(classifier, training_set))

t = 'Larry is not my friend'
print(classifier.classify(extract_features(t.split())))
def __init__(
    self,
    dir,
    testDir=None,
    doTest=True,
    ignoreKlass=None,
    includeKlass=None,
    extractor="ArticleExtractor",
    useHtml=False,
):
    """Build and train a Naive Bayes classifier over the reader's documents.

    Parameters
    ----------
    dir, testDir : forwarded unchanged to ``RssDataReader.__init__``.
    doTest : when true, classify the test documents, log every wrong answer,
        the confusion matrix and the most informative features.
    ignoreKlass, includeKlass : forwarded to ``self.klasses``.  ``None`` for
        ``ignoreKlass`` means "ignore nothing" (an empty list is passed on).
    extractor : extractor name handed to ``Extractor`` when ``useHtml`` is set.
    useHtml : when true, documents are HTML and are converted to text with
        ``Extractor(...).getText()`` before use.

    A document that raises while being processed is logged together with its
    URL and skipped; it does not abort construction.

    NOTE(review): the statement nesting was reconstructed from a source whose
    line breaks were lost.  In particular ``features.add`` is assumed to be
    inside the token filter and ``show_most_informative_features`` inside the
    ``doTest`` branch -- confirm against the original file.
    """
    if ignoreKlass is None:
        # A fresh list per call instead of one shared mutable default.
        ignoreKlass = []

    RssDataReader.__init__(self, dir, testDir)
    logger.info("Start building " + self.__class__.__name__)
    self.__mutex = threading.Semaphore()

    def _plainText(document):
        # Documents are used as-is unless they are HTML.
        if not useHtml:
            return document
        return Extractor(extractor=extractor, html=document).getText()

    freqDists = {}
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    ignore = set(stopwords.words("english"))
    features = set()
    klassSize = {}
    documentsWithLabel = []

    for klassId in self.klasses(ignoreKlass, includeKlass):
        freqDist = FreqDist()
        size = 0
        for url, document in self.documents(klassId, useHtml):
            try:
                txt = _plainText(document)
                documentsWithLabel.append((txt, klassId))
                size += 1
                for part in tokenize(txt):
                    if part.isalnum() and part not in ignore:
                        freqDist.inc(part)
                        features.add(part)
            except Exception:
                # Not a bare ``except``: KeyboardInterrupt / SystemExit must
                # still be able to stop the build.
                logger.exception(u"Url: " + url)
        freqDists[klassId] = freqDist
        klassSize[klassId] = size

    random.shuffle(documentsWithLabel)

    self.__featuresGenerator = FeatureGenerator(freqDists, features, klassSize)
    trainset = apply_features(self.__featuresGenerator, documentsWithLabel)
    self.__classifier = NaiveBayesClassifier.train(trainset)
    logger.info(u"Classifier learned (set size=" + unicode(len(trainset)) + u")")

    if doTest:
        ref = []
        test = []
        testDocumentsWithLabel = [
            (_plainText(document), correctKlass, url)
            for correctKlass in self.klasses(ignoreKlass, includeKlass)
            for url, document in self._testDocuments(correctKlass, useHtml)
        ]
        for doc, cat, url in testDocumentsWithLabel:
            ans = self.__classifier.classify(self.__featuresGenerator(doc))
            ref.append(cat)
            test.append(ans)
            if ans != cat:
                logger.info(u"Wrong " + ans + u"(" + cat + u"):\t" + url + u" " + doc.replace("\n", " "))

        logger.info("\n" + ConfusionMatrix(ref, test).pp())
        self.__classifier.show_most_informative_features(n=300)
] return msg_words def get_features(msg): features = {} for w in get_msg_words(msg, stopwords): features[w] = True return features training = [] for file in glob.glob("data/lissa_train/*"): training.append((open(file, "r").read(), "lissa")) for file in glob.glob("data/random_train/*"): training.append((open(file, "r").read(), "not_lissa")) testing = [] for file in glob.glob("data/lissa_test/*"): testing.append((open(file, "r").read(), "lissa")) for file in glob.glob("data/random_test/*"): testing.append((open(file, "r").read(), "not_lissa")) train_set = apply_features(get_features, training) test_set = apply_features(get_features, testing) cl = NaiveBayesClassifier.train(train_set) print cl.show_most_informative_features(100) print nltk.classify.accuracy(cl, test_set)
# Eager variant: every feature dict is built up front.
featuresets = [(gender_features_better(n), g) for (n, g) in names]
train_set = featuresets[2500:]
test_set = featuresets[:2500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.classify(gender_features_better('Neo'))
classifier.classify(gender_features_better('Trinity'))

# pg. 224
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

# Same split, built through apply_features instead of a list.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[2500:])
test_set = apply_features(gender_features, names[:2500])


# pg. 225
def gender_features2(name):
    """Return first/last letter plus per-letter count and presence features.

    For every letter a-z the dict holds ``count(<letter>)`` (occurrences in
    the lower-cased name) and ``has(<letter>)`` (whether it occurs at all).
    """
    features = {
        "firstletter": name[0].lower(),
        "lastletter": name[-1].lower(),
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = lowered.count(letter)
        features["has(%s)" % letter] = (letter in lowered)
    return features


gender_features2('John')
import random import mongo_db from nltk.classify import apply_features import re from matplotlib import pyplot as plt def gender_features(word): return {'suffix1': word[-1:], 'suffix2': word[-2:]} names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) user_names = [user['id'] for user in mongo_db.load_from_mongo("project3", "reciprocal_user_features")] random.shuffle(names) featuresets = [(gender_features(n), g) for (n,g) in names] train_set = apply_features(gender_features, names) test_set2 = apply_features(gender_features, names[:500]) classifier = nltk.NaiveBayesClassifier.train(train_set) print nltk.classify.accuracy(classifier, test_set2) count = 0 for username in user_names: if username.startswith('San') or username.startswith('Francisco') or username.startswith('SF') or username.endswith('Francisco') or username.endswith('SF') : count +=1 user_names.remove(username) print count re.sub(ur'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', u'', username) for username in user_names:
# word_features = get_words_in_tweets(training)

# Derive the word features from the stored training data and persist them.
training = joblib.load('models/training_compressed.pkl')
word_features = get_word_features(get_words_in_tweets(training))
joblib.dump(word_features, 'models/word_features_compressed.pkl', 3)
# print word_features

# Reload both artefacts from disk.
training = joblib.load('models/training_compressed.pkl')
word_features = joblib.load('models/word_features_compressed.pkl')


def extract_features(document):
    """Report, for every word in ``word_features``, whether ``document`` has it.

    ``document`` is a list of words.  The result is a dict with one
    ``'contains(<word>)'`` boolean per entry of the module-level
    ``word_features``.
    """
    present = set(document)
    return dict(('contains(%s)' % word, word in present) for word in word_features)


# print extract_features(training[0][0])
training_set = classify.apply_features(extract_features, training)
# print training_set

classifier = NaiveBayesClassifier.train(training_set)
joblib.dump(classifier, 'models/classifier_compressed.pkl', 3)
def getAccuracy(self, documents):
    """Return the accuracy of ``self.classifier`` on ``documents``.

    ``documents`` is copied into a list and passed through
    ``apply_features`` with ``Classifier.extractFeatures``; the result is
    scored with ``nltk.classify.util.accuracy``.
    """
    from nltk.classify import apply_features
    from nltk.classify.util import accuracy

    labeled = list(documents)
    return accuracy(self.classifier, apply_features(Classifier.extractFeatures, labeled))
import random

import nltk
from nltk.classify import apply_features


def gender_features(word):
    """Return the single feature used for classification: the last letter."""
    return {'last_letter': word[-1]}


def _read_names(path):
    """Return the stripped, non-empty lines of the text file at ``path``.

    NOTE(review): assumes one name per line -- confirm the file layout.
    """
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [name for name in stripped if name]


# Raw strings: in a normal literal ``\f`` (as in "\female.txt") is a form-feed
# character, so the female path did not even name the intended file.
_MALE_NAMES_PATH = r'D:\Project_files_notes\Gender-Identifier\male.txt'
_FEMALE_NAMES_PATH = r'D:\Project_files_notes\Gender-Identifier\female.txt'

# The original iterated over the path *strings*, which labelled each character
# of the path as a "name"; the names have to be read from the files.
names = ([(name, 'male') for name in _read_names(_MALE_NAMES_PATH)] +
         [(name, 'female') for name in _read_names(_FEMALE_NAMES_PATH)])
random.shuffle(names)

featuresets = [(gender_features(n), g) for (n, g) in names]

# First 85 names for testing, everything else for training.  The original
# slices ([86:] / [:85]) left the name at index 85 in neither set.
train_set = apply_features(gender_features, names[85:])
test_set = apply_features(gender_features, names[:85])

classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(gender_features('ravi')))
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
def gender_features(word):
    """Return the last letter of ``word`` as the only feature."""
    return {'last_letter': word[-1]}


from nltk.corpus import names
names = ([(name, 'male') for name in names.words('male.txt')]
         + [(name, 'female') for name in names.words('female.txt')])

from sklearn.model_selection import train_test_split
feature_sets = [(gender_features(name), gender) for name, gender in names]
train_set, test_set = train_test_split(feature_sets, test_size=0.33)

clf = nltk.NaiveBayesClassifier.train(train_set)
clf.classify(gender_features("Neo"))
print(nltk.classify.accuracy(clf, test_set))
clf.show_most_informative_features(5)

# From here on train_set is the apply_features variant over the first 500 names.
from nltk.classify import apply_features
train_set = apply_features(gender_features, names[:500])

from nltk.corpus import movie_reviews
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
words_features = list(all_words.keys())[:2000]


def document_features(document):
    """Return a ``'contains(<word>)'`` boolean for every entry of ``words_features``."""
    present = set(document)
    return {'contains(%s)' % word: (word in present) for word in words_features}


print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

from nltk.corpus import movie_reviews
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
featuresets = [(document_features(d), c) for (d, c) in documents]
x_Neo = classifier.classify(gender_features('Neo')) print('x_Neo: ', x_Neo) x_Trinity = classifier.classify(gender_features('Trinity')) print('x_Trinity: ', x_Trinity) # 系统评估分类器 ratio = nltk.classify.accuracy(classifier, X_test) print('准确率: ', ratio) # 最有效特征 classifier.show_most_informative_features(5) from nltk.classify import apply_features train_set = apply_features(gender_features, names[500:]) print('train_set: ', train_set) print('X_train: ', X_train) # 2.选择正确的特征 # 特征提取器过拟合性别特征 print('\n特征提取器过拟合性别特征:') def gender_features2(name): features = {} features['firstletter'] = name[0].lower() features['lastletter'] = name[-1].lower() for letter in 'abcdefghijklmnopqrstuvwxyz': features['count(%s)' % letter] = name.lower().count(letter) features['has(%s)' % letter] = (letter in name.lower())
def classifyTestData(filename, classifier):
    """Return the accuracy of ``classifier`` on the labelled data in ``filename``.

    The data is loaded with ``buildLabelData`` and converted with
    ``apply_features`` using ``charNgramfeatureDict``.
    """
    labeled = buildLabelData(filename)
    return nltk.classify.accuracy(classifier, apply_features(charNgramfeatureDict, labeled))
import nltk
# nltk.download()
from nltk.corpus import names
import random
from nltk.classify import apply_features


def gender_features(word):
    """Return the last letter of ``word`` as the only feature."""
    return {'last_letter': word[-1]}


names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

# Eager variant: all feature dicts are built up front.
featureset = [(gender_features(n), g) for (n, g) in names]
train_set, test_set = featureset[500:], featureset[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# apply_features takes the extractor itself plus the raw (name, label) pairs.
# The original passed ``gender_features()`` -- a call without its required
# argument, i.e. a TypeError -- together with the already-extracted features.
train_set = apply_features(gender_features, names[500:])
text += sh.cell(i, 8).value if sh.cell(i, 9).value: text += sh.cell(i, 9).value if not text or type(text) is float: continue else: text = ''.join(ch for ch in text if ch not in exclude) if category: categorized.append((text, category)) else: uncategorized.append((text, category)) # Shuffle the categorized documents so that the order in which they are in the excel file does not affect the results random.shuffle(categorized) all_words = [] for d in categorized: all_words += d[0].split() # Take the 2000 most relevant words all_words = nltk.FreqDist(w.lower() for w in all_words if (not w in stopwords.words('dutch') and len(w) > 1)) word_features = all_words.keys()[:2000] # Take the 500 first registers for testing and the rest for training test_set = apply_features(document_features, categorized[:500]) train_set = apply_features(document_features, categorized[500:]) # Train the classifier classifier = nltk.NaiveBayesClassifier.train(train_set) print (nltk.classify.accuracy(classifier, test_set) * 100)
print(classifier.classify(gender_features('Neo'))) print(classifier.classify(gender_features('Trinity'))) print("-" * 40) print(nltk.classify.accuracy(classifier, test_set)) print("-" * 40) classifier.show_most_informative_features(5) print("-" * 40) def gender_features(word): return {'last_letter': word[-1], 'length': len(word), 'first_letter': word[0]} from nltk.classify import apply_features train_set = apply_features(gender_features, labeled_names[500:]) test_set = apply_features(gender_features, labeled_names[:500]) classifier = nltk.NaiveBayesClassifier.train(train_set) print(nltk.classify.accuracy(classifier, test_set)) print("-" * 40) print(""" ---------------------------------------------------------------------- 1.2 Choosing The Right Features ---------------------------------------------------------------------- """) def gender_features2(name): features = {}
def Naive_Bayes_classify(input_df, feature_column, label_column,
                         process_text=False, test_size=.2, random_state=1,
                         feature_fct=None, most_informative_features=10,
                         show_info=True, return_fscore=True):
    """Train an NLTK Naive Bayes classifier on a data-frame column and score it.

    Parameters
    ----------
    input_df : data frame holding the data; it is copied, not modified.
    feature_column : column with the text to classify.
    label_column : column with the gold labels.
    process_text : when true the text is preprocessed with ``process_sow``
        (via ``progress_apply``), otherwise with ``process_sow_quick``.
    test_size, random_state : forwarded to ``train_test_split``.
    feature_fct : optional callable mapping one text to a feature dict.  When
        omitted, a default extractor is used: the token count plus the three
        most frequent tokens.
    most_informative_features : how many features to list when ``show_info``.
    show_info : print the scores and the most informative features, and draw
        the confusion-matrix heat map.
    return_fscore : return ``(f, cf)`` from ``single_label_f_score``;
        otherwise ``None`` is returned.

    NOTE(review): the source had lost its line breaks; the heat map is assumed
    to belong to the ``show_info`` branch -- confirm.
    """
    df = input_df.copy()
    if process_text:
        df[feature_column] = df[feature_column].progress_apply(process_sow)
    else:
        df[feature_column] = df[feature_column].apply(process_sow_quick)

    labels = list(df[label_column])
    data_set = list(zip(df[feature_column], labels))

    if feature_fct:
        get_features = feature_fct
    else:
        def get_features(text):
            tokens = text.split(' ')
            top_words = [word for (word, _count) in nltk.FreqDist(tokens).most_common(3)]
            # Texts with fewer than three distinct tokens used to raise
            # IndexError; the missing ranks are filled with ''.
            top_words += [''] * (3 - len(top_words))
            return {
                "sow length": len(tokens),
                "1st word": top_words[0],
                "2nd word": top_words[1],
                "3rd word": top_words[2],
            }

    X_train, X_test = train_test_split(data_set, test_size=test_size,
                                       random_state=random_state)
    train_set = apply_features(get_features, X_train)
    test_set = apply_features(get_features, X_test)
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # One pass over the test set: each item's features are extracted once.
    y_pred = []
    y_test = []
    for (feats, gold) in test_set:
        y_pred.append(classifier.classify(feats))
        y_test.append(gold)

    f, cf = single_label_f_score(y_gold=y_test, y_pred=y_pred)

    if show_info:
        print('f-score:', f)
        print('label wise f-score', cf)
        # Prints its own table; wrapping it in print only added "None".
        classifier.show_most_informative_features(most_informative_features)

        # Fix the label order explicitly and use it for BOTH the matrix and
        # the tick labels.  The original labelled the axes from an unordered
        # set of all labels, which need not match the matrix rows/columns.
        label_order = sorted(set(y_test) | set(y_pred))
        conf_mat = confusion_matrix(y_test, y_pred, labels=label_order)
        fig, ax = plt.subplots(figsize=(4, 4))
        sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
                    xticklabels=label_order, yticklabels=label_order)
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title("NaiveBayes CONFUSION MATRIX", size=16)

    if return_fscore:
        return f, cf
for fileid in movie_reviews.fileids(category)] random.shuffle(documents) all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) word_features = list(all_words)[:2000] def document_features(document): document_words = set(document) features = {} for word in word_features: features["contains({})".format(word)] = (word in document_words) return features def document_features2(document): words_counter = Counter(document) features = {} for word in word_features: features["count({})".format(word)] = words_counter[word] return features length = len(documents) test_size = int(0.2 * length) train_set = apply_features(document_features2, documents[test_size:]) test_set = apply_features(document_features2, documents[:test_size]) classifier = nltk.NaiveBayesClassifier.train(train_set) print nltk.classify.accuracy(classifier, test_set) classifier.show_most_informative_features(5)
word_features = all_words.keys()[:2000]

# Get the top synsets in the document from the top 2000 words
synset_features = synsets(word_features)


def document_features(document):
    """Return, for every entry of ``synset_features``, whether ``document`` has it.

    The document's synsets are ``synsets(<its distinct words>)`` extended with
    ``str(s)`` for every WordNet synset ``s`` of every distinct word.
    """
    distinct_words = set(document)
    found = synsets(distinct_words)
    for word in distinct_words:
        found.update(str(s) for s in wordnet.synsets(word))
    return dict((synset, synset in found) for synset in synset_features)


# First 100 documents are held out for testing.
train_set = apply_features(document_features, documents[100:])
test_set = apply_features(document_features, documents[:100])

print('training classifier')
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(10)
def trainNaivebayes(trainFile):
    """Return a Naive Bayes classifier trained on the labelled data in ``trainFile``.

    The data is loaded with ``buildLabelData`` and converted with
    ``apply_features`` using ``charNgramfeatureDict``.
    """
    labeled = buildLabelData(trainFile)
    return nltk.NaiveBayesClassifier.train(apply_features(charNgramfeatureDict, labeled))
def _getPredictedAndLabeled(self, documents):
    """Return ``(predicted, labeled)`` arrays for ``documents``.

    ``documents`` yields ``(item, label)`` pairs.  Features are produced with
    ``apply_features`` and ``Classifier.extractFeatures``; predictions come
    from ``self.classifier.batch_classify``.  The second array holds the
    labels taken directly from ``documents``.
    """
    from nltk.classify import apply_features

    documents = list(documents)
    testSet = apply_features(Classifier.extractFeatures, list(documents))

    feature_dicts = [feats for (feats, _label) in testSet]
    predicted = array(self.classifier.batch_classify(feature_dicts))
    labeled = array([label for (_feats, label) in documents])
    return (predicted, labeled)
# 'is_female'
print(nltk.classify.accuracy(classifier, test_set))  # 0.758
classifier.show_most_informative_features(5)
# Most Informative Features
#   last_letter = 'a'    female : male = 38.3 : 1.0
#   last_letter = 'k'    male : female = 31.4 : 1.0
#   last_letter = 'f'    male : female = 15.3 : 1.0
#   last_letter = 'p'    male : female = 10.6 : 1.0
#   last_letter = 'w'    male : female = 10.6 : 1.0

# Same experiment again, with both sets built through apply_features.
from nltk.classify import apply_features
train_set = apply_features(g_f, names[500:])
test_set = apply_features(g_f, names[:500])
classifier = nltk.NaiveBayesClassifier.train(train_set)

neo_label = classifier.classify(g_f('Neo'))
print(neo_label)  # 'is_male'
trinity_label = classifier.classify(g_f('Trinity'))
print(trinity_label)  # 'is_female'

print(nltk.classify.accuracy(classifier, test_set))  # 0.758
classifier.show_most_informative_features(5)
# Most Informative Features
#   last_letter = 'a'    female : male = 38.3 : 1.0
#   last_letter = 'k'    male : female = 31.4 : 1.0
#   last_letter = 'f'    male : female = 15.3 : 1.0
def trainClassifier(self, documents):
    """Train ``self.nltkClassifier`` on ``documents`` and keep the model.

    ``documents`` is copied into a list and converted with
    ``apply_features`` using ``Classifier.extractFeatures``; the trained
    model is stored in ``self.classifier``.
    """
    from nltk.classify import apply_features

    labeled = list(documents)
    self.classifier = self.nltkClassifier.train(
        apply_features(Classifier.extractFeatures, labeled))
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
# Shuffle the order.
random.shuffle(labeled_names)

# Baseline: gender_features, first 500 names held out for testing.
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classfier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classfier, test_set))

# Three-way split for error analysis.  The dev-test slice now runs up to the
# start of the training slice; the original [500:1000] left names
# [1000:1500] in none of the three sets.
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

train_set1 = apply_features(gender_features2, train_names)
devtest_set = apply_features(gender_features2, devtest_names)
test_set1 = apply_features(gender_features2, test_names)

classfier1 = nltk.NaiveBayesClassifier.train(train_set1)
print(nltk.classify.accuracy(classfier1, devtest_set))

# Collect every dev-test name the classifier got wrong.
error = []
for (name, tag) in devtest_names:
    guess = classfier1.classify(gender_features2(name))
    if guess != tag:
        error.append((tag, guess, name))

for (tag, guess, name) in sorted(error):
    print('correct = {:<8} guess = {:<8s} name={:<30}'.format(tag, guess, name))
features = {} features["first_letter"] = name[0].lower() features["last_letter"] = name[-1].lower() for letter in 'abcdefghijklmnopqrstuvwxyz': features["count({})".format(letter)] = name.lower().count(letter) features["has({})".format(letter)] = (letter in name.lower()) return features labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) random.shuffle(labeled_names) # Just last letter gave acc of 0.752 # featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names] train_set = apply_features(gender_features, labeled_names[500:]) test_set = apply_features(gender_features, labeled_names[:500]) classifier = nltk.NaiveBayesClassifier.train(train_set) # print(classifier.classify(gender_features('Neo'))) # print(classifier.classify(gender_features('Trinity'))) # print(classifier.show_most_informative_features(5)) train_names = labeled_names[1500:] devtest_names = labeled_names[500:1500] test_names = labeled_names[:500] # def gender_features(word): # return {'suffix1': word[-1:], 'suffix2': word[-2:]} train_set = [(gender_features(n), gender) for (n, gender) in train_names] devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]