# Gender classification from name suffixes with a Naive Bayes classifier.
# Fixed from the original: Python-2 print statements converted to Python 3,
# and the missing 'random', 'names', and NaiveBayesClassifier imports added.
# NLTK is imported inside the main guard so the module stays importable
# (e.g. for testing gender_features) even where NLTK is not installed.


def gender_features(word, num_letters=2):
    """Return a feature dict: the lowercased last *num_letters* of *word*."""
    return {'feature': word[-num_letters:].lower()}


if __name__ == '__main__':
    import random

    from nltk import NaiveBayesClassifier
    from nltk.classify import accuracy as nltk_accuracy
    from nltk.corpus import names

    # Labeled examples come from NLTK's bundled male/female name lists.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    # Fixed seed keeps the shuffle (and thus the train/test split) reproducible.
    random.seed(7)
    random.shuffle(labeled_names)

    input_names = ['Leonardo', 'Amy', 'Sam', 'Rock']

    # Sweep the suffix length to see how it affects accuracy.
    for i in range(1, 5):
        print('\n Number of letters :', i)
        featuresets = [(gender_features(n, i), gender)
                       for (n, gender) in labeled_names]
        # First 500 shuffled examples are held out for testing.
        train_set, test_set = featuresets[500:], featuresets[:500]
        classifier = NaiveBayesClassifier.train(train_set)

        print('Accuracy ==> ',
              str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # test the input data
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))
# Gender classification from the last N letters of a name (Naive Bayes).
# BUG FIX: predictions previously always used a 1-letter suffix
# (extract_features(name, 1)); they now use the current sweep length i so the
# prediction features match the features the classifier was trained on.
# NLTK is imported inside the main guard so the module stays importable
# (e.g. for testing extract_features) even where NLTK is not installed.


def extract_features(word, N=2):
    """Return a feature dict: the lowercased last *N* letters of *word*."""
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}


if __name__ == '__main__':
    import random

    from nltk import NaiveBayesClassifier
    from nltk.classify import accuracy as nltk_accuracy
    from nltk.corpus import names

    # Training data: labeled names already available in the NLTK corpus.
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = male_list + female_list

    # Fixed seed keeps the shuffle reproducible between runs.
    random.seed(5)
    random.shuffle(data)

    # Data to be tested on.
    namesInput = ['rajesh', 'gaurav', 'swati', 'shubha']

    # 80% of the data trains the model; the remaining 20% tests it.
    train_sample = int(0.8 * len(data))

    for i in range(1, 6):
        print("\n number of end letters:", i)
        # Feature extraction for each (name, gender) pair.
        features = [(extract_features(n, i), gender) for (n, gender) in data]
        train_data, test_data = features[:train_sample], features[train_sample:]
        classifier = NaiveBayesClassifier.train(train_data)
        accuracy_classifier = round(100 * nltk_accuracy(classifier, test_data), 2)
        print('accuracy=' + str(accuracy_classifier) + '%')

        # Classify each input name with the same suffix length used to train.
        for name in namesInput:
            print(name, '==>', classifier.classify(extract_features(name, i)))
if __name__ == '__main__':
    # Gather the labeled names from the NLTK corpus files.
    labeled_names = [(n, 'male') for n in names.words('male.txt')]
    labeled_names += [(n, 'female') for n in names.words('female.txt')]

    # Seed the RNG so the shuffled ordering (and the split) is reproducible.
    random.seed(7)
    random.shuffle(labeled_names)

    # Sample names to classify once the model is trained.
    input_names = ['Leonardo', 'Amy', 'Sam', 'Werner']

    # Sweep over suffix lengths 1..4.
    for i in range(1, 5):
        print("取参数为{}".format(i))
        featuresets = [(gender_features(name, i), gender)
                       for name, gender in labeled_names]

        # Hold out the first 500 shuffled examples for testing.
        test_set = featuresets[:500]
        train_set = featuresets[500:]

        # Train the Naive Bayes model and report its held-out accuracy.
        classifier = NaiveBayesClassifier.train(train_set)
        print(u"准确性:{}%".format(100 * nltk_accuracy(classifier, test_set)))

        # Predict a gender for each sample name.
        for name in input_names:
            print("{} ==> {}".format(
                name, classifier.classify(gender_features(name, i))))
# Build and train a Naive Bayes sentiment model on the prepared feature set.
# Assumes 'np' (numpy) and 'dataset' are defined earlier in the file.
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy

# Shuffle in place, then split 80% train / 20% test.
np.random.shuffle(dataset)
rows = int(len(dataset) * 0.8)
train_set, test_set = dataset[:rows], dataset[rows:]
# BUG FIX: the original printed a literal '/n'; use the '\n' newline escape.
print('Num of train_set: ', len(train_set),
      '\nNum of test_set: ', len(test_set))

clf = NaiveBayesClassifier.train(train_set)

# Evaluate the model on the held-out test set.
acc = nltk_accuracy(clf, test_set)
print('Accuracy: {:.2f}%'.format(acc * 100))

# New sentences to classify as positive or negative sentiment.
new_samples = [
    "It is an amazing movie",
    "This is a dull movie. I would never recommend it to anyone.",
    "The cinematography is pretty great in this movie",
    "The direction was terrible and the story was all over the place"
]
# Gender classification from name suffixes using NLTK's Naive Bayes classifier.
# FIX: the original called random.seed/random.shuffle without importing random.
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


def extract_features(word, N=2):
    """Return a feature dict: the lowercased last *N* letters of *word*."""
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}


# Labeled training data from NLTK's bundled male/female name lists.
male_list = [(name, 'male') for name in names.words('male.txt')]
female_list = [(name, 'female') for name in names.words('female.txt')]
data = male_list + female_list

# 80/20 train/test split boundary over the (shuffled) data.
num_train = int(0.8 * len(data))

# Fixed seed keeps the shuffle reproducible between runs.
random.seed(5)
random.shuffle(data)

# Sweep suffix lengths 1..5 and report accuracy for each.
for i in range(1, 6):
    print('\nNumber of end letters:', i)
    features = [(extract_features(n, i), gender) for (n, gender) in data]
    train_data, test_data = features[:num_train], features[num_train:]
    classifier = NaiveBayesClassifier.train(train_data)
    accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
    print('Accuracy = ' + str(accuracy) + '%')

    # Classify sample names with the same suffix length used for training.
    input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']
    for name in input_names:
        print(name, '==>', classifier.classify(extract_features(name, i)))
# coding: utf-8
# Spam/ham message classification, cleaned up from a notebook history export:
# duplicate cells, misspelled names ('mesage_corpus', 'soam.csv', 'rint'),
# no-op bare expressions, and the IPython '%save' magic (which crashes outside
# IPython) have been removed, keeping one coherent pipeline.
import random

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import PlaintextCorpusReader

import input_data as datain

# Regenerate the CSV inputs before reading them.
datain.out2csv()

# Read every token from the spam and ham CSV files.
message_corpus = PlaintextCorpusReader('./', ['spam_data.csv', 'ham_data.csv'])
all_message = message_corpus.words()


def massage_feature(word, num_letter=1):
    """Return a feature dict: the trailing *num_letter* characters of *word*."""
    return {'feature': word[-num_letter:]}


# Label each token by the file it came from: '垃圾' = spam, '正常' = ham.
labels_name = ([(massage, '垃圾') for massage in message_corpus.words('spam_data.csv')] +
               [(massage, '正常') for massage in message_corpus.words('ham_data.csv')])

# Fixed seed keeps the shuffle reproducible between runs.
random.seed(7)
random.shuffle(labels_name)

# First 2000 shuffled examples are held out for testing.
featuresets = [(massage_feature(n), massage) for (n, massage) in labels_name]
train_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = NaiveBayesClassifier.train(train_set)
print('结果准确率:', str(100 * nltk_accuracy(classifier, test_set)) + str('%'))
if __name__ == "__main__":
    # Collect the labeled names from both corpus files.
    male = [(n, 'male') for n in names.words('male.txt')]
    female = [(n, 'female') for n in names.words('female.txt')]
    labeled_names = male + female

    # Deterministic shuffle so the train/test split is reproducible.
    random.seed(7)
    random.shuffle(labeled_names)

    input_names = ['Leonardo', 'Amy', 'Sam']

    # Try suffix lengths 1 through 4.
    for i in range(1, 5):
        print('Number of letters: ', i)
        featuresets = [(gender_features(name, i), gender)
                       for name, gender in labeled_names]

        # First 500 examples serve as the test set; the rest train the model.
        test_set = featuresets[:500]
        train_set = featuresets[500:]

        # Fit the Naive Bayes classifier and report held-out accuracy.
        classifier = NaiveBayesClassifier.train(train_set)
        print('Accuracy==>',
              str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # Predict a gender for each new input name.
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))
# NOTE(review): this chunk began with a bare 'return' — the enclosing 'def'
# line had been cut off. The header is restored from the byte-identical
# helper body used by the sibling scripts in this file
# (gender_features(word, num_letters=2)).
def gender_features(word, num_letters=2):
    """Return a feature dict: the lowercased last *num_letters* of *word*."""
    return {'feature': word[-num_letters:].lower()}


if __name__ == '__main__':
    # Labeled names from NLTK's bundled male/female lists.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    # Seed so the shuffle (and the resulting split) is reproducible.
    random.seed(7)
    random.shuffle(labeled_names)

    # Names to classify after training.
    input_names = ['Leonardo', 'Amy', 'Sam', 'Werner']

    # Sweep suffix lengths 1..4.
    for i in range(1, 5):
        print("取参数为{}".format(i))
        featuresets = [(gender_features(n, i), gender)
                       for (n, gender) in labeled_names]
        # First 500 shuffled examples are the test set.
        train_set, test_set = featuresets[500:], featuresets[:500]
        classifier = NaiveBayesClassifier.train(train_set)
        print(u"准确性:{}%".format(100 * nltk_accuracy(classifier, test_set)))

        # Predict a gender for each sample name.
        for name in input_names:
            print("{} ==> {}".format(
                name, classifier.classify(gender_features(name, i))))
# Persist the normal-message tokens to CSV so the corpus reader can load them.
dataframe = pd.DataFrame({'normal': normallist})
dataframe.to_csv('normal.csv', encoding='utf_8_sig', header=False, index=False)

# Load both token files as a plaintext corpus; all tokens as one list.
message_corpus = PlaintextCorpusReader('./', ['spam.csv', 'normal.csv'])
all_message = message_corpus.words()


def massage_feature(word, num_letter=1):
    """Feature-ize a token as its trailing *num_letter* characters."""
    return {'feature': word[-num_letter:]}


# Attach a label to every token: '垃圾' (spam) or '正常' (normal).
spam_tokens = [(massage, '垃圾') for massage in message_corpus.words('spam.csv')]
normal_tokens = [(massage, '正常') for massage in message_corpus.words('normal.csv')]
labels_name = spam_tokens + normal_tokens

# Deterministic shuffle before splitting.
random.seed(7)
random.shuffle(labels_name)

# Of the 2000 examples, the first 400 shuffled ones test, the rest train.
featuresets = [(massage_feature(token), label) for (token, label) in labels_name]
test_set = featuresets[:400]
train_set = featuresets[400:]

# Fit NLTK's Naive Bayes classifier and report held-out accuracy.
classifier = NaiveBayesClassifier.train(train_set)
print('结果准确率:', str(100 * nltk_accuracy(classifier, test_set)) + str('%'))
# Gender classification from name suffixes using NLTK's Naive Bayes classifier.
# Fixed from the original: Python-2 print statements converted to Python 3,
# and the missing 'random' and 'names' imports added. NLTK is imported inside
# the main guard so the module stays importable (e.g. for testing
# gender_features) even where NLTK is not installed.


# Extract features from the input word
def gender_features(word, num_letters=2):
    """Return a feature dict: the lowercased last *num_letters* of *word*."""
    return {'feature': word[-num_letters:].lower()}


if __name__ == '__main__':
    import random

    from nltk import NaiveBayesClassifier
    from nltk.classify import accuracy as nltk_accuracy
    from nltk.corpus import names

    # Extract labeled names from the corpus files.
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    # Fixed seed keeps the shuffle (and thus the split) reproducible.
    random.seed(7)
    random.shuffle(labeled_names)

    input_names = ['Leonardo', 'Amy', 'Sam']

    # Sweeping the parameter space: suffix lengths 1..4.
    for i in range(1, 5):
        print('\nNumber of letters:', i)
        featuresets = [(gender_features(n, i), gender)
                       for (n, gender) in labeled_names]
        # First 500 shuffled examples are held out as the test set.
        train_set, test_set = featuresets[500:], featuresets[:500]
        classifier = NaiveBayesClassifier.train(train_set)

        # Print classifier accuracy
        print('Accuracy ==>',
              str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # Predict outputs for new inputs
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))