def score(classifier):
    """Train *classifier* on the global ``train`` set and evaluate it on ``dev``.

    The scikit-learn estimator is wrapped in nltk's adapter, trained, and used
    to predict labels for the development set.

    Returns the (precision, recall, fscore, support) tuple from
    ``precision_recall_fscore_support`` computed against ``tag_dev``.
    """
    classifier = nltk.SklearnClassifier(classifier)  # scikit-learn interface in nltk
    classifier.train(train)  # fit the classifier on the training data
    # BUG FIX: the original immediately reloaded a stale model from disk
    # (joblib.load('model.m')), discarding the classifier it had just trained.
    # Score the freshly trained classifier; persist it explicitly if desired:
    # joblib.dump(classifier, 'model.m')
    pred = classifier.classify_many(dev)  # predicted labels for the dev set
    return precision_recall_fscore_support(tag_dev, pred)
def get_classifier(classifier, train, mealgorithm='GIS', sklalgorithm='bernoulli'):
    """Build and train a classifier selected by name.

    :param classifier: one of 'NaiveBayes', 'DecisionTree', 'Maxent',
        'Sklearn', 'SVM'.
    :param train: training feature set (list of (features, label) pairs).
    :param mealgorithm: training algorithm for the Maxent classifier.
    :param sklalgorithm: which scikit-learn estimator to wrap
        ('bernoulli' or 'svc').
    :return: the trained classifier, or None for unrecognised /
        unimplemented selections.
    """
    if classifier == 'NaiveBayes':
        return nltk.NaiveBayesClassifier.train(train)
    elif classifier == 'DecisionTree':
        return nltk.DecisionTreeClassifier.train(train)
    elif classifier == 'Maxent':
        # BUG FIX: this branch previously trained nltk.DecisionTreeClassifier
        # (copy-paste error); 'Maxent' must train the maximum-entropy
        # classifier with the requested algorithm.
        return nltk.MaxentClassifier.train(train, mealgorithm)
    elif classifier == 'Sklearn':
        if sklalgorithm == 'bernoulli':
            return nltk.SklearnClassifier(BernoulliNB()).train(train)
        elif sklalgorithm == 'svc':
            return nltk.SklearnClassifier(SVC(), sparse=False).train(train)
    elif classifier == 'SVM':
        pass  # not implemented; deliberately falls through (returns None)
def score2(classifier):
    """Fit *classifier* (through nltk's scikit-learn wrapper) on the global
    ``train`` set and return its accuracy on the ``dev``/``tag_dev`` pair."""
    wrapped = nltk.SklearnClassifier(classifier)  # scikit-learn interface in nltk
    wrapped.train(train)                          # fit on the training data
    predictions = wrapped.classify_many(dev)      # predict labels for the dev set
    # Compare predictions against the gold labels to get the accuracy.
    return accuracy_score(tag_dev, predictions)
def score(classifier):
    """Train *classifier* via nltk's scikit-learn adapter on the global
    ``train`` set and return (accuracy, precision, recall, f1) measured on
    ``dev`` against ``tag_dev``, with "pos" as the positive class."""
    model = nltk.SklearnClassifier(classifier)
    model.train(train)
    predicted = model.classify_many(dev)
    # Binary-average metrics, all keyed on the "pos" label.
    binary_kwargs = dict(average='binary', pos_label="pos")
    return (
        accuracy_score(tag_dev, predicted),
        precision_score(tag_dev, predicted, **binary_kwargs),
        recall_score(tag_dev, predicted, **binary_kwargs),
        f1_score(tag_dev, predicted, **binary_kwargs),
    )
def score(classifier):
    """Train *classifier*, persist the trained model, and return its accuracy
    on the global ``data``/``tag`` evaluation pair.

    The trained nltk-wrapped classifier is saved to
    'LogisticRegression.model' via joblib before evaluation.
    """
    model = nltk.SklearnClassifier(classifier)
    model.train(train)
    joblib.dump(model, 'LogisticRegression.model')
    pred = model.classify_many(data)
    # Guard against an empty evaluation set (the original raised
    # ZeroDivisionError here).
    if not pred:
        return 0.0
    # Idiom fix: count matching prediction/gold pairs with zip/sum instead of
    # indexing via range(len(...)).
    return sum(p == t for p, t in zip(pred, tag)) / len(pred)
def saveBestClassifier(classifier, fileName, featureSet, testRatio, sklean=False):
    """Train *classifier* 100 times on fresh shuffles of *featureSet*, keep the
    run with the highest test accuracy, and pickle that model to
    ./classifiers/<fileName>.pickle.

    :param classifier: an nltk classifier class (or a scikit-learn estimator
        when ``sklean`` is True).
    :param fileName: base name for progress messages and the pickle file.
    :param featureSet: list of (features, label) pairs; shuffled in place.
    :param testRatio: fraction of the data held out for testing each run.
    :param sklean: wrap *classifier* in nltk.SklearnClassifier when True.
    """
    rows = len(featureSet)
    # BUG FIX: the train/test split was hard-coded at 1900 items and both
    # ``rows`` and ``testRatio`` were computed/accepted but never used.
    splitIndex = int(rows * (1 - testRatio))
    print("Training %s 100 times..." % (fileName))
    if sklean:
        classifierBest = nltk.SklearnClassifier(classifier)
    else:
        classifierBest = classifier
    maxAcc = 0
    for i in range(100):
        random.shuffle(featureSet)
        # Splitting into train and test set
        trainingSet = featureSet[:splitIndex]
        testingSet = featureSet[splitIndex:]
        if sklean:
            classifierCurr = nltk.SklearnClassifier(classifier).train(
                trainingSet)
        else:
            classifierCurr = classifier.train(trainingSet)
        currAcc = nltk.classify.accuracy(classifierCurr, testingSet)
        if currAcc > maxAcc:
            classifierBest = classifierCurr
            maxAcc = currAcc
            print("New best accuracy for %s:" % (fileName), maxAcc)
        print("Done: %.2f" % (i / 100))
    print("Saving %s with max accuracy:" % (fileName), maxAcc)
    # Context manager guarantees the handle closes; also avoids shadowing the
    # builtin ``file`` name.
    with open("./classifiers/" + fileName + ".pickle", "wb") as outFile:
        pickle.dump(classifierBest, outFile)
def classify(self, training_set, classifierDumpFile):
    """
    :param training_set: Features to train the classifier
    :param classifierDumpFile: File name to dump -save- the trained data.
    :return: classifier
    """
    # Train the module of the specified algorithm using the created training set.
    self.classifier = nltk.SklearnClassifier(LinearSVC())
    self.classifier.train(training_set)
    #self.classifier = nltk.NaiveBayesClassifier.train(training_set)
    # BUG FIX: save to the file name passed as a parameter — the original
    # ignored ``classifierDumpFile`` and used self.classifierDumpFile instead.
    self.dump_files(self.classifier, classifierDumpFile)
    return self.classifier
def score(classifier):
    """Fit *classifier* on the global ``train`` set through nltk's
    scikit-learn adapter and return its accuracy on ``data``/``tag``."""
    sk_classifier = nltk.SklearnClassifier(classifier)  # scikit-learn interface in nltk
    sk_classifier.train(train)                          # fit on the training data
    predictions = sk_classifier.classify_many(data)     # predict labels for the eval data
    return accuracy_score(tag, predictions)
# naive_bayes = NaiveBayesClassifier.train(train_set) # print("Accuracy - Naive Bayes Classifier: ") # print(nltk.classify.accuracy(naive_bayes, test_set)) # print("Most informative features - Naive Bayes Classifier:") # print(naive_bayes.show_most_informative_features()) # maxent = MaxentClassifier.train(train_set, 'GIS', trace=0, # encoding=None, gaussian_prior_sigma=0, max_iter=100) # print("Accuracy - Max Entropy Classifier: ") # print(nltk.classify.accuracy(maxent, test_set)) # print("Most informative features - Max Entropy Classifier:") # print(maxent.show_most_informative_features()) linear_svm_classifier = nltk.SklearnClassifier(LinearSVC(C=2.0, dual=True, fit_intercept=True, intercept_scaling=0.1, loss='squared_hinge', max_iter=1500, penalty='l2', random_state=0, tol=0.0001), sparse=False) linear_svm_classifier.train(train_set) print("Accuracy - Linear SVM Classifier: ") print(nltk.classify.accuracy(linear_svm_classifier, test_set)) nonlinear_svm = SklearnClassifier(SVC(gamma='scale', kernel='poly', coef0 = 5.0, degree = 5, C = 5.0, shrinking=True, probability=False, tol=1e-3), sparse=False).train(train_set) print("Accuracy - Nonlinear SVM: ") print(nltk.classify.accuracy(nonlinear_svm, test_set)) random_forest = SklearnClassifier(RandomForestClassifier(n_estimators = 100, criterion = 'gini', max_depth = 5, min_samples_split = 2,
# Build the feature vocabulary from the global word frequency distribution.
all_words = nltk.FreqDist(all_words)
# print(all_words.most_common(15))
# Vocabulary: the first 3000 distinct words in FreqDist key order.
word_features = list(all_words.keys())[:3000]


def find_features(document):
    """Return a {word: bool} feature dict marking which of the 3000
    vocabulary words appear in *document*."""
    # PERF FIX: testing membership against a token *list* made each of the
    # 3000 lookups O(n); a set gives O(1) lookups with identical results.
    words = set(nltk.word_tokenize(document))
    return {w: (w in words) for w in word_features}


# Featurize every (category, review) pair and shuffle before splitting.
featuresets = [(find_features(rev), category) for (category, rev) in documents]
random.shuffle(featuresets)
training_set = featuresets[:4500]
testing_set = featuresets[4500:]

# classifier_f = open("Wilfred_naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()
#
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
# classifier.show_most_informative_features(15)

# Train a random forest through nltk's scikit-learn wrapper and time it.
start_time = time.time()
classifier = nltk.SklearnClassifier(RandomForestClassifier(n_estimators=4000, criterion='entropy'))
classifier.train(training_set)
print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testing_set)) * 100)
print("%s seconds" % (time.time() - start_time))
def gender_features(word):
    """Feature extractor: classify a name by its final letter."""
    return {'last_letter': word[-1]}


# Label every name from the nltk names corpus by gender.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# First 500 examples held out for testing; the rest train.
train_set, test_set = featuresets[500:], featuresets[:500]

classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, test_set)) * 100)
classifier.show_most_informative_features(15)

# Compare against scikit-learn estimators wrapped for nltk.
MNB_classifier = nltk.SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MNB_classifier accuracy percent:",
      (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

# print(classifier.classify(gender_features('Neo')))
# print(MNB_classifier.classify(gender_features('Neo')))

BernoulliNB_classifier = nltk.SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(train_set)
print("BernoulliNB_classifier accuracy percent:",
      (nltk.classify.accuracy(BernoulliNB_classifier, test_set)) * 100)

LogisticRegression_classifier = nltk.SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
# NOTE(review): this print statement is truncated at the end of the chunk;
# its remaining arguments are outside this view.
print("LogisticRegression_classifier accuracy percent:",
def svm(training_set, dev_test_set):
    """Train a LinearSVC (``ls``) through nltk's scikit-learn wrapper on
    *training_set* and return a (classifier, accuracy) pair, where accuracy
    is measured on *dev_test_set*."""
    model = nltk.SklearnClassifier(ls()).train(training_set)
    dev_accuracy = nltk.classify.accuracy(model, dev_test_set)
    return (model, dev_accuracy)
# NOTE(review): this chunk opens mid-signature — the ``def`` line and the
# ``text`` parameter of the bag-of-words function are outside this view.
        n=1, extended=0,
        ):
    # make bag of words: map each n-gram of ``text`` to a feature value
    bow = dict()
    for word in nltk.ngrams(text, n):
        if extended == 0:
            # binary features: record presence only
            if word not in bow:
                bow[word] = True
        else:
            # counted features: accumulate n-gram frequencies
            bow[word] = bow.get(word, 0) + 1
    return bow


# (name, classifier) pairs to evaluate; commented entries are alternatives.
models = [
    ("SVM", nltk.SklearnClassifier(sklearn.svm.SVC(probability=True))),
    ("NaiveBayes", nltk.classify.NaiveBayesClassifier),
    # ("RandomForest", nltk.SklearnClassifier(ensemble.RandomForestClassifier())),
    # ("MaxEntropy", nltk.classify.MaxentClassifier),
    # ("MLP", nltk.SklearnClassifier(neural_network.MLPClassifier()))
]
onlyBayes = [("NaiveBayes", nltk.classify.NaiveBayesClassifier)]


def test_classifiers(classifiers, ngrams, cutoffs):
    # NOTE(review): this function is truncated at the end of the chunk; the
    # loop body continues outside this view.
    test_counter = 0
    scores = dict()
    for i in range(2):
        for ndx, (name, cl) in enumerate(classifiers):
            for gram in ngrams:
                test_results = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}
# main
# Load the labelled training questions and split the (question, label)
# pairs into two parallel lists.
data_list = getTrainingData()
question_list = [question for (question, label) in data_list]
label_list = [label for (question, label) in data_list]
print("\n question_list = ", question_list)
print("\n label_list = ", label_list)

# Fit a bag-of-words vectorizer over the questions (used here only to
# inspect the vocabulary and obtain the sample count).
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(question_list)
print("\n X.toarray()=", X.toarray())
print("\n X.shape[0]=", X.shape[0])

test_v = vectorizer.transform(['it is a test of sales order'])
print("\n test_v.toarray()=", test_v.toarray())

# Pair each question's feature dict with its label for nltk training.
training_set = [
    (getContentFeature(question_list[i]), label_list[i])
    for i in range(X.shape[0])
]
print("\n training_set=", training_set)

myClassifier = nltk.SklearnClassifier(SVC()).train(training_set)

# Test: interactively classify user-typed queries, forever.
while True:
    query = str(input("Input query:")).strip().lower()
    feature = getContentFeature(query)
    # print feature
    print("\n result = ", myClassifier.classify(feature))