def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)': sentence[i][-1:],
                    'suffix(2)': sentence[i][-2:],
                    'suffix(3)': sentence[i][-3:]}
        features['prev-word'] = '<start>' if i == 0 else sentence[i - 1]
        return features

    print(pos_features(brown.sents()[0], 8))

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []
    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    # classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print('DecisionTree accuracy: %f' % classify.accuracy(classifier, test_set))
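# A minimal illustration (not part of the original) of the feature dict the
# suffix/prev-word extractor above produces, shown with the extractor lifted
# to module scope and applied to a toy sentence.
def pos_features(sentence, i):
    features = {'suffix(1)': sentence[i][-1:],
                'suffix(2)': sentence[i][-2:],
                'suffix(3)': sentence[i][-3:]}
    features['prev-word'] = '<start>' if i == 0 else sentence[i - 1]
    return features

print(pos_features(['The', 'cat', 'sat'], 1))
# {'suffix(1)': 't', 'suffix(2)': 'at', 'suffix(3)': 'cat', 'prev-word': 'The'}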
def train(self) -> ClassifierI:
    """Train the classifier.

    This takes a very long time, so after training, pickle it and save
    it to disk.
    """
    training_set = apply_features(
        self.extract_features_from_tweet, self.training_set)
    testing_set = apply_features(
        self.extract_features_from_tweet, self.testing_set)
    self.testing_set = testing_set

    if self.classifier_name == self.NAIVE_BAYES:
        classifier = NaiveBayesClassifier.train(training_set)
    elif self.classifier_name == self.DECISION_TREE:
        classifier = DecisionTreeClassifier.train(training_set)
    else:
        raise ValueError("Couldn't create classifier")

    # Pickle the trained classifier to disk.
    with open('twitteranalyser/data/%s.p' % self.classifier_name, 'wb') as f:
        dump(classifier, f, HIGHEST_PROTOCOL)
    return classifier
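# Hedged companion sketch (not in the original): loading the pickled
# classifier back from disk, assuming the same path convention that
# train() writes to.
from pickle import load

def load_classifier(classifier_name):
    with open('twitteranalyser/data/%s.p' % classifier_name, 'rb') as f:
        return load(f)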
def test_classification(self):
    dataset_name = 'breast_cancer'
    mode = 'classification'
    X, y = get_data(dataset_name)
    est_dict = {
        'logreg': LogisticRegression(),
        'dtree': DecisionTreeClassifier(max_depth=5)
    }
    features_scores = get_scores_df(est_dict, X, y, mode)
    print(features_scores)
def cross_validation_train(self, dev_docs):
    '''
    Applies the k-fold cross-validation technique to split the docs into
    different pairs of training and testing sets. For each pair, it
    trains and evaluates a classifier, keeping the one with the best
    accuracy.

    Parameters
    ----------
    dev_docs: iterable
        An iterable which yields a list of strings
    '''
    dev_docs = shuffled(dev_docs)
    accuracies = []
    best_accuracy = 0
    subset_size = int(len(dev_docs) / self.n_folds)

    for i in range(self.n_folds):
        train_docs = (dev_docs[(i + 1) * subset_size:] +
                      dev_docs[:i * subset_size])
        test_docs = dev_docs[i * subset_size:(i + 1) * subset_size]
        train_set = apply_features(self.get_doc_features, train_docs)

        if self.t_classifier == "NB":
            classifier = NaiveBayesClassifier.train(train_set)
        elif self.t_classifier == "DT":
            classifier = DecisionTreeClassifier.train(train_set)
        elif self.t_classifier == "RF":
            classifier = SklearnClassifier(RandomForestClassifier())\
                .train(train_set)
        elif self.t_classifier == "SVM":
            classifier = SklearnClassifier(LinearSVC(), sparse=False)\
                .train(train_set)

        test_set = apply_features(self.get_doc_features, test_docs, True)
        accuracies.append(accuracy(classifier, test_set) * 100)

        if accuracies[-1] > best_accuracy:
            best_accuracy = accuracies[-1]
            self._classifier = classifier
            self._train_docs = train_docs
            self._test_docs = test_docs
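# Hedged illustration (not from the original) of the fold arithmetic above:
# with 9 docs and n_folds = 3, subset_size = 3, so fold i tests on
# docs[3*i : 3*(i+1)] and trains on everything else.
docs = list(range(9))
n_folds, subset_size = 3, 3
for i in range(n_folds):
    test = docs[i * subset_size:(i + 1) * subset_size]
    train = docs[(i + 1) * subset_size:] + docs[:i * subset_size]
    print(i, test, train)
# 0 [0, 1, 2] [3, 4, 5, 6, 7, 8]
# 1 [3, 4, 5] [6, 7, 8, 0, 1, 2]
# 2 [6, 7, 8] [0, 1, 2, 3, 4, 5]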
def all_class_train(self, dev_docs):
    '''
    Train the classifier with a train_p fraction of every class. The
    remaining docs of each class are used for testing.

    Parameters
    ----------
    dev_docs: iterable
        An iterable which yields a list of strings
    '''
    categories_count = self.count_categories(dev_docs)
    labeled_docs = {}
    for (cat, count) in categories_count.items():
        labeled_docs[cat] = shuffled(
            [t for (t, k) in dev_docs if k == cat])

    train_docs = []
    test_docs = []
    for cat, l in labeled_docs.items():
        cat_limit = int(self.train_p * len(l))
        train_docs += [(t, cat) for t in l[:cat_limit]]
        test_docs += [(t, cat) for t in l[cat_limit:]]

    self._train_docs = train_docs
    self._test_docs = test_docs

    train_set = apply_features(self.get_doc_features, self._train_docs)

    # create and train the classification model according to t_classifier
    if self.t_classifier == "NB":
        self._classifier = NaiveBayesClassifier.train(train_set)
    elif self.t_classifier == "DT":
        self._classifier = DecisionTreeClassifier.train(train_set)
    elif self.t_classifier == "RF":
        self._classifier = SklearnClassifier(RandomForestClassifier())\
            .train(train_set)
    elif self.t_classifier == "SVM":
        self._classifier = SklearnClassifier(LinearSVC(), sparse=False)\
            .train(train_set)
def equitative_class_train(self, dev_docs):
    '''
    Split every class with the same train_p proportion into training and
    testing docs, then train the classifier on the combined training part.

    Parameters
    ----------
    dev_docs: iterable
        An iterable which yields a list of strings
    '''
    categories_count = self.count_categories(dev_docs)
    labeled_docs = {}
    for (cat, count) in categories_count.items():
        labeled_docs[cat] = shuffled(
            [t for (t, k) in dev_docs if k == cat])

    train_docs = []
    test_docs = []
    for cat, l in labeled_docs.items():
        cat_limit = int(self.train_p * len(l))
        train_docs += [(t, cat) for t in l[:cat_limit]]
        test_docs += [(t, cat) for t in l[cat_limit:]]

    self._train_docs = train_docs
    self._test_docs = test_docs

    # print("len dev docs", len(dev_docs))
    # print("categories count", categories_count)
    # print("count train", self.count_categories(train_docs))
    # print("count test", self.count_categories(test_docs))

    # split dev docs and create training and test set
    # self.split_train_and_test(dev_docs)

    train_set = apply_features(self.get_doc_features, self._train_docs)

    # create and train the classification model according to t_classifier
    if self.t_classifier == "NB":
        self._classifier = NaiveBayesClassifier.train(train_set)
    elif self.t_classifier == "DT":
        self._classifier = DecisionTreeClassifier.train(train_set)
    elif self.t_classifier == "RF":
        self._classifier = SklearnClassifier(RandomForestClassifier())\
            .train(train_set)
    elif self.t_classifier == "SVM":
        self._classifier = SklearnClassifier(LinearSVC(), sparse=False)\
            .train(train_set)
'''
from nltk.corpus import brown
from nltk import DecisionTreeClassifier
from nltk.classify import accuracy


def pos_features(sentence, i):
    features = {
        "suffix(1)": sentence[i][-1:],
        "suffix(2)": sentence[i][-2:],
        "suffix(3)": sentence[i][-3:]
    }
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i - 1]
    return features


tagged_words = brown.tagged_words(categories='news')
# print(tagged_words)
# pos_features expects (sentence, i); treat each word as a one-word
# sentence so the call matches the signature.
featuresets = [(pos_features([n], 0), g) for (n, g) in tagged_words]

train_set, test_set = featuresets[:1000], featuresets[1000:2000]
classifier = DecisionTreeClassifier.train(train_set)
ac = accuracy(classifier, test_set)
print(ac)
print(classifier.classify(pos_features(['cats'], 0)))
print(classifier.pseudocode(depth=4))
# Enclosing signature assumed from the body (it reads features from a
# single name string); the fragment originally started mid-function.
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features


def gender_features3(word):
    return {'suffix1': word[-1:], 'suffix2': word[-2:]}


if __name__ == '__main__':
    print("Lab 3 - Exercise 2")
    data = get_data()
    train_set = apply_features(gender_features3, data[500:])
    test_set = apply_features(gender_features3, data[:500])

    print("Training classifiers")
    # Train the different classifiers on the training set
    classifiers = [(NaiveBayesClassifier.train(train_set), "NaiveBayes"),
                   (DecisionTreeClassifier.train(train_set), "DecisionTree"),
                   (MaxentClassifier.train(train_set, max_iter=10, trace=0),
                    "MaxEntropy")]

    # Test all classifiers on the test set
    for classifier, name in classifiers:
        acc = accuracy(classifier, test_set)
        print("{} classifier test accuracy: {}".format(name, acc))
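# Quick usage illustration (not in the original), assuming the
# gender_features3 definition above is in scope:
print(gender_features3('Shrek'))  # {'suffix1': 'k', 'suffix2': 'ek'}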
plt.ylabel("grade") plt.show() ################################################################################ ### your code here! name your classifier object clf if you want the ### visualization code (prettyPicture) to show you the decision boundary from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier from sklearn.tree import DecisionTreeClassifier #KNN # clf = KNeighborsClassifier(n_neighbors=1) # clf.fit(features_train, labels_train) #Random Forest # clf = RandomForestClassifier(n_estimators=100) # clf = clf.fit(features_train, labels_train) #AdaBoost clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200) clf = clf.fit(features_train, labels_train) predict = clf.predict(features_test) print calculate_accuracy(labels_test, predict) try: prettyPicture(clf, features_test, labels_test) except NameError: pass
featuresets = [(gender_feature(name), predictor)
               for (name, predictor) in gender]
trainingSet, testSet = featuresets[500:], featuresets[0:500]
# print(featuresets)
# print(trainingSet)
# print(testSet)

name = input('Enter a Name:')
classifiers = int(input(
    'Enter a Classifier:\n'
    '1: Naive Bayes Classifier\n'
    '2: Decision Tree\n'
    '3: Support Vector Machine (SVM)\n'))
accuracies = []

trainNaiveBaseClassifier = lambda data: NaiveBayesClassifier.train(data)
decisionTreeBaseClassifier = lambda decision: DecisionTreeClassifier.train(
    decision)
# Wrap LinearSVC and train it on the given data.
supportVectorMachine = lambda svm: classify.SklearnClassifier(
    LinearSVC()).train(svm)

if classifiers == 1:
    # naiveClassifier = NaiveBayesClassifier.train(trainingSet)
    naiveBayesClassifier = trainNaiveBaseClassifier(trainingSet)
    n = naiveBayesClassifier.classify(gender_feature(name))
    print('*' * 80)
    print('Naive Bayes Classifier')
    print('*' * 80)
    naiveBayesClassifier.show_most_informative_features()
    # test the accuracy of the classifier using Naive Bayes

# Observe that these character names from The Matrix are correctly
# classified. Although this science fiction movie
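# Hedged sketch (not in the original): a Decision Tree branch mirroring the
# Naive Bayes branch above, using the snippet's own decisionTreeBaseClassifier
# helper.
if classifiers == 2:
    decisionTreeClassifier = decisionTreeBaseClassifier(trainingSet)
    print('*' * 80)
    print('Decision Tree Classifier')
    print('*' * 80)
    print(decisionTreeClassifier.classify(gender_feature(name)))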
# DecisionTreeClassifier(max_depth=None) below is scikit-learn's estimator,
# so import it from sklearn.tree rather than nltk.
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

from imputers import FillNaTransformer, HotDeckFullImputer, ModelBasedFullImputer
from transformers import BoxCoxTransformer, PolynomialsAdder, CustomBinner, OutliersClipper, FeatureProduct

baseline = {
    'DecisionTreeClassifier': {
        'params': {
            'predictor': DecisionTreeClassifier(max_depth=None),
            'scaler': None,
            'simple_imputer': FillNaTransformer(
                from_dict={},
                mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                median=[],
                nan_flag=[],
                zero=[])
        },
        'score': 0.1974390243902439,
        'std': 0.0756691348271984
    },
    'KNeighborsClassifier': {
        'params': {
            'predictor':
pos_features('test')

# feature extractor; we can use it to train a new decision tree classifier:
tagged_words = brown.tagged_words(categories='news')
print(tagged_words)
featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
featuresets[0]

from nltk import DecisionTreeClassifier
from nltk.classify import accuracy

cutoff = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[cutoff:], featuresets[:cutoff]
classifier = DecisionTreeClassifier.train(train_set)
# NLTK is a teaching toolkit and is not really optimized for speed, so this
# may take forever. For speed, use scikit-learn for the classifiers.
accuracy(classifier, test_set)
classifier.classify(pos_features('cats'))

'''
To accompany the video, here is the sample code for NLTK part of speech
tagging with lots of comments and info as well:

POS tag list:

CC   coordinating conjunction
CD   cardinal digit
DT   determiner
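# Hedged sketch (not part of the original): the scikit-learn route the
# comment above suggests. DictVectorizer turns NLTK-style feature dicts
# into a sparse matrix that sklearn's DecisionTreeClassifier can consume;
# sklearn_train is a hypothetical helper name.
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier as SkDecisionTree

def sklearn_train(featuresets):
    feats = [f for (f, label) in featuresets]
    labels = [label for (f, label) in featuresets]
    vec = DictVectorizer()
    X = vec.fit_transform(feats)
    clf = SkDecisionTree().fit(X, labels)
    return vec, clf

# usage: vec, clf = sklearn_train(train_set)
#        clf.predict(vec.transform([pos_features('cats')]))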
def dttrain(train_set):
    # nltk's DecisionTreeClassifier.train defaults: entropy_cutoff=0.05,
    # depth_cutoff=100, support_cutoff=10, binary=False,
    # feature_values=None, verbose=False
    classifier = DecisionTreeClassifier.train(train_set, depth_cutoff=10000)
    return classifier
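# Hedged usage sketch (not in the original): dttrain expects NLTK-style
# labeled featuresets, i.e. (feature-dict, label) pairs; the toy data is
# invented for illustration.
from nltk import DecisionTreeClassifier

toy_train_set = [
    ({'suffix(1)': 's'}, 'NNS'),
    ({'suffix(1)': 'd'}, 'VBD'),
]
toy_classifier = dttrain(toy_train_set)
print(toy_classifier.classify({'suffix(1)': 's'}))  # 'NNS'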