from util import get_split_training_dataset
from metrics import suite
from sklearn.naive_bayes import GaussianNB

def train(Xtrain, Ytrain):
    """Use the entirety of the provided X, Y to fit a Naive Bayes classifier.

    Arguments
    Xtrain -- training data
    Ytrain -- training labels

    Returns
    classifier -- a GaussianNB model fitted to Xtrain and Ytrain
    """
    classifier = GaussianNB()
    classifier.fit(Xtrain, Ytrain)
    return classifier

if __name__ == "__main__":
    # Train a Naive Bayes classifier on a subset of the training data,
    # holding out the rest for validation.
    Xt, Xv, Yt, Yv = get_split_training_dataset()
    Classifier = train(Xt, Yt)
    print "Naive Bayes Classifier"
    suite(Yv, Classifier.predict(Xv))
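# util.get_split_training_dataset is imported throughout this repo but its
# source is not shown here. A minimal sketch of what it might look like,
# assuming an old-sklearn train/test split; load_training_data() is a
# hypothetical loader and the 70/30 ratio is a guess -- only the four-tuple
# return shape is implied by the callers:
from sklearn.cross_validation import train_test_split

def get_split_training_dataset(test_size=0.3, seed=0):
    # load_training_data() is a hypothetical helper returning (X, Y)
    X, Y = load_training_data()
    # Hold out a validation set so suite() measures performance on data
    # the classifier never saw during fitting
    Xt, Xv, Yt, Yv = train_test_split(X, Y, test_size=test_size,
                                      random_state=seed)
    return Xt, Xv, Yt, Yv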
from util import get_split_training_dataset
from metrics import suite
from sklearn.ensemble import ExtraTreesClassifier

def train(Xtrain, Ytrain, n=250, d=None):
    """Use the entirety of the provided X, Y to fit an extremely
    randomized trees ensemble.

    Arguments
    Xtrain -- training data
    Ytrain -- training labels
    n -- number of trees in the ensemble
    d -- maximum tree depth (None for unlimited)

    Returns
    classifier -- an ExtraTreesClassifier fitted to Xtrain and Ytrain
    """
    classifier = ExtraTreesClassifier(n_estimators=n, max_depth=d,
                                      min_samples_split=1, random_state=0,
                                      max_features=36)
    classifier.fit(Xtrain, Ytrain)
    return classifier

if __name__ == "__main__":
    # Train an extra-trees ensemble on a subset of the training data.
    Xt, Xv, Yt, Yv = get_split_training_dataset()
    Classifier = train(Xt, Yt)
    print "Extra Random Trees Ensemble Classifier"
    suite(Yv, Classifier.predict(Xv))
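# metrics.suite is the shared evaluation entry point for every script in
# this repo. A minimal sketch of what it might do, assuming it just prints
# a few standard validation scores; the exact metrics the real module
# reports are an assumption:
from sklearn.metrics import accuracy_score, classification_report

def suite(Ytrue, Ypred):
    # Overall fraction of correct predictions
    print "accuracy: %.4f" % accuracy_score(Ytrue, Ypred)
    # Per-class precision, recall, and F1
    print classification_report(Ytrue, Ypred)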
from util import get_split_training_dataset
from metrics import suite
import fclassify
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.grid_search import GridSearchCV

def train(Xtrain, Ytrain):
    """Use the entirety of the provided X, Y to fit an AdaBoost classifier.

    Arguments
    Xtrain -- training data
    Ytrain -- training labels

    Returns
    classifier -- a boosted ensemble of decision stumps fitted to
                  Xtrain and Ytrain
    """
    # Initialize classifier parameters for AdaBoost;
    # for now that just means the number of estimators
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1))
    parameters = {'n_estimators': [150]}
    # Classify over the (single-point) grid of parameters
    classifier = GridSearchCV(ada, parameters)
    classifier.fit(Xtrain, Ytrain)
    return classifier

if __name__ == "__main__":
    # Train an AdaBoost classifier on a subset of the training data.
    Xt, Xv, Yt, Yv = get_split_training_dataset()
    Classifier = train(Xt, Yt)
    print "Adaboost Classifier"
    suite(Yv, Classifier.predict(Xv))

    # Repeat on a smaller, more important feature set
    Xtimp, features = fclassify.get_important_data_features(Xt, Yt,
                                                            max_features=25)
    Xvimp = fclassify.compress_data_to_important_features(Xv, features)
    ClassifierImp = train(Xtimp, Yt)
    print "Adaboost Classifier, 25 important features"
    suite(Yv, ClassifierImp.predict(Xvimp))
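# The two fclassify helpers used above (and again in the random forest
# script below) are not shown in this listing. A rough sketch of how they
# might work: the tree-based importance ranking is an assumption, and the
# max_features=25 default is inferred from the "~25 important features"
# print statement; only the signatures and return shapes follow from the
# call sites.
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

def get_important_data_features(X, Y, max_features=25):
    # Rank features with a quick tree ensemble and keep the top max_features
    ranker = ExtraTreesClassifier(n_estimators=100, random_state=0)
    ranker.fit(X, Y)
    features = np.argsort(ranker.feature_importances_)[::-1][:max_features]
    return X[:, features], features

def compress_data_to_important_features(X, features):
    # Project new data onto the same feature subset chosen on training data
    return X[:, features]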
import unittest

import db
import metrics
import analysis

def suite():
    # Aggregate the per-module test suites into one runnable suite
    return unittest.TestSuite([db.suite(),
                               metrics.suite(),
                               analysis.suite()])
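# A typical way to run the aggregated suite from the command line
# (usage sketch; the runner invocation is not part of the original file):
if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(suite())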
from util import get_split_training_dataset
from metrics import suite
import fclassify
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

def train(Xtrain, Ytrain, n=250, grid=False):  # default values assumed
    """Use the entirety of the provided X, Y to fit a random forest.

    Arguments
    Xtrain -- training data
    Ytrain -- training labels
    n -- number of trees when grid search is skipped
    grid -- if True, grid-search n_estimators instead of using n

    Returns
    classifier -- A random forest of n estimators, fitted to Xtrain
                  and Ytrain
    """
    if grid:
        forest = RandomForestClassifier(max_depth=None, random_state=0,
                                        min_samples_split=1, max_features=38)
        parameters = {'n_estimators': [200, 250, 300]}
        # Classify over the grid of parameters
        classifier = GridSearchCV(forest, parameters)
    else:
        classifier = RandomForestClassifier(n_estimators=n)
    classifier.fit(Xtrain, Ytrain)
    return classifier

if __name__ == "__main__":
    # Train a random forest on a subset of the training data.
    Xt, Xv, Yt, Yv = get_split_training_dataset()
    print "Random Forest Classifier"
    Classifier = train(Xt, Yt)
    suite(Yv, Classifier.predict(Xv))

    # Repeat on a smaller, more important feature set
    Xtimp, features = fclassify.get_important_data_features(Xt, Yt)
    Xvimp = fclassify.compress_data_to_important_features(Xv, features)
    ClassifierImp = train(Xtimp, Yt)
    print "Forest Classifier, ~25 important features"
    suite(Yv, ClassifierImp.predict(Xvimp))
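# When the grid-search path is taken, the winning settings can be inspected
# after fitting via GridSearchCV's standard best_params_ attribute. A usage
# sketch, assuming Xt, Yt, Xv, Yv come from the split above:
clf = train(Xt, Yt, grid=True)
print "best parameters:", clf.best_params_
suite(Yv, clf.predict(Xv))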