class ExtraTreesClassifierImpl():
    """Adapter that forwards fit/predict calls to an ``SKLModel`` instance.

    The constructor records every hyperparameter in ``self._hyperparams``
    and builds the wrapped estimator as ``SKLModel(**self._hyperparams)``.
    ``SKLModel`` is not defined in this block; presumably it is an alias of
    scikit-learn's ``ExtraTreesClassifier`` -- confirm against the imports.
    """

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=False, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False,
                 class_weight='balanced'):
        """Store the hyperparameters and instantiate the wrapped model."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is forwarded only when it is not ``None``; otherwise the
        wrapped model's ``fit`` is called with ``X`` alone.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's ``predict(X)`` result."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba(X)`` result."""
        wrapped = self._wrapped_model
        return wrapped.predict_proba(X)
def classify(X, y, cv):
    """Cross-validate and fit an ExtraTreesClassifier, printing binary stats.

    The classifier is scored with ``cross_val_score(clf, X, y, cv=cv)``,
    then fitted on all of ``X``/``y`` and used to predict ``X`` again, so
    the confusion counts below are training-set figures.

    Label ``1`` is counted as positive and label ``0`` as negative; a
    prediction with any other value is not counted in any bucket.

    Args:
        X: feature matrix handed to the classifier.
        y: iterable of true labels, one per row of ``X``.
        cv: cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The fitted classifier.
    """
    def ratio(numerator, denominator):
        # A rate with an empty denominator (e.g. precision when nothing was
        # predicted positive) is undefined; report NaN rather than letting
        # ZeroDivisionError abort the whole report.
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    # Alternatives that were tried:
    # clf = DecisionTreeClassifier()
    # clf = RandomForestClassifier()
    # clf = AdaBoostClassifier()
    clf = ExtraTreesClassifier()

    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s'
          % (cv, sum(score) / score.shape[0]))

    clf = clf.fit(X, y)
    # print 'Feature Importances'
    # print clf.feature_importances_
    # X = clf.transform(X,threshold=.3)

    # Predict once and reuse the result for both the counter and the counts.
    preds = clf.predict(X)
    print('predictions counter')
    print(Counter(preds))

    tp = tn = fp = fn = 0
    # zip pairs labels positionally, so a pandas Series with a non-default
    # index is handled the same way as a plain array.
    for actual, predicted in zip(y, preds):
        if actual == predicted:
            if predicted == 0:
                tn += 1
            elif predicted == 1:
                tp += 1
        elif predicted == 1:
            fp += 1
        elif predicted == 0:
            fn += 1

    print('correct positives: %s' % tp)
    print('correct negatives: %s' % tn)
    print('false positives: %s' % fp)
    print('false negatives: %s' % fn)
    print('precision: %s' % ratio(tp, tp + fp))
    print('recall (tp)/(tp+fn): %s' % ratio(tp, tp + fn))
    print('false positive rate (fp)/(fp+tn): %s' % ratio(fp, fp + tn))
    print('false positive rate2 (fp)/(fp+tp): %s' % ratio(fp, fp + tp))
    print('prediction accuracy: %s%s\n'
          % (ratio(100 * (tp + tn), tp + tn + fp + fn), '%'))
    return clf
# Feature matrix and numeric author labels taken from `dfeatures`.
x, y = dfeatures[features], dfeatures.author_num.values

# Hold out 40% of the rows; the confusion matrix below is computed on them.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=123)

# CLASSIFIER
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Mean cross-validated score over the full data set.
scores = cross_val_score(etclf, x, y)
print(scores.mean())

# Print Confusion Matrix
# confusion_matrix expects (y_true, y_pred); with that order rows are the
# true authors and columns the predicted ones.
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))

# print authors

# Output recorded from earlier runs, kept for reference.
"""
#
# PREVIOUS RESULT
0.671469386087

############# RESULT WITH ALL FEATURES ############

/Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
0.148101533384
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ...,
 [0 0 0 ..., 2 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
"""
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing

# NOTE(review): read_csv, random and train_test_split are used below but not
# imported in this snippet -- confirm they are imported earlier in the file.

# Load the authorship data set and encode each author name as an integer.
authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
# Every column except the author name and its numeric encoding is a feature.
features = list(authorship.columns)
features.remove("Author")
features.remove("Author_num")

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
# One value per row, sized from the frame itself rather than a fixed count.
# NOTE(review): "random" is added after `features` was built, so it is not
# one of the model inputs -- confirm whether that is intended.
authorship["random"] = [random.random() for i in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features],
    authorship.Author_num.values,
    test_size=0.4,
    random_state=123,
)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Print Confusion Matrix
# confusion_matrix expects (y_true, y_pred); with that order rows are the
# true authors and columns the predicted ones.
print(metrics.confusion_matrix(y_test, etclf.predict(x_test)))
import random
from pandas import read_csv
from sklearn.cross_validation import train_test_split
from sklearn.ensemble.forest import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing

# Load the authorship data set and encode each author name as an integer.
authorship = read_csv('http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv')
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship['Author_num'] = le.transform(authorship['Author'])

# What are some of the stop words we're looking at?
# Every column except the author name and its numeric encoding is a feature.
features = list(authorship.columns)
features.remove('Author')
features.remove('Author_num')

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
# One value per row, sized from the frame itself rather than a fixed count.
# NOTE(review): 'random' is added after `features` was built, so it is not
# one of the model inputs -- confirm whether that is intended.
authorship['random'] = [random.random() for i in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features],
    authorship.Author_num.values,
    test_size=0.4,
    random_state=123,
)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Predict the held-out rows once and reuse the result for both reports.
predictions = etclf.predict(x_test)

# Print Confusion Matrix
# Both metrics take (y_true, y_pred). Passing them the other way round
# transposes the matrix and swaps precision with recall in the report.
print(metrics.confusion_matrix(y_test, predictions))
print(metrics.classification_report(y_test, predictions))
# also tested this:
# svm.SVC(kernel='linear', C=1.0), GaussianNB()
# doesn't improve and takes long

# running crossvalidation score on all classifiers
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    print("%s \n Accuracy: %0.2f (+/- %0.2f)\n"
          % (clf, score.mean(), score.std() / 2))

# now let's go to OOS test
# Work on an explicit copy so that filling Fare never writes through a
# slice of `test` (pandas chained-assignment pitfall).
testX = test[['Sex01', 'Fare', 'SibSp', 'Parch', 'Pclass']].copy()
medianFare = testX.Fare.median()
testX['Fare'] = testX['Fare'].fillna(medianFare)

# print results to CSV files for Kaggle submission
# Each model is fitted on the full training data and written to its own file.
for model_cls, csv_name in ((ExtraTreesClassifier, 'ETClf.csv'),
                            (RandomForestClassifier, 'RFClf.csv'),
                            (DecisionTreeClassifier, 'DTClf.csv')):
    clf = model_cls()
    clf.fit(X, y)
    # Assign the prediction array directly: wrapping it in pd.Series would
    # align on index labels and misplace values if `test` does not have a
    # default 0..n-1 index.
    test['Survived'] = clf.predict(testX)
    test[['PassengerId', 'Survived']].to_csv(csv_name, index=False)
def main():
    """Train an ExtraTreesClassifier on the lemon data and predict the test set.

    Reads ``lemon_training.csv`` and ``lemon_test.csv`` from the working
    directory, drops every column that contains a missing value, fits a
    depth-limited ExtraTreesClassifier on a 70/30 split of the training
    file, prints confusion matrices, classification reports, accuracy and
    ROC AUC for both parts of the split, and finally predicts the test file.

    Returns:
        A DataFrame with columns ``RefId`` and ``prediction`` for the rows
        of ``lemon_test.csv``.
    """
    ### Import data sets
    l_train = pd.read_csv('lemon_training.csv')
    l_test = pd.read_csv('lemon_test.csv')

    ### Clean/prepare data sets
    l_train = l_train.dropna(axis=1)
    l_test = l_test.dropna(axis=1)

    # describe() lists the numeric columns; the id and the target are not
    # model inputs.
    features = list(l_train.describe().columns)
    features.remove('RefId')
    features.remove('IsBadBuy')
    # The two frames drop NaN columns independently, so keep only features
    # that also survived in the test frame; otherwise l_test[features]
    # raises KeyError.
    features = [col for col in features if col in l_test.columns]

    ### Create test and training sets
    train_features = l_train[features].values
    train_class = l_train.IsBadBuy.values
    OSS_features = l_test[features].values

    # Seed PRNG
    np.random.seed(1234)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_features, train_class, test_size=.3)

    ### Build model
    # model = naive_bayes.GaussianNB().fit(X_train, y_train)
    model = ExtraTreesClassifier(max_depth=8).fit(X_train, y_train)

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    ### Stats
    print('training:\n%s' % (metrics.confusion_matrix(y_train, y_pred_train),))
    print(metrics.classification_report(y_train, y_pred_train))

    print('test:\n%s' % (metrics.confusion_matrix(y_test, y_pred_test),))
    print(metrics.classification_report(y_test, y_pred_test))

    # ROC curves need a continuous score. Hard 0/1 predictions collapse the
    # curve to a single operating point, so use the predicted probability
    # of the positive class (label 1) instead.
    positive_col = list(model.classes_).index(1)
    y_score_train = model.predict_proba(X_train)[:, positive_col]
    y_score_test = model.predict_proba(X_test)[:, positive_col]

    fpr_train, tpr_train, thresholds_train = metrics.roc_curve(
        y_train, y_score_train, pos_label=1)
    fpr_test, tpr_test, thresholds_test = metrics.roc_curve(
        y_test, y_score_test, pos_label=1)

    print('train MA:  %s' % model.score(X_train, y_train))
    print('test MA:  %s' % model.score(X_test, y_test))
    print('train AUC:  %s' % metrics.auc(fpr_train, tpr_train))
    print('test AUC:  %s' % metrics.auc(fpr_test, tpr_test))

    # Cross Validation (disabled)
    # AUCs = []
    # for i in xrange(10):
    #     X_train, X_test, y_train, y_test = \
    #         cross_validation.train_test_split(train_features, train_class, test_size=.3)
    #     y_pred_test = model.fit(X_train, y_train).predict(X_test)
    #     fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, y_pred_test, pos_label=1)
    #     AUCs.append(metrics.auc(fpr_test, tpr_test))
    # print 'AUC cross val: ', AUCs

    ### Do output predictions for OSS data
    y_pred_OSS = model.predict(OSS_features)
    submission = pd.DataFrame({
        'RefId': l_test.RefId,
        'prediction': y_pred_OSS,
    })
    # Hand the frame back so the predictions are not discarded.
    return submission
# Apply Some Featuring
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy object
# Fit the transformer on the training data only, then apply that same
# fitted transformer to the test data.
x_train = poly_reg.fit_transform(X_train)
X_test = poly_reg.transform(X_test)
# Take the first column of each target frame by position. `.ix` has been
# removed from pandas; `.iloc` is the positional indexer.
y_test = np.array(y_test.iloc[:, 0])
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
model = ExtraTreesClassifier(
    bootstrap=False,
    class_weight=None,
    criterion='entropy',
    max_depth=None,
    max_features=0.6,
    max_leaf_nodes=None,
    min_impurity_split=1e-07,
    min_samples_leaf=1,
    min_samples_split=4,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_jobs=1,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False,
)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring
# `regression` is defined outside this snippet and selects the metric.
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))