def main():
    """Train an extra-trees forest from saved NumPy arrays and pickle it.

    Command-line arguments:
        sys.argv[1]: path of the training feature array (loaded with
            ``numpy.load``). The class-label file path is derived from it by
            replacing every occurrence of 'features' with 'classes'.
        sys.argv[2]: path the fitted forest is pickled to.
    """
    # catch parameters
    training_set_features = sys.argv[1]
    training_set_classes = training_set_features.replace('features', 'classes')
    forest_file = sys.argv[2]

    # loading training features
    # numpy.load needs a binary handle: text mode corrupts the data on
    # Windows and fails outright on Python 3.
    with open(training_set_features, 'rb') as f:
        training_feature_vector = numpy.load(f)
    # a 1-D array is treated as a single feature column
    if 1 == training_feature_vector.ndim:
        training_feature_vector = numpy.expand_dims(training_feature_vector, -1)

    with open(training_set_classes, 'rb') as f:
        training_class_vector = numpy.load(f)

    # prepare and train the decision forest
    # NOTE(review): `n_jobs` is not defined in this function; presumably a
    # module-level setting - confirm.
    # NOTE(review): `compute_importances` only exists in old scikit-learn
    # releases - confirm the targeted version before upgrading.
    forest = ExtraTreesClassifier(n_estimators=200,
                                  criterion='entropy',
                                  max_features=None,
                                  min_samples_split=2,
                                  min_samples_leaf=1,
                                  max_depth=500,
                                  bootstrap=True,
                                  oob_score=False,
                                  random_state=0,
                                  n_jobs=n_jobs,
                                  compute_importances=True)
    forest.fit(training_feature_vector, training_class_vector)

    # saving the decision forest
    with open(forest_file, 'wb') as f:
        pickle.dump(forest, f)
class ExtraTreesClassifierImpl():
    """Adapter that stores its hyperparameters and delegates to an ``SKLModel``."""

    def __init__(self, n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight='balanced'):
        """Record every hyperparameter and build the wrapped model from them."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
            class_weight=class_weight,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (passing ``y`` only when given) and return ``self``."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's ``predict_proba`` output for ``X``."""
        return self._wrapped_model.predict_proba(X)
def select_features(X,y,X_test,n_features=100):
    '''
    Keep the n_features columns an extra-trees fit on (X, y) ranks highest.

    The same column selection, most important first, is applied to both
    X and X_test; the two reduced arrays are returned as a pair.
    '''
    ranker = ExtraTreesClassifier(n_estimators=100, random_state=571)
    ranker.fit(X, y)
    # argsort is ascending, so reverse it to get the most important first
    ranked_columns = np.argsort(ranker.feature_importances_)[::-1]
    top_columns = ranked_columns[0:n_features]
    return X[:, top_columns], X_test[:, top_columns]
def run_decision_tree_probabilistic_classification(train, train_labels, validate, validate_labels):
    """Fit extra trees on TF-IDF features and print the validation log loss.

    ``train`` and ``validate`` are passed through a TfidfTransformer that is
    fitted on ``train`` only. The training labels are label-encoded before
    fitting; ``validate_labels`` is handed to ``metrics.log_loss`` as given.
    Nothing is returned.
    """
    # transform counts to dense TF-IDF features (fit on train only)
    tfidf_transformer = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train_dense = tfidf_transformer.fit_transform(train).toarray()
    validate_dense = tfidf_transformer.transform(validate).toarray()

    # encode labels
    encoded_train_labels = preprocessing.LabelEncoder().fit_transform(train_labels)

    forest_settings = dict(n_jobs=4,
                           n_estimators=1000,
                           max_features=20,
                           min_samples_split=3,
                           bootstrap=False,
                           verbose=3,
                           random_state=23)
    forest = ExtraTreesClassifier(**forest_settings)
    forest.fit(train_dense, encoded_train_labels)

    class_probabilities = forest.predict_proba(validate_dense)
    print("Extra Trees Classifier LogLoss")
    print(str(metrics.log_loss(validate_labels, class_probabilities)))
def build_sample(regressor, name):
    """Cross-validate ``regressor`` on the module-level sample data and export predictions.

    Uses names defined elsewhere in the module: ``sample_X``, ``sample_y``,
    ``sample_t``, ``sample_id``, ``features``, ``SEED``, ``mape`` and
    ``store_csv``.

    Args:
        regressor: estimator exposing ``fit`` and ``predict``.
        name: label used in the printed messages and as the CSV base name.

    Returns:
        The predictions of the last fitted ``regressor`` on
        ``sample_t[:, features]``.
    """
    # feature selection diagnostics: print the importances of a default
    # extra-trees fit. The original also built a SelectFromModel matrix and
    # read ``.columns`` on it, which raises AttributeError on the ndarray that
    # ``transform`` returns; that unused result has been dropped.
    clf = ExtraTreesClassifier()
    clf = clf.fit(sample_X, sample_y)
    print(clf.feature_importances_)

    # repeat the CV procedure 10 times to get more precise results
    n = 10
    # for each iteration, randomly hold out 10% of the data as CV set
    for i in range(n):
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            sample_X[:, features], sample_y,
            test_size=.10, random_state=i*SEED)

        # train, then score on the held-out part
        regressor = regressor.fit(X_train, y_train)
        preds = regressor.predict(X_cv)
        mape_r = mape(y_cv, preds)
        print("MAPE of (fold %d/%d) of %s is : %f" % (i+1, n, name, mape_r))

    # predict on test with the regressor from the last fold
    predict_res = regressor.predict(sample_t[:, features])
    preds_on_test = DataFrame(list(zip(sample_id, predict_res)),
                              columns=["ID", "CODIS"])
    # astype returns a new Series; assign it back so the cast takes effect
    preds_on_test['ID'] = preds_on_test['ID'].astype(int)

    # save predictions
    store_csv(preds_on_test, name + ".csv")
    return predict_res
def classify(X,y,cv):
    """Cross-validate and fit an ExtraTreesClassifier, printing binary metrics.

    Prints the mean ``cv``-fold cross-validation accuracy, then fits on all of
    (X, y) and prints confusion counts and derived rates computed from
    predictions on the training data itself. Only labels 0 and 1 are counted;
    any other label value is ignored by the tallies.

    Args:
        X: feature matrix.
        y: labels, iterated in the same order as the rows of X.
        cv: cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        The classifier fitted on the full data.
    """
    def ratio(numerator, denominator):
        # An empty confusion cell sum used to raise ZeroDivisionError;
        # report NaN instead so the remaining metrics are still printed.
        if not denominator:
            return float('nan')
        return float(numerator) / denominator

    clf = ExtraTreesClassifier()
    score = cross_val_score(clf, X, y, cv=cv)
    print('%s-fold cross validation accuracy: %s' % (cv, sum(score)/score.shape[0]))

    clf = clf.fit(X, y)
    preds = clf.predict(X)  # predict once and reuse for counter and tallies
    print('predictions counter')
    print(Counter(preds))

    fp = 0
    tp = 0
    fn = 0
    tn = 0
    # zip walks the labels positionally, independent of any pandas index on y
    for actual, predicted in zip(y, preds):
        if actual == predicted:
            if predicted == 0:
                tn += 1
            elif predicted == 1:
                tp += 1
        elif predicted == 1:
            fp += 1
        elif predicted == 0:
            fn += 1

    print('correct positives: %s' % tp)
    print('correct negatives: %s' % tn)
    print('false positives: %s' % fp)
    print('false negatives: %s' % fn)
    print('precision: %s' % ratio(tp, tp+fp))
    print('recall (tp)/(tp+fn): %s' % ratio(tp, tp+fn))
    print('false positive rate (fp)/(fp+tn): %s' % ratio(fp, fp+tn))
    print('false positive rate2 (fp)/(fp+tp): %s' % ratio(fp, fp+tp))
    print('prediction accuracy: %s%s\n' % (ratio(100*float(tp+tn), tp+tn+fp+fn), '%'))
    return clf
#print ",dfeatures[features][:-1]\n",dfeatures[features][:-1] pd.set_option('display.max_columns', None) print "ALL columns of dfeatures[features]" print dfeatures[features].head(1) # create a test and training set x_train, x_test, y_train, y_test = train_test_split( dfeatures[features], dfeatures.author_num.values, test_size=0.4, random_state=123) x, y = dfeatures[features], dfeatures.author_num.values # CLASSIFIER etclf = ExtraTreesClassifier(n_estimators=20) etclf.fit(x_train, y_train) scores = cross_val_score(etclf, x, y) print scores.mean() # Print Confusion Matrix print metrics.confusion_matrix(etclf.predict(x_test), y_test) # print authors """ # # PREVIOUS RESULT 0.671469386087 ############# RESULT WITH ALL FEATURES ############ /Users/jhave/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py:401: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3. % (min_labels, self.n_folds)), Warning) 0.148101533384 [[0 0 0 ..., 0 0 0]
# Public import path; the private ``sklearn.ensemble.forest`` module is not
# available in newer scikit-learn releases.
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import metrics
from sklearn import preprocessing

authorship = read_csv("http://people.stern.nyu.edu/jsimonof/AnalCatData/Data/Comma_separated/authorship.csv")
authors = list(set(authorship.Author.values))
le = preprocessing.LabelEncoder()
le.fit(authors)
authorship["Author_num"] = le.transform(authorship["Author"])

# What are some of the stop words we're looking at?
# Every column except the author name and its numeric encoding is a feature.
features = list(authorship.columns)
features.remove("Author")
features.remove("Author_num")

# Create a random variable (random forests work best with a random variable)
# and create a test and training set
# NOTE(review): "random" is added after `features` was built, so the model
# never sees it - confirm whether that is intended.
# One value per row, sized from the frame instead of a hard-coded 841.
authorship["random"] = [random.random() for i in range(len(authorship))]
x_train, x_test, y_train, y_test = train_test_split(
    authorship[features], authorship.Author_num.values,
    test_size=0.4, random_state=123
)

# Fit Model
etclf = ExtraTreesClassifier(n_estimators=20)
etclf.fit(x_train, y_train)

# Confusion matrix: scikit-learn expects (y_true, y_pred), giving true
# classes as rows and predicted classes as columns.
# The bare expression only displays in an interactive session.
metrics.confusion_matrix(y_test, etclf.predict(x_test))
def etree_classify(X,Y):
    """Fit a 500-tree, depth-10 ExtraTreesClassifier on (X, Y) and return it."""
    forest_settings = dict(
        n_estimators=500,
        max_depth=10,
        criterion='gini',
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=None,
        bootstrap=False,
        oob_score=False,
        n_jobs=-1,
    )
    fitted_forest = ExtraTreesClassifier(**forest_settings)
    fitted_forest.fit(X, Y)
    return fitted_forest
# also tested this:
# svm.SVC(kernel='linear', C=1.0), GaussianNB()
# doesn't improve and takes long

# running cross-validation score on all classifiers
for clf in classifiers:
    score = cross_val_score(clf, X, y, cv=cv)
    # NOTE(review): the "+/-" figure is std / 2; a two-sigma band would be
    # std * 2 - confirm which was intended.
    print("%s \n Accuracy: %0.2f (+/- %0.2f)\n" % (clf, score.mean(), score.std() / 2))

# now let's go to the out-of-sample test
# Work on an explicit copy so filling Fare does not write into a slice of
# `test` (pandas chained-assignment warning / silent no-op).
testX = test[['Sex01','Fare','SibSp','Parch','Pclass']].copy()
medianFare = testX.Fare.median()
testX['Fare'] = testX['Fare'].fillna(medianFare)

# print results to CSV files for Kaggle submission: one file per model,
# written in the same order as before
for csvName, makeClassifier in (('ETClf.csv', ExtraTreesClassifier),
                                ('RFClf.csv', RandomForestClassifier),
                                ('DTClf.csv', DecisionTreeClassifier)):
    clf = makeClassifier()
    clf.fit(X, y)
    # Assign the array positionally; wrapping it in pd.Series would align on
    # the index and yield NaN whenever `test` lacks a default RangeIndex.
    test['Survived'] = clf.predict(testX)
    test[['PassengerId','Survived']].to_csv(csvName, index=False)
class BestClassifiers(object):
    '''
    Per-pattern classifier (an SVC or an ExtraTreesClassifier).
    It loads a previously saved model, or trains and saves one, and enables
    prediction with new data.
    '''

    def __init__(self, patternEnum=PatternEnum.EVENTUALLY):
        '''
        Initialize the pattern's object with the corresponding model file name,
        the classifier configured for that pattern and its pre-processing
        method.

        For a pattern not listed below, ``clf`` and ``maxRandState`` are left
        unset and ``preProcessMethod`` stays "NONE".
        '''
        self.patternEnum = patternEnum
        self.pattern = Pattern(patternEnum)
        modelFile = str(patternEnum.order) + "_" + patternEnum.getFullName() + ".pkl"
        self.modelFile = config.PROJECT_ROOT + os.sep + "models" + os.sep + modelFile
        self.preProcessMethod = "NONE"
        # maxRandState is the random seed used to shuffle data for training
        if (patternEnum == PatternEnum.EVENTUALLY):
            self.maxRandState = 196558
            self.preProcessMethod = "NORMALIZE"
            self.clf = self._makeSVC(C=1000.0, gamma=0.1)
        elif (patternEnum == PatternEnum.ALWAYS):
            self.maxRandState = 124255
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=50)
        elif (patternEnum == PatternEnum.FOLLOWS):
            self.maxRandState = 196588
            self.preProcessMethod = "NORMALIZE"
            self.clf = self._makeSVC(C=1000.0, gamma=1.0)
        elif (patternEnum == PatternEnum.PRECEDES):
            self.maxRandState = 187708
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=100)
        elif (patternEnum == PatternEnum.NEVER):
            self.maxRandState = 182526
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=200)
        elif (patternEnum == PatternEnum.STEADY_STATE):
            self.maxRandState = 119746
            self.preProcessMethod = "NORMALIZE"
            self.clf = self._makeSVC(C=10000000.0, gamma=0.0001)
        elif (patternEnum == PatternEnum.UNTIL):
            self.maxRandState = 114007
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=30)
        elif (patternEnum == PatternEnum.INFINITELY_OFTEN):
            self.maxRandState = 150000
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=10, max_features='log2')
        elif (patternEnum == PatternEnum.NEXT):
            self.maxRandState = 173977
            self.preProcessMethod = "NORMALIZE"
            self.clf = self._makeSVC(C=100.0, gamma=1.0)
        elif (patternEnum == PatternEnum.RELEASE):
            self.maxRandState = 105454
            self.preProcessMethod = "SCALE"
            self.clf = self._makeSVC(C=10000000.0, gamma=0.0001)
        elif (patternEnum == PatternEnum.WEAK_UNTIL):
            self.maxRandState = 163090
            self.preProcessMethod = "NONE"
            self.clf = self._makeExtraTrees(n_estimators=30)

    @staticmethod
    def _makeSVC(C, gamma):
        "Build the RBF SVC shared by the SVM patterns; only C and gamma vary."
        return SVC(C=C,
                   cache_size=200,
                   class_weight=None,
                   coef0=0.0,
                   decision_function_shape=None,
                   degree=3,
                   gamma=gamma,
                   kernel='rbf',
                   max_iter=-1,
                   probability=False,
                   random_state=None,
                   shrinking=True,
                   tol=0.001,
                   verbose=False)

    @staticmethod
    def _makeExtraTrees(n_estimators, max_features=None):
        "Build the extra-trees forest shared by the tree patterns; only size and max_features vary."
        return ExtraTreesClassifier(bootstrap=False,
                                    class_weight=None,
                                    criterion='gini',
                                    max_depth=None,
                                    max_features=max_features,
                                    max_leaf_nodes=None,
                                    min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0,
                                    n_estimators=n_estimators,
                                    n_jobs=1,
                                    oob_score=False,
                                    random_state=0,
                                    verbose=0,
                                    warm_start=False)

    def getModel(self):
        """Return ``(classifier, preprocessor)`` for this pattern.

        The pair is unpickled from ``self.modelFile`` when that file can be
        opened. On OSError/IOError the classifier is trained on the shuffled
        pattern data, the pair is pickled to ``self.modelFile`` and returned;
        errors raised during that training or saving propagate to the caller.
        Any other error while loading is printed and ``None`` is returned.
        """
        try:
            # NOTE: pickle executes code on load; the model file must only
            # ever come from this project's own "models" directory.
            # The context manager closes the handle even if loading fails.
            with open(self.modelFile, "rb") as modelHandle:
                clf, preprocessor = pickle.load(modelHandle)
            return clf, preprocessor
        except (OSError, IOError):
            # Model does not exist, first train then save it
            X, preprocessor = processData(self.pattern.feature,
                                          self.preProcessMethod)
            # shuffle data
            shuffled_X, shuffled_y = shuffle(X, self.pattern.y,
                                             random_state=self.maxRandState)
            self.clf.fit(shuffled_X, shuffled_y)
            # save the model; closing the handle guarantees the pickle is
            # flushed to disk before anything reads it back
            with open(self.modelFile, "wb") as modelHandle:
                pickle.dump((self.clf, preprocessor), modelHandle)
            return self.clf, preprocessor
        except Exception as e:
            print(e)

    def predict(self, properties):
        """Predict the target for ``properties`` with this pattern's model.

        When the model comes with a preprocessor, ``properties`` is first
        transformed with it, as was done for the training data.
        """
        clf, preprocessor = self.getModel()
        if preprocessor:
            # apply the pre-processing method done for training data
            properties = preprocessor.transform(properties)
        targetMC = clf.predict(properties)
        return targetMC
tp += 1 elif preds[a] == 1: fp += 1 elif preds[a] == 0: fn += 1 print 'correct positives:', tp print 'correct negatives:', tn print 'false positives:', fp print 'false negatives:', fn extra_trees = ExtraTreesClassifier() extra_score = cross_val_score(extra_trees, X, y, cv=i) print '\nextra trees %s-fold cross validation accuracy: %s' % (i, sum(extra_score)/extra_score.shape[0]) extra_fit = extra_trees.fit(X, y) print 'Feature Importances %s' % (extra_fit.feature_importances_) for f in extra_fit.feature_importances_: print '{}: {}'.format(next(features), f) X_for_preds = extra_fit.transform(X, threshold=min(extra_fit.feature_importances_)) preds = extra_fit.predict(X_for_preds) print 'predictions counter %s' % (Counter(extra_fit.predict(X_for_preds))) fp = 0 tp = 0 fn = 0 tn = 0 for a in range(len(y)): if y[a] == preds[a]: if preds[a] == 0: tn += 1
# Apply Some Featuring
poly_reg = PolynomialFeatures(degree=1)

# Transform into numpy object
x_train = poly_reg.fit_transform(X_train)
# Reuse the transformer fitted on the training data instead of re-fitting it
# on the test set.
X_test = poly_reg.transform(X_test)
# `.ix` was removed in pandas 1.0; `.iloc[:, 0]` selects the first column by
# position.
y_test = np.array(y_test.iloc[:, 0])
y_train = np.array(y_train.iloc[:, 0])

# Build model with good params
# NOTE(review): `min_impurity_split` exists only in some scikit-learn
# versions - confirm the targeted release before upgrading.
model = ExtraTreesClassifier(bootstrap=False,
                             class_weight=None,
                             criterion='entropy',
                             max_depth=None,
                             max_features=0.6,
                             max_leaf_nodes=None,
                             min_impurity_split=1e-07,
                             min_samples_leaf=1,
                             min_samples_split=4,
                             min_weight_fraction_leaf=0.0,
                             n_estimators=100,
                             n_jobs=1,
                             oob_score=False,
                             random_state=None,
                             verbose=0,
                             warm_start=False)

# Fit the model
model.fit(x_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Scoring: `regression` is a flag defined outside this section
if regression:
    print('Score on test set:', mean_absolute_error(y_test, y_pred))
else:
    print('Score on test set:', accuracy_score(y_test, y_pred))