def predict_TestData(Food_df,People_df):
    """Compare ExtraTrees and LinearSVC on a random people-vs-food split.

    Each frame is split at random (threshold 0.5 on a uniform draw per row)
    into a training and a test part.  People rows are labelled 1 and food
    rows 0.  Both classifiers are fitted on the training part and their
    test predictions are written to ``prediction_for_TestData.csv``.

    Args:
        Food_df: DataFrame of food samples; feature columns start at the
            third column and a ``urls`` column must exist.
        People_df: DataFrame of people samples with the same layout.
    """
    # Draw the food mask first, then the people mask, to keep the RNG
    # consumption order stable.
    food_in_train = rand(len(Food_df)) > .5
    people_in_train = rand(len(People_df)) > .5

    train_people = People_df[people_in_train]
    train_food = Food_df[food_in_train]
    test_people = People_df[~people_in_train]
    test_food = Food_df[~food_in_train]

    TrainX_df = pd_concat([train_people, train_food], axis=0)
    TestX_df = pd_concat([test_people, test_food], axis=0)

    # Positional column slice; .iloc replaces the removed .ix indexer.
    TrainX = TrainX_df.iloc[:, 2:].values
    TestX = TestX_df.iloc[:, 2:].values

    TrainY = concatenate([ones(len(train_people)), zeros(len(train_food))])
    TestY = concatenate([ones(len(test_people)), zeros(len(test_food))])

    # min_samples_split=2 grows the same fully developed trees as the old
    # value 1, but is accepted by every scikit-learn release.
    ET_classifier = ExtraTreesClassifier(n_estimators=50, max_depth=None,
                                         min_samples_split=2, random_state=0)
    ET_classifier.fit(TrainX, TrainY)
    ET_prediction = ET_classifier.predict(TestX)

    LinSVC_classifier = svm.LinearSVC()
    LinSVC_classifier.fit(TrainX, TrainY)
    LinSVC_predict = LinSVC_classifier.predict(TestX)

    a = DataFrame()
    a["url"] = TestX_df.urls.values
    a["answer"] = TestY
    a["ET_predict"] = ET_prediction
    a["LinSVC_predict"] = LinSVC_predict
    a.to_csv("prediction_for_TestData.csv")
class Identifier:
    """Recognise a known item in an image from ORB descriptors.

    Every ORB descriptor of the image is classified individually and the
    item that collects the most votes among the allowed candidates wins.
    """

    def __init__(self, grabable=None, clf=None):
        """Set up the feature extractor and the classifier.

        Args:
            grabable: Optional set of grabable item ids (stored, currently
                unused).  Defaults to a fresh empty set per instance.
            clf: Optional pre-trained classifier.  When None, a new
                ExtraTreesClassifier is trained from ``labels.pkl`` and
                ``features.pkl`` in the working directory.
        """
        # A fresh set per instance; a literal default would be shared.
        self.grabable = set() if grabable is None else grabable  # TODO if we care to, not used at the mo
        self.orb = cv2.ORB(nfeatures=1000)  # ,nlevels = 20, scaleFactor = 1.05)
        self.items = [
            "champion_copper_plus_spark_plug", "cheezit_big_original",
            "crayola_64_ct", "dove_beauty_bar",
            "elmers_washable_no_run_school_glue",
            "expo_dry_erase_board_eraser", "feline_greenies_dental_treats",
            "first_years_take_and_toss_straw_cups",
            "genuine_joe_plastic_stir_sticks",
            "highland_6539_self_stick_notes",
            "kong_air_dog_squeakair_tennis_ball", "kong_duck_dog_toy",
            "kong_sitting_frog_dog_toy", "kygen_squeakin_eggs_plush_puppies",
            "mark_twain_huckleberry_finn", "mead_index_cards",
            "mommys_helper_outlet_plugs", "munchkin_white_hot_duck_bath_toy",
            "one_with_nature_soap_dead_sea_mud", "oreo_mega_stuf",
            "paper_mate_12_count_mirado_black_warrior",
            "rollodex_mesh_collection_jumbo_pencil_cup",
            "safety_works_safety_glasses",
            "sharpie_accent_tank_style_highlighters", "stanley_66_052",
        ]
        # Identity test: truth-testing an ensemble estimator goes through
        # its __len__, which is not a reliable "was one supplied" check.
        if clf is None:
            print("Training new classifier")
            # NOTE(review): class_weight='subsample' is only understood by
            # some scikit-learn releases -- confirm against the pinned one.
            self.clf = ExtraTreesClassifier(min_samples_split=2, n_jobs=-1,
                                            n_estimators=150,
                                            class_weight='subsample')
            labels = np.ascontiguousarray(joblib.load('labels.pkl'))
            features = np.ascontiguousarray(joblib.load('features.pkl'),
                                            dtype=np.float64)
            features = preprocessing.scale(features)
            self.clf.fit(features, labels)
        else:
            self.clf = clf

    def identify(self, im, possibilites):
        """Return the most voted item id among ``possibilites``.

        Args:
            im: Image passed to ORB ``detectAndCompute``; may be None.
            possibilites: Iterable of candidate item ids.

        Returns:
            The winning item id, -1 when no candidate received a vote (or
            no descriptor was found), or None when ``im`` is None.
        """
        if im is None:
            print("Image to recognize is None")
            return None

        _, descriptors = self.orb.detectAndCompute(im, None)
        # Nothing to scale or classify when no keypoint was detected.
        if descriptors is None or len(descriptors) == 0:
            return -1

        scaled = preprocessing.scale(np.array(descriptors, dtype=np.float64))
        votes = Counter(self.clf.predict(scaled))
        candidates = set(votes).intersection(possibilites)
        if not candidates:
            return -1

        item = max(candidates, key=lambda k: votes[k])
        print(self.items[item])
        return item
def stack(X, y, X_test, y_test):
    """Stack an SGD classifier on one-hot encoded ExtraTrees leaf indices.

    The training data is split in half: the first half fits the tree
    ensemble and the leaf encoder, the second half fits the linear model
    on the encoded leaves.  A confusion summary, precision and recall on
    the test set are printed.

    Args:
        X, y: Training features and binary labels.
        X_test, y_test: Held-out features and binary labels.

    Returns:
        The result of ``roc_curve(y_test, prob)`` where ``prob`` is the
        predicted probability of the positive class.
    """
    X, X1, y, y1 = train_test_split(X, y, test_size=0.5)

    # min_samples_split=2 yields the same trees as the former value 1 and
    # is valid on every scikit-learn release.
    clf1 = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)
    clf2 = linear_model.SGDClassifier(loss='log')
    enc = OneHotEncoder()

    clf1.fit(X, y)
    enc.fit(clf1.apply(X))
    clf2.fit(enc.transform(clf1.apply(X1)), y1)

    # Encode the test leaves once and reuse them for both outputs.
    test_leaves = enc.transform(clf1.apply(X_test))
    prob = clf2.predict_proba(test_leaves.toarray())[:, 1]
    res = clf2.predict(test_leaves)

    tp, tn, fp, fn = 0, 0, 0, 0
    for value, prediction in zip(y_test, res):
        if prediction and value:
            tp += 1
        elif prediction:
            fp += 1
        elif value:
            fn += 1
        else:
            tn += 1

    print('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
    print("Precision Score : %f" % metrics.precision_score(y_test, res))
    print("Recall Score : %f" % metrics.recall_score(y_test, res))
    return roc_curve(y_test, prob)
def learn(f):
    """Evaluate and train an ExtraTrees classifier on labelled executables.

    Rows of the module-level ``raw_data`` frame whose label is 'unknown'
    or whose 'file type' is not 'EXECUTE' are discarded.  The function
    prints a 10-fold cross-validated accuracy, a hold-out accuracy and the
    20 most important features.

    Args:
        f: List of feature column names to use.

    Returns:
        The classifier fitted on the training part of the hold-out split.
    """
    # cross_val_score moved modules between scikit-learn releases.
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        from sklearn.cross_validation import cross_val_score

    print('testing classifier')
    data = raw_data[raw_data['label'] != 'unknown']
    data = data[data['file type'] == 'EXECUTE']

    X = data[list(f)].values
    y = np.array(data['label'].tolist())

    clf = ExtraTreesClassifier(n_estimators=100)
    scores = cross_val_score(clf, X, y, cv=10)
    print("predicted accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    seed = 3301
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
    clf.fit(X_train, y_train)
    scores = clf.score(X_test, y_test)
    print("actual accuracy: %0.2f" % scores)

    # sorted() instead of zip(...).sort(): zip is not a list on Python 3.
    importances = sorted(zip(f, clf.feature_importances_),
                         key=lambda pair: pair[1], reverse=True)
    for name, weight in importances[0:20]:
        print("%s %s" % (name.ljust(30), weight))

    return clf
def plotFeatureImportances(x, y, fieldNames, numTrees = 100):
    """Fit an ExtraTrees forest, then print and plot feature importances.

    Args:
        x: Feature matrix.
        y: Target labels.
        fieldNames: Sequence of feature names, indexed like the columns
            of ``x``.
        numTrees: Number of trees in the forest.
    """
    print(fieldNames)

    # fit -- feature_importances_ is always available on a fitted forest,
    # so the removed ``compute_importances`` flag is not passed.
    forest = ExtraTreesClassifier(n_estimators=numTrees, random_state=0)
    forest.fit(x, y)

    # get importances
    importances = forest.feature_importances_
    print(sum(importances))
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]

    # present
    numFeatures = len(importances)
    positions = list(range(numFeatures))
    print('feature ranking:')
    for rank, idx in enumerate(indices, start=1):
        print('%d. feature %d (%s) has importance %f'
              % (rank, idx, fieldNames[idx], importances[idx]))

    xtickLabels = [fieldNames[i] for i in indices]
    pylab.figure()
    pylab.title('Feature Importances From A Random Forest with %s trees' % numTrees)
    pylab.bar(positions, importances[indices], color='r',
              yerr=std[indices], align='center')
    pylab.xticks(positions, xtickLabels)
    pylab.xlim([-1, numFeatures])
    pylab.show()
def main():
    """Train an ExtraTrees classifier on raster samples and evaluate it.

    Training samples are read from ``_cached_training.json`` when that
    file can be opened; otherwise they are extracted with
    ``load_training_vector`` and written to the cache for the next run.
    """
    # Define the known data points or "training" data
    explanatory_fields = "d100 dd0 dd5 fday ffp gsdd5 gsp map mat_tenths mmax_tenths mmindd0 mmin_tenths mtcm_tenths mtwm_tenths sday".split()
    explanatory_rasters = [os.path.join(TRAINING_DIR, "current_" + r + ".img")
                           for r in explanatory_fields]
    response_shapes = os.path.join(TRAINING_DIR, "DF.shp")

    # Load the training rasters using the sampled subset
    try:
        # Context manager so the cache handle is always closed.
        with open("_cached_training.json") as fh:
            cached = json.load(fh)
    except IOError:
        train_xs, train_y = load_training_vector(
            response_shapes, explanatory_rasters, response_field='GRIDCODE')
        cache = {'train_xs': train_xs.tolist(), 'train_y': train_y.tolist()}
        with open("_cached_training.json", 'w') as fh:
            fh.write(json.dumps(cache))
    else:
        train_xs = np.array(cached['train_xs'])
        train_y = np.array(cached['train_y'])

    print(train_xs.shape, train_y.shape)

    # Train the classifier
    clf = ExtraTreesClassifier(n_estimators=120, n_jobs=3)
    clf.fit(train_xs, train_y)
    print(clf)

    evaluate_clf(clf, train_xs, train_y, feature_names=explanatory_fields)
def ET_classif(features_df=None, labels_df=None):
    '''Score features by ExtraTreesClassifier importance for SelectKBest.

    SelectKBest expects a scoring function that returns a (scores,
    p-values) pair.  This function fits an ExtraTreesClassifier and
    returns its feature importances in both positions.

    Args:
        features_df: Pandas dataframe of the features used for fitting.
        labels_df: Pandas dataframe of the labels being predicted.

    Returns:
        Two arrays holding the same feature importance scores, one entry
        per feature.
    '''
    forest_settings = dict(
        n_estimators=500,
        bootstrap=False,
        oob_score=False,
        max_features=.10,
        min_samples_split=10,
        min_samples_leaf=2,
        criterion='gini',
        random_state=42,
    )
    forest = ExtraTreesClassifier(**forest_settings)
    forest.fit(features_df, labels_df)

    # Read the attribute once per slot so each slot gets its own array.
    score_slot = forest.feature_importances_
    pvalue_slot = forest.feature_importances_
    return score_slot, pvalue_slot
def train_tree():
    """Train the ensemble of confidence-decorated tree models.

    Three single-tree ExtraTrees models are fitted on the adaptive data
    sets ``adapt1``..``adapt3``; one ExtraTrees model and one three-tree
    RandomForest are fitted on the non-adaptive ``training`` set.  Every
    model is wrapped in a ``ConfidenceDecorator`` built from the
    non-adaptive vectors.

    Returns:
        A pair ``(models, word_vector_hash)`` where ``models`` is
        ``[model1, model2, model3, model4, model5]``.
    """
    def fit_single_tree(vectors, labels):
        # One fully grown extra tree; min_samples_split=2 is equivalent to
        # the former value 1 and accepted by every scikit-learn release.
        xtrees = ExtraTreesClassifier(n_estimators=1, max_depth=None,
                                      min_samples_split=2, random_state=0)
        return xtrees.fit(vectors, labels)

    word_vector_hash = knn.word_vectors(training, vector_length, False)

    adaptive_models = []
    for adaptive_set in (adapt1, adapt2, adapt3):
        sku_vectors, class_labels, _, sku_hash = knn.data(
            adaptive_set, vector_length, 'all', word_vector_hash)
        adaptive_models.append(fit_single_tree(sku_vectors, class_labels))

    # Non-adaptive data
    sku_vectors, class_labels, _, sku_hash = knn.data(
        training, vector_length, False, word_vector_hash)

    model2, model3, model4 = [
        ConfidenceDecorator(model, sku_vectors, class_labels)
        for model in adaptive_models
    ]

    model1 = ConfidenceDecorator(fit_single_tree(sku_vectors, class_labels),
                                 sku_vectors, class_labels)

    forest = RandomForestClassifier(n_estimators=3, max_depth=None,
                                    min_samples_split=2, random_state=0)
    model5 = ConfidenceDecorator(forest.fit(sku_vectors, class_labels),
                                 sku_vectors, class_labels)

    models = [model1, model2, model3, model4, model5]
    return models, word_vector_hash
def feature_important(filename):
    """Rank and plot the feature importances of the records in a CSV file.

    Each record returned by ``read_csv`` supplies its ``decisions`` as the
    feature vector and its ``objective`` as the label.
    """
    from sklearn.datasets import make_classification
    from sklearn.ensemble import ExtraTreesClassifier

    records = read_csv(filename)
    X = [record.decisions for record in records]
    y = [record.objective for record in records]

    # Build a forest and compute the feature importances
    model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    model.fit(X, y)
    scores = model.feature_importances_
    per_tree_scores = [est.feature_importances_ for est in model.estimators_]
    spread = np.std(per_tree_scores, axis=0)
    order = np.argsort(scores)[::-1]

    width = len(X[0])

    # Print the feature ranking
    print("Feature ranking:")
    for position in range(width):
        feature = order[position]
        print("%d. feature %d (%f)" % (position + 1, feature, scores[feature]))

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(width), scores[order],
            color="r", yerr=spread[order], align="center")
    plt.xticks(range(width), order)
    plt.xlim([-1, width])
    plt.show()
def train_random_forest(X_train,y_train,**kwargs):
    """Train a bootstrapped ExtraTrees classifier, optionally grid-searched.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        **kwargs: Optional settings:
            n_estimators (default 300), max_features (default 'auto'),
            n_jobs (default -1), verbose (default 0) and tuned_params
            (default None), a parameter grid for GridSearchCV.  Other
            keys are ignored.

    Returns:
        The fitted classifier; when ``tuned_params`` is given, the best
        estimator found by a 5-fold ROC-AUC grid search.
    """
    from sklearn.ensemble import ExtraTreesClassifier

    n_estimators = kwargs.pop('n_estimators',300)
    max_features = kwargs.pop('max_features','auto')
    n_jobs = kwargs.pop('n_jobs',-1)
    verbose = kwargs.pop('verbose',0)
    tuned_params = kwargs.pop('tuned_params',None)

    # For classifiers 'auto' means 'sqrt'; newer scikit-learn releases only
    # accept the explicit spelling.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    # initialize baseline classifier
    clf = ExtraTreesClassifier(n_estimators=n_estimators, random_state=42,
                               n_jobs=n_jobs, verbose=verbose,
                               criterion='gini', max_features=max_features,
                               oob_score=True, bootstrap=True)

    if tuned_params is None:
        # train with the specified parameters (no tuning)
        clf.fit(X_train, y_train)
        return clf

    # optimize if desired; GridSearchCV moved modules between releases.
    try:
        from sklearn.model_selection import GridSearchCV
    except ImportError:
        from sklearn.grid_search import GridSearchCV
    cv = GridSearchCV(clf, tuned_params, cv=5, scoring='roc_auc',
                      n_jobs=n_jobs, verbose=verbose, refit=True)
    cv.fit(X_train, y_train)
    return cv.best_estimator_
def tree_based_feature_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Keep the columns of ``x`` ranked highest by an ExtraTrees forest.

    The full ranking (using the names in ``self.features``) is printed,
    then the 13 top-ranked columns are selected.

    Args:
        x: Feature matrix with one column per entry of ``self.features``.
        y: Target labels.

    Returns:
        ``x`` restricted to the selected columns, in ranking order.
    """
    model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    model.fit(x, y)

    scores = model.feature_importances_
    print(scores)
    per_tree_scores = [est.feature_importances_ for est in model.estimators_]
    spread = np.std(per_tree_scores, axis=0)  # only needed for plotting
    order = np.argsort(scores)[::-1]

    print("Feature ranking:")
    for position in range(len(self.features)):
        feature = order[position]
        print("%d. feature %d: %s (%f)"
              % (position + 1, feature, self.features[feature], scores[feature]))

    # Slice end is top_n + 1, i.e. 13 columns are kept.
    top_n = 12
    chosen = order[0:top_n + 1]
    print(chosen)
    print(self.features[chosen])
    return x[:, chosen]
def top_importances(features_df=None, labels_df=None, top_N=10):
    '''Find the top N feature importances using the ExtraTreesClassifier.

    Args:
        features_df: Pandas dataframe of features used to predict.
        labels_df: Pandas dataframe of labels to be predicted.
        top_N: Integer number of most important features to return.

    Returns:
        Pandas dataframe indexed by feature name with a single
        'Importances' column, holding the top N rows in descending order.
    '''
    reducer = ExtraTreesClassifier(n_estimators=2000, bootstrap=False,
                                   oob_score=False, max_features=.10,
                                   min_samples_split=10, min_samples_leaf=2,
                                   criterion='gini')
    reducer.fit(features_df, labels_df)

    scores = pd.DataFrame(reducer.feature_importances_,
                          index=features_df.columns,
                          columns=['Importances'])
    # DataFrame.sort was removed from pandas; sort_values replaces it.
    scores = scores.sort_values(by='Importances', ascending=False)
    return scores.iloc[0:top_N]
def plotImportance(X,y):
    """Save a bar chart of ExtraTrees feature importances.

    Bars are sorted by decreasing importance, labelled with the column
    names of ``X`` and carry the per-tree standard deviation as error
    bars.  The figure is written to ``featuresel.pdf``.

    Args:
        X: Feature frame; must expose ``shape`` and ``columns``.
        y: Target labels.
    """
    model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    model.fit(X, y)

    scores = model.feature_importances_
    per_tree_scores = [est.feature_importances_ for est in model.estimators_]
    spread = np.std(per_tree_scores, axis=0)
    order = np.argsort(scores)[::-1]

    width = X.shape[1]
    positions = range(width)

    # Plot the feature importances of the forest
    plt.figure(figsize=(20,15))
    plt.title("Feature importances")
    plt.bar(positions, scores[order],
            color="r", yerr=spread[order], align="center")
    plt.xticks(positions, X.columns[order], rotation=90)
    plt.xlim([-1, width])
    plt.savefig('featuresel.pdf')
class FeaturesSelectionRandomForests(object):
    """Select features whose ExtraTrees importance exceeds a threshold."""

    def __init__(self, n_estimators = 100, feature_importance_th = 0.005):
        """Store the forest size and the importance threshold.

        Args:
            n_estimators: Number of trees in the forest.
            feature_importance_th: Features with an importance strictly
                greater than this value are kept.
        """
        self.n_estimators = n_estimators
        self.feature_importance_th = feature_importance_th

    def fit(self, X, y, n_estimators = None, feature_importance_th = None):
        """Fit the forest and compute the mask of selected features.

        Args:
            X: Feature matrix.
            y: Target labels.
            n_estimators: Optional override of the stored forest size.
            feature_importance_th: Optional override of the stored
                importance threshold.
        """
        import numbers

        if n_estimators is not None:
            assert isinstance(n_estimators, numbers.Real)
            self.n_estimators = n_estimators
        if feature_importance_th is not None:
            assert isinstance(feature_importance_th, numbers.Real)
            self.feature_importance_th = feature_importance_th

        # filter features by forest model, using the configured settings
        # instead of hard-coded constants
        self.trees = ExtraTreesClassifier(n_estimators=int(self.n_estimators))
        self.trees.fit(X, y)
        selected = self.trees.feature_importances_ > self.feature_importance_th
        self.features_mask = np.where(selected)[0]

    def plot_features_importance(self):
        """Show a bar plot of the fitted forest's feature importances."""
        pd.DataFrame(self.trees.feature_importances_).plot(kind='bar')
        plt.show()

    def transform(self, X):
        """Return the columns of ``X`` selected by the last ``fit``."""
        assert hasattr(self,"features_mask")
        return X[:, self.features_mask]
def get_important_features(Xtrain, Ytrain, n=250, threshold=0.01, verbose=False):
    """Rank features by ExtraTrees importance and keep the relevant ones.

    The entirety of the provided data is used to train the ensemble.

    Args:
        Xtrain: Training data.
        Ytrain: Training labels.
        n: Number of ensemble members.
        threshold: Minimum importance for a feature to be kept.
        verbose: If true, print the resulting ranking.

    Returns:
        A list of ``[feature_index, importance]`` pairs, sorted by
        decreasing importance, for features at or above ``threshold``.
    """
    # Train and fit tree classifier ensemble
    classifier = ExtraTreesClassifier(n_estimators=n, random_state=0)
    classifier.fit(Xtrain, Ytrain)

    # Compute important features
    importances = classifier.feature_importances_
    indices = np.argsort(importances)[::-1]

    # A real list (not a lazy filter object) so len() and indexing work
    # on every Python version.
    ranking = [[idx, importances[idx]]
               for idx in indices
               if importances[idx] >= threshold]

    if verbose:
        for position, (idx, weight) in enumerate(ranking, start=1):
            print("%d.  %s %s" % (position, idx, weight))

    return ranking
def select_with_forest(X, y, n_trees=10, treshold=0.01):
    """Rank the predictors of ``X`` by entropy-based ExtraTrees importance.

    Every column of a copy of ``X`` is label-encoded before fitting.  The
    number of features at or above ``treshold`` is printed.

    Args:
        X: DataFrame of predictors.
        y: Target labels.
        n_trees: Number of trees in the forest.
        treshold: Importance level used for the printed count.

    Returns:
        DataFrame with columns 'predictor name' and 'importance', sorted
        by decreasing importance.
    """
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np

    # encode labels (str -> int) on a copy, leaving the caller's frame alone
    encoder = LabelEncoder()
    X = X.copy()
    for column in X.columns:
        encoder.fit(X[column].unique())
        X[column] = encoder.transform(X[column])

    # train the classifier:
    model = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    model.fit(X, y)
    n_selected = np.sum(model.feature_importances_ >= treshold)
    print('number of selected features: ', n_selected)

    # collect and order the importances:
    table = pd.DataFrame()
    table['predictor name'] = X.columns.tolist()
    table['importance'] = model.feature_importances_
    return table.sort_values(by='importance', ascending=False)
def kfold_cv(X_train, y_train,idx,k):
    """Score an ExtraTrees classifier on the first stratified fold.

    The loop breaks after the first fold, so exactly one train/test split
    is evaluated.  The xgboost averaging loop runs ``m`` times and ``m``
    is 0, so it is currently inactive.

    Args:
        X_train: Feature matrix, indexable by row-index arrays.
        y_train: Labels aligned with ``X_train``.
        idx: Identifier array aligned with ``X_train``.
        k: Number of folds.

    Returns:
        Tuple ``(ypred, yreal, idx)`` for the evaluated fold: predicted
        probabilities from column 1 of ``predict_proba``, true labels and
        the identifiers of the test rows.
    """
    folds = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for fit_rows, eval_rows in folds:
        count += 1
        X_train_cv = X_train[fit_rows, :]
        X_test_cv = X_train[eval_rows, :]
        gc.collect()
        y_train_cv = y_train[fit_rows]
        y_test_cv = y_train[eval_rows]

        y_pred = np.zeros(X_test_cv.shape[0])
        m = 0
        for j in range(m):
            clf = xgb_classifier(eta=0.1, min_child_weight=20, col=0.5,
                                 subsample=0.7, depth=5, num_round=200,
                                 seed=j*77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, (y_train_cv), X_test_cv,
                                        y_test=(y_test_cv))

        clf = ExtraTreesClassifier(n_estimators=700, max_features=50,
                                   criterion='entropy', min_samples_split=3,
                                   max_depth=60, min_samples_leaf=4,
                                   verbose=1, n_jobs=-1)
        clf.fit(X_train_cv, (y_train_cv))
        y_pred = clf.predict_proba(X_test_cv).T[1]
        print(y_pred.shape)

        xx.append(llfun(y_test_cv, (y_pred)))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[eval_rows]
        print(xx[-1])
        break

    print("%s average: %s std %s" % (xx, np.mean(xx), np.std(xx)))
    return ypred, yreal, idx
def get_most_important_features(train):
    """Plot the 20 most important features according to three ensembles.

    A random forest, a gradient boosting model and an extra-trees model
    are fitted in turn on ``train`` (without the 'ID' column, with
    'TARGET' as the label); each produces one horizontal bar chart.

    Args:
        train: DataFrame containing 'ID', 'TARGET' and feature columns.
    """
    # Keyword axis: the positional form is not accepted by recent pandas.
    train = train.drop('ID', axis=1)
    train_y = train['TARGET']
    train_X = train.drop('TARGET', axis=1)

    # (estimator, figure size, chart title), in fitting order.
    panels = (
        (RandomForestClassifier(n_estimators=100), (15, 7),
         'Feature importance by random forest'),
        (GradientBoostingClassifier(), (10, 7),
         'Feature importance by gradient boosting'),
        (ExtraTreesClassifier(), (20, 7),
         'Feature importance by extra trees classifier'),
    )
    for model, figsize, title in panels:
        model.fit(train_X, train_y)
        feater_importance = pd.Series(model.feature_importances_,
                                      index=train_X.columns)
        feater_importance = feater_importance.sort_values()
        # Ascending sort, so the tail holds the most important features.
        feater_importance.tail(20).plot(kind='barh', figsize=figsize,
                                        title=title)
def train_UsingExtraTreesClassifier(df,header,x_train, y_train,x_test,y_test) :
    """Fit a bootstrapped ExtraTrees model, report scores, plot importances.

    Args:
        df: DataFrame whose column names label the importance plot.
        header: Sequence of column names; its last entry is excluded from
            the plotted feature names.
        x_train, y_train: Training features and labels.
        x_test, y_test: Test features and labels.

    Returns:
        The class predictions for ``x_test``.
    """
    # training
    # Also tried entropy for the information gain but 'gini' seemed to give
    # marginally better fit, both in sample & out of sample.
    # (The removed ``compute_importances`` flag is no longer passed;
    # importances are always available on a fitted forest.)
    clf = ExtraTreesClassifier(n_estimators=200, random_state=0,
                               criterion='gini', bootstrap=True,
                               oob_score=True)
    clf.fit(x_train, y_train)

    # estimation of goodness of fit
    print("Estimation of goodness of fit using the ExtraTreesClassifier is : %f \n" % clf.score(x_test,y_test))
    print("Estimation of out of bag score using the ExtraTreesClassifier is : %f \n \n " % clf.oob_score_)

    # predicted classes for the test set
    y_test_predicted = clf.predict(x_test)

    # Explicit set difference; ``Index - list`` no longer means that.
    # NOTE(review): difference() returns sorted names -- confirm they line
    # up with the column order of x_train.
    feature_names = df.columns.difference([header[-1]])

    # On a scale of 10 - make importances relative to max importance
    feature_importance = clf.feature_importances_
    feature_importance = 10.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feature_names[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
    return y_test_predicted
def FeaturesImportance(trainData, trainLabels):
    """Print the ranking and plot the importances of the first 16 features.

    The ranking is printed in decreasing order of importance; the bar
    chart shows features in their natural order, labelled x_1 .. x_16.

    Args:
        trainData: Feature matrix.
        trainLabels: Target labels.

    Returns:
        The importance array of the fitted forest.
    """
    shown = 16

    model = ExtraTreesClassifier(n_estimators=250, random_state=0)
    model.fit(trainData, trainLabels)
    importances = model.feature_importances_
    order = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for position in range(shown):
        feature = order[position]
        print("%d. feature %d (%f)" % (position + 1, feature, importances[feature]))

    # Single-digit subscripts are written bare, two-digit ones in braces.
    x_labels = ([r'$x_%d$' % i for i in range(1, 10)]
                + [r'$x_{%d}$' % i for i in range(10, shown + 1)])
    y_positions = [0.0, 0.05, 0.10, 0.15, 0.20, 0.25]
    y_labels = [r'$0.00$', r'$0.05$', r'$0.10$',
                r'$0.15$', r'$0.20$', r'$0.25$']

    # Plot the feature importances of the forest
    plt.figure()
    plt.title("Feature importances")
    plt.bar(range(shown), importances[range(shown)],
            color="r", align="center")
    plt.xticks(range(shown), x_labels)
    plt.yticks(y_positions, y_labels)
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.xlim([-1, shown])
    plt.show()
    return importances
def remove_feature_tree_based(train_X,train_Y):
    '''Remove features based on ExtraTrees feature importance.

    See sklearn:
    http://scikit-learn.org/dev/auto_examples/ensemble/plot_forest_importances.html#example-ensemble-plot-forest-importances-py

    Args:
        train_X: Training feature matrix.
        train_Y: Training labels.

    Returns:
        Tuple ``(forest, x_new)``: the fitted forest and ``train_X``
        reduced to the features selected by importance.
    '''
    forest = ExtraTreesClassifier(n_estimators=1000, random_state=0)
    forest.fit(train_X, train_Y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    x_labels = ['rc1', 'rc2', 'dca1', 'dca2', 'dcm1', 'dcm2', 'ace1', 'ace2',
                'acsc1', 'acsc2', 'acsv1', 'acsv2', 'acss1', 'acss2',
                'acsk1', 'acsk2', 'taca1', 'taca2', 'tdc1', 'tdc2', 'gmin',
                'gmean', 'trd', 'ep111', 'ep112', 'ep211', 'ep212', 'ep311',
                'ep312', 'ep411', 'ep412', 'ep511', 'ep512', 'ep611',
                'ep612', 'ep121', 'ep122', 'ep221', 'ep222', 'ep321',
                'ep322', 'ep421', 'ep422', 'ep521', 'ep522', 'ep621',
                'ep622']

    # Print the feature ranking for every feature the forest saw, rather
    # than a fixed count that can disagree with the data or label list.
    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        label = x_labels[idx] if idx < len(x_labels) else "feature %d" % idx
        print("%d. feature %s (%f)" % (rank, label, importances[idx]))

    # Transform the data to have only the features that are important.
    # Forests lost their own transform(); SelectFromModel on the prefit
    # forest is the replacement.
    if hasattr(forest, "transform"):
        x_new = forest.transform(train_X)
    else:
        from sklearn.feature_selection import SelectFromModel
        x_new = SelectFromModel(forest, prefit=True).transform(train_X)

    return (forest, x_new)
def algo_fit_cross_validated(training_matrix, target):
    """Print an ExtraTrees feature ranking, then fit a random forest.

    The random forest's 5-fold cross-validated accuracy is printed before
    it is fitted on the full training matrix.

    Args:
        training_matrix: DataFrame of training features.
        target: Training labels.

    Returns:
        The fitted RandomForestClassifier.
    """
    # Build a forest and compute the feature importances
    ranker = ExtraTreesClassifier(n_estimators=250, random_state=0)
    ranker.fit(training_matrix, target)

    scores = ranker.feature_importances_
    per_tree_scores = [est.feature_importances_ for est in ranker.estimators_]
    spread = np.std(per_tree_scores, axis=0)
    order = np.argsort(scores)[::-1]

    column_names = list(training_matrix.columns.values)
    for position in range(training_matrix.shape[1]):
        feature = order[position]
        print("%d. feature %d(%s) (%f)"
              % (position + 1, feature, column_names[feature], scores[feature]))

    # Random Forest
    rf = RandomForestClassifier(n_estimators=1500, max_depth=2, max_features=4)
    scores_rf = cross_validation.cross_val_score(rf, training_matrix, target, cv=5)
    print("(Random Forest) Accuracy: %0.5f (+/- %0.2f)" % (scores_rf.mean(), scores_rf.std() * 2))

    rf.fit(training_matrix, target)
    return rf
def extratreeclassifier(input_file,Output,test_size):
    """Train and evaluate ExtraTrees on a comma-separated data file.

    The first column of the file is the label and the following columns
    (all but the last one in the file) are the features.  Metrics are
    printed, written with the per-sample predictions to
    ``<Output>_Extremely_Random_Forest_metrics_test.txt``, and a confusion
    matrix plot is saved.

    Args:
        input_file: Path of the comma-separated input file.
        Output: Prefix for the generated result files.
        test_size: Test-set size passed to ``train_test_split``.
    """
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")

    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print("%s %s" % (X_train.shape, X_test.shape))

    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Compute each metric once and reuse it for console and file output.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)

    print("Extremely Randomized Trees")
    print("classification accuracy: %s" % accuracy)
    print("precision: %s" % precision)
    print("recall: %s" % recall)
    print("f1 score: %s" % f1)
    print("\n")

    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    # Context manager closes the report even if a write fails.
    with open(results, "w") as report:
        report.write("Extremely Random Forest Classifier estimator accuracy\n")
        report.write("Classification Accuracy Score: %f\n" % accuracy)
        report.write("Precision Score: %f\n" % precision)
        report.write("Recall Score: %f\n" % recall)
        report.write("F1 Score: %f\n" % f1)
        report.write("\n")
        report.write("True Value, Predicted Value, Iteration\n")
        for n in range(len(y_test)):
            report.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n+1)))

    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred, title, save)

    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def fit(self, X, Y, sample_weight=None):
    """Fit an ExtraTrees forest and wrap it in a mean-threshold selector.

    The forest's ``max_features`` is derived from ``self.max_features``
    scaled by ``log(n_features) + 1`` and clamped between 1 and half the
    number of columns of ``X``.

    Args:
        X: Feature matrix.
        Y: Target labels.
        sample_weight: Optional per-sample weights forwarded to the
            forest's ``fit``.

    Returns:
        ``self``, with ``self.preprocessor`` set to the prefit selector.
    """
    from sklearn.ensemble import ExtraTreesClassifier
    from sklearn.feature_selection import SelectFromModel

    n_columns = X.shape[1]
    log_scaled = int(float(self.max_features) * (np.log(n_columns) + 1))
    # Use at most half of the features, and never fewer than one
    half_of_columns = int(X.shape[1] / 2)
    max_features = max(1, min(half_of_columns, log_scaled))

    forest_settings = dict(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_depth=self.max_depth,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        bootstrap=self.bootstrap,
        max_features=max_features,
        max_leaf_nodes=self.max_leaf_nodes,
        oob_score=self.oob_score,
        n_jobs=self.n_jobs,
        verbose=self.verbose,
        random_state=self.random_state,
        class_weight=self.class_weight,
    )
    estimator = ExtraTreesClassifier(**forest_settings)
    estimator.fit(X, Y, sample_weight=sample_weight)

    self.preprocessor = SelectFromModel(estimator=estimator,
                                        threshold="mean",
                                        prefit=True)
    return self
def train_classifiers(X_data, y_data):
    """Fit seven classifiers on the same data.

    The score noted next to each model comes from the original author's
    comments.

    Args:
        X_data: Training feature matrix.
        y_data: Training labels.

    Returns:
        Tuple ``(clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf,
        clf_GBC)`` of fitted estimators.
    """
    # Linear SVM: 0.908
    clf_LSVM = svm.SVC(kernel = 'linear')
    # MultinomialNB: 0.875
    clf_MNB = MultinomialNB()
    # Random Forest: 0.910
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    # Extra Tree: 0.915 -- min_samples_split=2 grows the same trees as the
    # former value 1 and is valid on every scikit-learn release.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                   min_samples_split=2, random_state=0)
    # AdaBoost: 0.88
    clf_Ada = AdaBoostClassifier()
    # rbf SVM: 0.895
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    # GradientBoosting: 0.88
    clf_GBC = GradientBoostingClassifier()

    fitted = (clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC)
    for model in fitted:
        model.fit(X_data, y_data)
    return fitted
def _cascade_layer(self, X, y=None, layer=0): n_tree = getattr(self, 'n_cascadeRFtree') n_cascadeRF = getattr(self, 'n_cascadeRF') min_samples = getattr(self, 'min_samples_cascade') prf = RandomForestClassifier( n_estimators=100, max_features=8, bootstrap=True, criterion="entropy", min_samples_split=20, max_depth=None, class_weight='balanced', oob_score=True) crf = ExtraTreesClassifier( n_estimators=100, max_depth=None, bootstrap=True, oob_score=True) prf_pred = [] if y is not None: # print('Adding/Training Layer, n_layer={}'.format(self.n_layer)) for irf in range(n_cascadeRF): prf.fit(X, y) crf.fit(X, y) setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf) setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf) probas = prf.oob_decision_function_ probas += crf.oob_decision_function_ prf_pred.append(probas) elif y is None: for irf in range(n_cascadeRF): prf = getattr(self, '_casprf{}_{}'.format(layer, irf)) crf = getattr(self, '_cascrf{}_{}'.format(layer, irf)) probas = prf.predict_proba(X) probas += crf.predict_proba(X) prf_pred.append(probas) return prf_pred
class MyExtraTree(MyClassifier):
    """Thin wrapper around ExtraTreesClassifier with importance helpers."""

    def __init__(self, params=None):
        """Create the classifier from a dict of constructor parameters.

        Args:
            params: Optional dict of ExtraTreesClassifier keyword
                arguments.  It is copied, so later ``update_params`` calls
                never leak into the caller's dict or other instances.
        """
        self._params = dict(params) if params is not None else {}
        self._extree = ExtraTreesClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the parameters and rebuild the model."""
        self._params.update(updates)
        self._extree = ExtraTreesClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the underlying classifier."""
        self._extree.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option = None):
        """Return column 1 of the predicted class probabilities."""
        return self._extree.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option = None):
        """Return the full predicted class-probability matrix."""
        return self._extree.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range = None):
        """Show a horizontal bar chart of ranked feature importances.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Optional sequence of rank positions to plot; all
                ranks when empty or None.
        """
        importances = self._extree.feature_importances_
        std = np.std([tree.feature_importances_
                      for tree in self._extree.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)

        # A concrete list so it is usable as a numpy fancy index.
        ranks = list(f_range) if f_range else list(range(indices.shape[0]))
        n_f = len(ranks)
        chosen = indices[ranks]

        plt.figure()
        plt.title("Extra Tree Feature importances")
        plt.barh(range(n_f), importances[chosen], color="b",
                 xerr=std[chosen], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[chosen])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range = None, return_list = False):
        """Print ranked feature importances.

        Args:
            fname_list: Feature names, indexed like the training columns.
            f_range: Optional sequence of rank positions to list; all
                ranks when empty or None.
            return_list: When true, return the listed feature indices.

        Returns:
            The list of feature indices if ``return_list`` is true,
            otherwise None.
        """
        importances = self._extree.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Extra tree feature ranking:')

        ranks = list(f_range) if f_range else list(range(indices.shape[0]))
        for f in ranks:
            print('{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]],
                importances[indices[f]]))

        if return_list:
            return [indices[f] for f in ranks]
def tree(train_data, train_labels, all_bigrams, task):
    """Rank bigram features with an ExtraTrees forest and plot them.

    The (up to) 20 most important features are printed and the sorted
    importances of the (up to) 2000 most important ones are saved as a
    bar chart in ``<task>.pdf``.

    Args:
        train_data: Feature matrix.
        train_labels: Target labels.
        all_bigrams: Feature names, indexed like the columns of
            ``train_data``.
        task: Name used in the printed header, plot title and file name.

    Returns:
        Feature indices sorted by decreasing importance.
    """
    forest = ExtraTreesClassifier(n_estimators=100, random_state=0)
    forest.fit(train_data, train_labels)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]
    n_features = train_data.shape[1]

    # Print the feature ranking
    print("-"*45)
    print(task)
    for f in range(min(20, n_features)):
        print("%d. feature, name: %s, importance: %f"
              % (f + 1, all_bigrams[indices[f]], importances[indices[f]]))

    # Plot the feature importances of the forest.  The bar count is capped
    # by the number of features actually available, so positions and
    # heights always have the same length.
    pl.figure()
    n = min(2000, n_features)
    pl.title("Sorted feature importance for %s" %(task))
    pl.bar(range(n), importances[indices][:n],
           color="black", align="center")
    pl.xlim([0, (n)])
    pl.xticks(list(range(0, n+1, 250)))
    pl.savefig(task+'.pdf', bbox_inches='tight')
    print("plot saved")
    return indices
def calc_prob(df_features_driver, df_features_other):
    """Estimate, per trip, the probability that it belongs to the driver.

    An ExtraTrees classifier is trained on the driver's trips stacked on
    top of the other drivers' trips, with the integer ``Driver`` column as
    the label, and then applied to the first 200 rows of the stacked
    frame.

    Args:
        df_features_driver: Feature frame of the driver under test.
        df_features_other: Feature frame of trips from other drivers.

    Returns:
        DataFrame with columns 'driver_trip' and 'prob'.
    """
    # pd.concat replaces DataFrame.append, which newer pandas dropped.
    df_train = pd.concat([df_features_driver, df_features_other])
    df_train.reset_index(inplace = True)
    df_train.Driver = df_train.Driver.astype(int)

    # 'sqrt' is what 'auto' meant for classifiers; the explicit spelling
    # is accepted by every scikit-learn release.
    model = ExtraTreesClassifier(n_estimators=100, max_features='sqrt',
                                 random_state=0, n_jobs=2,
                                 criterion='entropy', bootstrap=True)

    # Features start at the fifth column once the index has been reset.
    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)

    df_submission = pd.DataFrame()
    df_submission['driver_trip'] = create_first_column(df_features_driver)

    # NOTE(review): 200 presumably is the number of trips per driver --
    # confirm before using frames of another size.
    probs_array = model.predict_proba(feature_columns[:200])
    # Column 1 of predict_proba
    df_submission['prob'] = probs_array[:, 1]
    return df_submission
def reduceRF(label):
    """Keep the highest-importance features for one label column.

    Fits an ``ExtraTreesClassifier`` on the module-level ``x_data`` against
    column ``label`` of ``y_data``, then walks the importances from largest
    to smallest until their running sum exceeds ``RFThreshold``.  The chosen
    column indices are stored in the global ``importantFeatureLocs`` and the
    matching columns of ``x_data`` in the global ``x_data_rf_reduced``.
    """
    global x_data_rf_reduced, importantFeatureLocs

    forest = ExtraTreesClassifier()
    forest.fit(x_data, y_data[:, label])
    importance = forest.feature_importances_  # relative weight per attribute

    # Empty the existing list object in place before refilling it.
    del importantFeatureLocs[:]

    accumulated = float(0)
    for score in np.sort(importance)[::-1]:
        accumulated += float(score)
        # Every column sharing this importance value is taken at once, so
        # tied values can add the same index more than once.
        importantFeatureLocs.extend(np.where(importance == score)[0])
        if accumulated > RFThreshold:
            break

    # De-duplicate.  A set does not keep the indices ordered, so the columns
    # of x_data_rf_reduced follow whatever order the set yields.
    importantFeatureLocs = list(set(importantFeatureLocs))
    x_data_rf_reduced = x_data[:, importantFeatureLocs]
import matplotlib.pyplot as plt from sklearn.datasets import make_classification from sklearn.ensemble import ExtraTreesClassifier # Build a classification task using 3 informative features x, y = make_classification(n_samples=1000, n_features=5, n_informative=3, n_redundant=0, n_repeated=0, n_classes=2, random_state=0, shuffle=False) forest = ExtraTreesClassifier(n_estimators=2000, random_state=0) forest.fit(x, y) importances = forest.feature_importances_ std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Print the feature ranking print("Feature ranking:") for f in range(x.shape[1]): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) # Plot the feature importances of the forest plt.figure() plt.title("Feature importances") plt.bar(range(Covid_2_sklearn.shape[1]),
# Slice this fold's rows out of the feature and target frames.
a_train, a_test = X.values[train_index], X.values[test_index]
b_train, b_test = y.values[train_index], y.values[test_index]

et_params = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_leaf=min_samples_leaf,
    n_jobs=2,
    random_state=random_state,
    criterion='entropy',
)
clf = ExtraTreesClassifier(**et_params)
clf.fit(a_train, b_train)
preds = clf.predict_proba(a_test)[:, 1]

# Score the agreement sample and compare the prediction distributions of
# its signal == 0 and signal == 1 rows with the weighted KS statistic.
agreement_probs = clf.predict_proba(check_agreement[features])[:, 1]
signal_flags = check_agreement['signal'].values
is_zero = signal_flags == 0
is_one = signal_flags == 1
agreement_weights = check_agreement['weight'].values
ks = compute_ks(agreement_probs[is_zero],
                agreement_probs[is_one],
                agreement_weights[is_zero],
                agreement_weights[is_one])
print('KS metric', ks, ks < 0.09)

# Stop the whole run as soon as a model reaches the 0.09 KS limit.
if ks >= 0.09:
    sys.exit()
# ## Get an array of the features ranked
# rank = fit.ranking_
# ## Create a dataframe of the column names by ranking.
# col_names = list(df.columns.values)
# col_names.pop()
# cols_ranked = pd.DataFrame({'features': col_names, 'rank': list(rank)})

# ------------------------------------------------------
# Extremely Randomized Trees
# ------------------------------------------------------
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_samp, y_samp)

# Importance of every feature, in column order.
rank = model.feature_importances_

# Pair each importance with its column name; the last column of df_samp is
# dropped from the name list before pairing.
col_names = list(df_samp.columns.values)
del col_names[-1]
cols_ranked = pd.DataFrame({'features': col_names, 'rank': list(rank)})

# Min-max scale the importances into [0, 1].
shifted_rank = cols_ranked['rank'] - cols_ranked['rank'].min()
cols_ranked['rank'] = shifted_rank / shifted_rank.max()

# Keep the features at or above the threshold, plus the dependent variable.
keep_mask = cols_ranked['rank'] >= extra_trees_keep_thresh
important_cols = cols_ranked.loc[keep_mask, 'features'].tolist()
important_cols.append(dep_var)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)

# Everything except the outcome column is a feature.
features = tpot_data.drop('target', axis=1)
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8164430115022656
pipeline_params = {
    "bootstrap": False,
    "criterion": "entropy",
    "max_features": 0.7500000000000001,
    "min_samples_leaf": 7,
    "min_samples_split": 20,
    "n_estimators": 100,
}
exported_pipeline = ExtraTreesClassifier(**pipeline_params)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# function lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np

df = pd.read_csv('/dataset/lab/data.csv', sep=' ', header=None)

# Column 0 is the label, columns 1-10 are the features.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
X = df[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]].values
y = df[0].tolist()

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=1000, n_jobs=2, random_state=0)
forest.fit(X, y)

importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Single-argument print() behaves the same on Python 2 and Python 3.
print(cross_val_score(forest, X, y, cv=4, n_jobs=4))
def ModelAnalyzer(X, y, regressor=True):
    """Fit a fixed panel of scikit-learn models and report score and run time.

    The data is split in half without shuffling (first half trains, second
    half tests).  Each model is fitted on the training half and scored on the
    test half: mean absolute error for regressors, accuracy for classifiers.

    Args:
        X: (DataFrame) explanatory variables used as features.
        y: (vector) response variable used as target.
        regressor: (bool) True to run the regressor panel, False to run the
            classifier panel.

    Returns:
        str: multiline report with one line per model.  The report is also
        printed.  (Previously the function returned ``print(out)``, i.e.
        ``None``, despite documenting the report as its output.)
    """
    import time
    import warnings
    from sklearn.metrics import mean_absolute_error, accuracy_score
    from sklearn.model_selection import train_test_split

    # NOTE: this silences warnings process-wide, not only inside this call.
    warnings.filterwarnings('ignore')

    # train_size is the proportion of rows that go to the training set.
    train_X, test_X, train_Y, test_Y = train_test_split(
        X, y, train_size=0.5, shuffle=False, random_state=1)

    def _score_model(prefix, estimator, metric):
        """Fit and predict with one estimator; return its report line."""
        start = time.time()
        estimator.fit(train_X, train_Y)
        predictions = estimator.predict(test_X)
        elapsed = str(round(time.time() - start, 5))
        score = metric(test_Y, predictions)
        return prefix + str(score) + ', Time: ' + elapsed + ' seconds.'

    if regressor:
        # Regressor panel, scored with mean absolute error.
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.svm import SVR
        from sklearn.linear_model import LassoCV
        from sklearn.linear_model import RidgeCV
        from sklearn.linear_model import ElasticNetCV

        metric = mean_absolute_error
        # max_features=None means "all features", which is what the removed
        # max_features='auto' meant for regressors.
        panel = [
            ("Decision Tree MAE: ", DecisionTreeRegressor(random_state=1)),
            ("Random Forest MAE: ",
             RandomForestRegressor(random_state=1, max_features=None,
                                   min_samples_split=2, min_samples_leaf=1,
                                   n_estimators=650)),
            ("Support Vector MAE: ", SVR(gamma='scale', C=1.0)),
            ("Extra Trees MAE: ",
             ExtraTreesRegressor(max_features=None, n_estimators=125,
                                 min_samples_split=3, random_state=1)),
            ("Lasso MAE: ", LassoCV()),
            ("Ridge MAE: ", RidgeCV()),
            ("Elastic Net MAE: ", ElasticNetCV()),
        ]
    else:
        # Classifier panel, scored with accuracy.
        from sklearn.linear_model import LogisticRegression
        from sklearn.naive_bayes import GaussianNB
        from sklearn.linear_model import SGDClassifier
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.svm import SVC
        from sklearn.ensemble import ExtraTreesClassifier

        metric = accuracy_score
        panel = [
            ("Logistic Regression Accuracy: ",
             LogisticRegression(solver='lbfgs', multi_class='auto',
                                max_iter=2000)),
            ("Naive Bayes Accuracy: ", GaussianNB()),
            ("SGD Accuracy: ",
             SGDClassifier(loss='modified_huber', shuffle=True,
                           random_state=101, tol=1e-3, max_iter=1000)),
            ("KNN Accuracy: ", KNeighborsClassifier(n_neighbors=10)),
            ("Decision Tree Accuracy: ",
             DecisionTreeClassifier(max_depth=10, random_state=101,
                                    max_features=None, min_samples_leaf=5)),
            ("Random Forest Accuracy: ",
             RandomForestClassifier(n_estimators=125, oob_score=True,
                                    n_jobs=1, random_state=101,
                                    max_features=None, min_samples_leaf=3)),
            ("SVC Accuracy: ", SVC(gamma='scale', C=1.0, random_state=101)),
            ("Extra Trees Accuracy: ", ExtraTreesClassifier(n_estimators=125)),
        ]

    out = '\n'.join(_score_model(prefix, estimator, metric)
                    for prefix, estimator in panel)
    print(out)
    return out
__author__ = 'shi'
# Feature Importance
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# function lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

data = np.loadtxt("output_res(1).txt")
# f1 = open("phenotype.txt")
# f1.readline()
result = np.loadtxt("phenotype.txt")
# Single-argument print() calls and %-formatting produce the same text on
# Python 2 and Python 3.
print(data.shape)
print(result.shape)

# fit an Extra Trees model to the data
x_train, x_test, y_train, y_test = train_test_split(data, result, test_size=0.3)
model = ExtraTreesClassifier(n_estimators=200)
model.fit(x_train, y_train)
answer = model.predict(x_test)
# Fraction of held-out samples predicted correctly.
print("predict_result: %s" % np.mean(answer == y_test))

# display the relative importance of each attribute
for m, weight in enumerate(model.feature_importances_):
    if weight > 0.0005:
        print("feature_importance %s %s" % (m, weight))
# Paths for file saving (all relative to the parent of this file's directory)
_this_dir = os.path.dirname(os.path.abspath(__file__))
module_path = os.path.join(_this_dir, '..')
models_path = os.path.join(module_path, 'dummy_models')
baselline_path = os.path.join(module_path, 'baseline_images')

# Load the iris data to play with
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Build a small forest and collect its predictions on the test set
forest = ExtraTreesClassifier(n_estimators=10, random_state=0)
forest.fit(X_train, y_train)
y_true = y_test
y_pred = forest.predict(X_test)
y_score = forest.predict_proba(X_test)

# Pickle the model, the three label/score arrays and the full X, in that order
_artifacts = (
    ('classifier_with_feature_importances_model.pkl', forest),
    ('classifier_with_feature_importances_y_true.pkl', y_true),
    ('classifier_with_feature_importances_y_pred.pkl', y_pred),
    ('classifier_with_feature_importances_y_score.pkl', y_score),
    ('classifier_with_feature_importances_x.pkl', X),
)
for _file_name, _artifact in _artifacts:
    joblib.dump(_artifact, os.path.join(models_path, _file_name))
data['phonecharge_day_num'][i], data['phonelock_sum'][i], data['phonelock_var'][i], data['phonelock_mean'][i], data['phonelock_day_num'][i], data['in_time_second'][i], data['near_time_second'][i], data['in_all_percentage'][i], data['pre_score'][i] ]) return feature if __name__ == '__main__': filename = '..\\preprocess\\data\\features_and_flourishing.csv' all_data = get_file(filename) data_label = get_label(all_data) data_feature = get_feature(all_data) model = ExtraTreesClassifier() model.fit(data_feature, data_label) print(model.feature_importances_) j = 2 label = [] importance = [] for i in model.feature_importances_: importance.append(i) for j in all_data.columns.values: label.append(j) print(label[2:len(model.feature_importances_)]) print(importance) plt.bar(label[2:len(model.feature_importances_) + 2], importance) plt.xticks(rotation=270) plt.gca().margins(x=0) plt.gcf().canvas.draw() tl = plt.gca().get_xticklabels()
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
import numpy

fileName = "pima-indians-diabetes.data.csv"
colNames = [
    "preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"
]

# Open the file in a with-block so the handle is closed once the CSV has
# been parsed (it was previously left open for the rest of the run).
with open(fileName, "rt") as rawData:
    data = read_csv(rawData, names=colNames)

array = data.values
X = array[:, 0:8]  # the eight measurement columns
Y = array[:, 8]    # the "class" column

model = ExtraTreesClassifier()
model.fit(X, Y)

# numpy.set_printoptions(precision=3)
print("Feature Importance Values : %s" % model.feature_importances_)
feature_name = "selected_%s" % suffix
fname = os.path.join(config.FEAT_DIR + "/Combine",
                     feature_name + config.FEAT_FILE_SUFFIX)
data_dict = pkl_utils._load(fname)
X_train = data_dict["X_train_basic"]
X_test = data_dict["X_test"]
y_train = data_dict["y_train"]
splitter = data_dict["splitter"]
n_iter = data_dict["n_iter"]

i = n_iter - 1  # use the last splitter to split the cv
# splitter[i] holds the (train rows, validation rows) index pair.
X_train_cv = data_dict["X_train_basic"][splitter[i][0], :]
X_valid_cv = data_dict["X_train_basic"][splitter[i][1], :]
y_train_cv = data_dict["y_train"][splitter[i][0]]
y_valid_cv = data_dict["y_train"][splitter[i][1]]

# 'sqrt' is what the removed max_features='auto' meant for classifiers.
learner = ExtraTreesClassifier(n_estimators=500, criterion='gini',
                               max_depth=5, min_weight_fraction_leaf=0.0,
                               max_features='sqrt', n_jobs=-1,
                               random_state=config.RANDOM_SEED, verbose=10)
learner.fit(X_train_cv, y_train_cv)
p_test = learner.predict_proba(X_valid_cv)
print("The log loss of valid set is {}".format(log_loss(y_valid_cv, p_test)))

# argsort is ascending, so walk it backwards to list the most important
# feature first.  The previous range(-1, -len(index), -1) stopped one step
# early and never printed the least important feature.
index = learner.feature_importances_.argsort()
for feat_idx in index[::-1]:
    print("{:30} {:30}".format(data_dict['feature_names'][feat_idx],
                               learner.feature_importances_[feat_idx]))
def getAmount(weekday, date, hour, month, degree, rain, sun):
    """Predict the visitor-count class for one hour from calendar and weather.

    Loads ``dataset_city_people_hour.csv``, encodes month and weekday names
    as integers, bins the Sun, Rain, Temp and People columns into classes,
    trains several classifiers on a 67/33 split, prints each test accuracy,
    and returns the extra-trees prediction for the given feature vector.

    Args:
        weekday: weekday encoded 1 (Monday) to 7 (Sunday).
        date: value for the Date column.
        hour: value for the hour column.
        month: month encoded 1 to 12.
        degree: temperature class (0-9, same binning as the Temp column).
        rain: rain class (0-4, same binning as the Rain column).
        sun: sun class (0-4, same binning as the Sun column).

    Returns:
        Array with one element: the predicted People class.
    """
    data = pd.read_csv("dataset_city_people_hour.csv")
    data.sort_values('Date', ascending=True, inplace=True)
    data.drop_duplicates(keep=False, inplace=True)

    # Drop unnecessary fields
    data = data.drop(['Facility', 'Activity'], axis=1)

    # Replace strings with integers
    data.replace(to_replace=[
        "Jan", "Feb", "Mar", "Apr", "Maj", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ], value=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], inplace=True)
    data.replace(to_replace=[
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ], value=[1, 2, 3, 4, 5, 6, 7], inplace=True)

    # Forward-fill NaN (fillna(method='ffill') is deprecated in pandas)
    data.ffill(inplace=True)

    # Separate the weather features into classes.  Every mask is evaluated
    # on a snapshot of the raw values: testing the column while it is being
    # overwritten re-binned freshly assigned class ids (rain class 1 fell
    # into the [0.5, 4.0) range and became 2; temperature classes 0-2 fell
    # into [0.0, 5.0) and became 3).
    sun_raw = data["Sun"].copy()
    data.loc[(sun_raw > 0) & (sun_raw < 1200), "Sun"] = 1  # Slight sun
    data.loc[(sun_raw >= 1200) & (sun_raw < 2400), "Sun"] = 2  # Moderate sun
    data.loc[(sun_raw >= 2400) & (sun_raw < 3600), "Sun"] = 3  # Heavy sun
    data.loc[sun_raw == 3600, "Sun"] = 4  # Very heavy sun

    rain_raw = data["Rain"].copy()
    data.loc[(rain_raw > 0.0) & (rain_raw < 0.5), "Rain"] = 1  # Slight rain
    data.loc[(rain_raw >= 0.5) & (rain_raw < 4.0), "Rain"] = 2  # Moderate rain
    data.loc[(rain_raw >= 4.0) & (rain_raw < 8.0), "Rain"] = 3  # Heavy rain
    # >= so that exactly 8.0 is binned too (it used to stay a raw 8).
    data.loc[rain_raw >= 8.0, "Rain"] = 4  # Very heavy rain

    # Temperature: below -10 is class 0, then one class per 5 degrees,
    # and 30 or above is class 9.
    temp_raw = data["Temp"].copy()
    temp_edges = [-10.0, -5.0, 0.0, 5.0, 10.0, 15.0, 20.0, 25.0, 30.0]
    data.loc[temp_raw < temp_edges[0], "Temp"] = 0
    for class_id, (low, high) in enumerate(zip(temp_edges, temp_edges[1:]),
                                           start=1):
        data.loc[(temp_raw >= low) & (temp_raw < high), "Temp"] = class_id
    data.loc[temp_raw >= temp_edges[-1], "Temp"] = len(temp_edges)

    # Separate the number of people each hour into classes of 10
    # (1-10 -> 0, 11-20 -> 1, ..., 71-80 -> 7, above 80 -> 8).
    numOfPeople = 10
    people_raw = data["People"].copy()
    bin_starts = list(range(1, 80, numOfPeople))
    for class_id, low in enumerate(bin_starts):
        data.loc[(people_raw >= low) & (people_raw < low + numOfPeople),
                 "People"] = class_id
    data.loc[people_raw > 80, "People"] = len(bin_starts)

    # Convert to int
    data["Temp"] = data["Temp"].astype(int)
    data["Rain"] = data["Rain"].astype(int)

    # Get data and target
    # NOTE(review): position 7 is taken as the target while "People" is
    # dropped by name from X -- presumably the same column; confirm.
    Y = data.iloc[:, 7]
    X = data.drop(["People"], axis=1)

    # Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33,
                                                        random_state=42)

    # KNN
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, Y_train)
    print("Knn: " + str(knn.score(X_test, Y_test)))

    # SVM
    svm_model_linear = SVC(kernel='rbf', C=2, gamma='auto').fit(X_train,
                                                                Y_train)
    print("SVM: " + str(svm_model_linear.score(X_test, Y_test)))

    # Random forest
    rndF = RandomForestClassifier(100, random_state=0)
    rndF.fit(X_train, Y_train)
    print("Random forest: " + str(rndF.score(X_test, Y_test)))

    # Decision tree
    DTree = DecisionTreeClassifier(random_state=0)
    DTree.fit(X_train, Y_train)
    print("Decision trees: " + str(DTree.score(X_test, Y_test)))

    # Extra trees
    exT = ExtraTreesClassifier(n_estimators=100, random_state=0)
    exT.fit(X_train, Y_train)
    print("Extra trees: " + str(exT.score(X_test, Y_test)))

    # Naive bayes
    NB = MultinomialNB()
    NB.fit(X_train, Y_train)
    print("Naive-bayes: " + str(NB.score(X_test, Y_test)))

    # 0 = weekday, 1 = date, 2 = hour, 3 = month, 4 = temp, 5 = rain, 6 = sun
    return exT.predict([[weekday, date, hour, month, degree, rain, sun]])
# Pull the feature matrix and the class labels out of the iris bunch and
# show their shapes.
iris_data = load_iris()
X = iris_data.data
y = iris_data.target
print(X.shape, y.shape)

# %% [markdown]
# ### Train classifier

# %%
from sklearn.ensemble import ExtraTreesClassifier

# Small forest with a fixed seed; it is trained on the full data set (no
# train/test split in this cell).
clf = ExtraTreesClassifier(n_estimators=15, random_state=0)
clf.fit(X, y)

# %% [markdown]
# ### Transpile classifier

# %%
from sklearn_porter import Porter

# Export the fitted classifier as Java source and print what the porter
# returns.
porter = Porter(clf, language='java')
output = porter.export(export_data=True)
print(output)

# %% [markdown]
# ### Run classification in Java
def _build_detector(kind):
    """Return an unfitted classifier for a configured classifier name."""
    if kind == 'logis':
        return logis(C=1e5, solver='liblinear', multi_class='ovr')
    if kind == 'svm':
        return svm.SVC()
    if kind == 'randomforest':
        # NOTE: despite the configuration name this builds extra trees.
        # min_samples_split must be at least 2 in current scikit-learn; a
        # one-sample node can never be split, so 2 grows the same trees that
        # the old value 1 did.
        return ExtraTreesClassifier(n_estimators=200, max_depth=None,
                                    min_samples_split=2, random_state=0)
    raise ValueError("unknown classifier type: %r" % (kind,))


def _fit_and_pickle(kind, features, labels, classifier_path, scaler_path):
    """Standardise features, fit a detector on them and pickle both objects."""
    detector = _build_detector(kind)
    scaler = preprocessing.StandardScaler().fit(features)
    detector.fit(scaler.transform(features), labels)
    with open(classifier_path, 'wb') as handle:
        pickle.dump(detector, handle)
    with open(scaler_path, 'wb') as handle:
        pickle.dump(scaler, handle)


def train():
    """Train and pickle the tweet-level and the topic-level detector.

    Classifier 1 is trained on the tweet features of the posts listed in the
    real/fake training id files.  Classifier 2 is trained on the
    concatenated forensic and textual features of the listed images, with
    label 1 for real and -1 for fake.  The classifier types come from the
    module-level ``classifier1`` and ``classifier2``.  Each classifier and
    its scaler are written to ``output/RUN_2_*.pickle``.

    Raises:
        ValueError: if a configured classifier name is not one of 'logis',
            'svm' or 'randomforest' (this used to surface later as an
            AttributeError on None).
    """
    #
    # load tweet features (the last column holds the label)
    #
    tweet_features = np.loadtxt('output/devset_tweet_features.dat',
                                delimiter=',')
    tweet_labels = np.array(tweet_features[:, -1], dtype=int)
    tweet_features = tweet_features[:, :-1]

    # make the training set balanced: keep only the listed posts
    training_posts = read_list('dataset_for_training/real_tweet_id.data')
    training_posts.extend(read_list('dataset_for_training/fake_tweet_id.data'))
    all_posts = read_list('output/devset_eff_posts.dat')
    used_ind = np.array([p in training_posts for p in all_posts], dtype=bool)
    tweet_features = tweet_features[used_ind, :]
    tweet_labels = tweet_labels[used_ind]

    #
    # training classifier 1
    #
    _fit_and_pickle(classifier1, tweet_features, tweet_labels,
                    'output/RUN_2_classifier_1.pickle',
                    'output/RUN_2_scaler_1.pickle')

    #
    # load textual and forensic features
    #
    forensic_features = np.loadtxt('output/devset_forensic_features.dat',
                                   delimiter=',', dtype=float)
    eff_forensic_topics = read_list('output/devset_eff_forensic_topics.dat')
    textual_features = np.loadtxt('output/devset_textual_features.dat',
                                  delimiter=',', dtype=float)
    eff_textual_topics = read_list('output/devset_eff_textual_topics.dat')

    real_mul_list = read_list('dataset_for_training/real_image_id.data')
    fake_mul_list = read_list('dataset_for_training/fake_image_id.data')
    mul_list = list(real_mul_list)
    mul_list.extend(fake_mul_list)

    # One row per image: forensic features first, textual features after.
    # A block stays zero when the image has no features of that kind.
    n_forensic = forensic_features.shape[1]
    topic_features = np.zeros(
        (len(mul_list), n_forensic + textual_features.shape[1]), dtype=float)
    topic_labels = np.zeros((len(mul_list), ), dtype=int)
    used_ind = np.ones((len(mul_list), ), dtype=bool)
    for ind, m in enumerate(mul_list):
        has_forensic = m in eff_forensic_topics
        has_textual = m in eff_textual_topics
        if has_forensic:
            topic_features[ind, :n_forensic] = \
                forensic_features[eff_forensic_topics.index(m)]
        if has_textual:
            topic_features[ind, n_forensic:] = \
                textual_features[eff_textual_topics.index(m)]
        if not (has_forensic or has_textual):
            used_ind[ind] = False
        topic_labels[ind] = -1 if m in fake_mul_list else 1

    # remove unused topic features
    topic_features = topic_features[used_ind, :]
    topic_labels = topic_labels[used_ind]

    #
    # training classifier 2
    #
    _fit_and_pickle(classifier2, topic_features, topic_labels,
                    'output/RUN_2_classifier_2.pickle',
                    'output/RUN_2_scaler_2.pickle')

    print('Training statistics\n')
    print('Number of real tweets: ', sum(tweet_labels == 1))
    print('Number of fake tweets: ', sum(tweet_labels == -1))
    print('Number of real topics: ', sum(topic_labels == 1))
    print('Number of fake topics: ', sum(topic_labels == -1))
def model_builder(self):
    """Train nine base classifiers and a naive-Bayes stacker on top of them.

    ``self.df`` is reduced and encoded in place.  Columns 0-10 are the
    features and column 11 is the target.  Nine classifiers are trained on a
    75/25 split; their test-set predictions form a secondary data set on
    which a ``GaussianNB`` stacker is trained (90/10 split).  Unless
    ``self.test`` equals True, every fitted model and the secondary data set
    are written to disk.

    Returns:
        The stacker's accuracy on its held-out part, in percent (also stored
        in ``self.accuracy``).
    """
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import GaussianNB

    self.df = self.df.drop(
        ["duration", "job", "contact", "month", "poutcome"], axis=1)

    # Binary encodings.
    self.df["marital"] = [
        0 if each == "single" else 1 for each in self.df.marital
    ]
    self.df["default"] = [
        0 if each == "no" else 1 for each in self.df.default
    ]
    self.df["housing"] = [
        0 if each == "no" else 1 for each in self.df.housing
    ]
    self.df["loan"] = [0 if each == "no" else 1 for each in self.df.loan]
    self.df["y"] = [0 if each == "no" else 1 for each in self.df.y]

    # Ordinal encoding of education, one value per row.  The previous loop
    # assigned a scalar to the whole column on every iteration (leaving it
    # constant) and compared against the misspelling "seondary".
    education_codes = {"unknown": 0, "primary": 0, "secondary": 1}
    self.df["education"] = [
        education_codes.get(each, 2) for each in self.df.education
    ]

    # Splitting the dataset into the Training set and Test set
    X = self.df.iloc[:, 0:11].values
    y = self.df.iloc[:, 11].values
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        random_state=0)

    # Feature Scaling: fit on the training part only, reuse on the test part
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Same condition the original checked before every dump.
    save_models = self.test != True  # noqa: E712

    # (secondary-dataset column, estimator, pickle file) for the 9 models
    base_models = [
        ("knn", KNeighborsClassifier(n_neighbors=10), 'file_knn1.sav'),
        ("rf", RandomForestClassifier(n_estimators=100, criterion='entropy',
                                      random_state=0), 'file_rf1.sav'),
        ("DT", DecisionTreeClassifier(criterion='entropy'), 'file_dt1.sav'),
        ("nb", GaussianNB(), 'file_nb1.sav'),
        ("SVM", SVC(kernel='rbf', random_state=0, gamma='auto'),
         'file_svm1.sav'),
        # The base estimator is passed positionally because its keyword was
        # renamed from base_estimator to estimator across versions.
        ("BG", BaggingClassifier(DecisionTreeClassifier(), n_estimators=100,
                                 random_state=15), 'file_bg1.sav'),
        ("ET", ExtraTreesClassifier(n_estimators=100, max_features=4),
         'file_et1.sav'),
        ("ADB", AdaBoostClassifier(n_estimators=50, random_state=4),
         'file_adb1.sav'),
        ("GB", GradientBoostingClassifier(n_estimators=1000, random_state=4),
         'file_gb1.sav'),
    ]

    predictions = {}
    for column, estimator, file_name in base_models:
        estimator.fit(X_train, y_train)
        predictions[column] = estimator.predict(X_test)
        if save_models:
            # with-block so the pickle file is flushed and closed
            with open(file_name, 'wb') as handle:
                pickle.dump(estimator, handle)

    # creation of secondary dataset using the primary dataset
    newdata = pd.DataFrame(predictions)
    if save_models:
        newdata.to_csv("secondary_dataset.csv")

    # Stacker: learn the true test labels from the base predictions.
    X_train, X_test, y_train, y_test = train_test_split(newdata, y_test,
                                                        test_size=0.1,
                                                        random_state=0)
    nbclassifier2 = GaussianNB()
    nbclassifier2.fit(X_train, y_train)
    nb_y_pred = nbclassifier2.predict(X_test)
    self.accuracy = accuracy_score(y_test, nb_y_pred) * 100

    if save_models:
        with open('file_final.sav', 'wb') as handle:
            pickle.dump(nbclassifier2, handle)
    return self.accuracy
import pandas as pd
import numpy as np

# Read the wine data set
arquivo = pd.read_csv('C:/Users/jvict/OneDrive/Documents/wine_dataset.csv')

# Encode the target: red=0 and white=1
arquivo['style'] = arquivo['style'].replace('red', 0).replace('white', 1)

# Target is the style column; every other column is a feature
y = arquivo['style']
X = arquivo.drop(columns='style')

# Hold out 30% of the rows for testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build the extra-trees model and fit it on the training rows
from sklearn.ensemble import ExtraTreesClassifier

clt = ExtraTreesClassifier()
clt.fit(X_train, y_train)

# Accuracy of the model on the held-out rows
resultado = clt.score(X_test, y_test)
print(resultado)
# --- LightGBM ---------------------------------------------------------
# Train for at most num_round boosting rounds.
# NOTE(review): the validation set used for early stopping is built from
# X_test / y_test, i.e. the same data that is evaluated as "Test" below --
# confirm this is intended.
num_round = 100
lgb_model = lgb.train(param, train_data, num_round,
                      valid_sets=[lgb.Dataset(X_test, y_test)],
                      early_stopping_rounds=1)

# Predictions are rounded before the confusion matrix is built.
print("Test")
eval_metric(confusion_matrix(y_test, lgb_model.predict(X_test).round()))
print("Training")
eval_metric(confusion_matrix(y_train, lgb_model.predict(X_train).round()))

# --- Extra trees ------------------------------------------------------
print("Extra decision tree classifier")
model = ExtraTreesClassifier(n_estimators=200, max_depth=None,
                             min_samples_split=2)
model.fit(X_train, y_train)
print("Test")
eval_metric(confusion_matrix(y_test, model.predict(X_test).round()))
print("Training")
eval_metric(confusion_matrix(y_train, model.predict(X_train).round()))

# --- Single decision tree ---------------------------------------------
# `model` is rebound here; the extra-trees model above is no longer
# referenced afterwards.
print("Decision tree classifier")
model = DecisionTreeClassifier(max_depth=None, min_samples_split=2)
model.fit(X_train, y_train)
print("Test")
eval_metric(confusion_matrix(y_test, model.predict(X_test).round()))
print("Training")
eval_metric(confusion_matrix(y_train, model.predict(X_train).round()))

# --- Decision tree behind a scaler + PCA pipeline ---------------------
print("Decision tree classifier with scaler and PCA")
model = make_pipeline(
# Univariate selection: chi-squared score of every feature against y
bestfeatures = SelectKBest(score_func=chi2, k=7)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Put names and scores side by side for easier reading
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Features', 'Score']
print(featureScores.nlargest(7, 'Score'))  # the 7 best-scoring features

# Feature importance: extra trees are randomised, so fit 250 forests and
# average their importances (each run's vector is printed as it is made).
feature_importance = []
for i in range(250):
    model = ExtraTreesClassifier()
    model.fit(X, y)
    model_feature_importance = model.feature_importances_
    print(model_feature_importance)
    feature_importance.append(model_feature_importance)
feature_importance = np.array(feature_importance)

# Horizontal bar chart of the 10 largest average importances
avg = feature_importance.mean(axis=0)
feat_importances = pd.Series(avg, index=X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.show()
def _fit_and_reload(estimator, x_train, y_train, filename):
    """Fit ``estimator``, pickle it to ``filename`` and return the reloaded copy.

    Both file handles are opened with ``with`` so they are closed (and the
    pickle is fully flushed) before the file is read back.
    """
    estimator.fit(x_train, y_train)
    with open(filename, 'wb') as handle:
        pickle.dump(estimator, handle)
    # Round-trip through disk so callers get exactly what was persisted.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


def models(dataset):
    """Train the five baseline classifiers and persist each one to disk.

    The data is split with ``train_test(dataset)``; only the training part is
    used here. Every classifier is fitted, pickled to its own ``*.pckl`` file
    in the current working directory, and reloaded from that file.

    Returns:
        list: the reloaded models, in the order random forest, k-nearest
        neighbours, logistic regression, naive Bayes, extra trees.
    """
    print("Models")
    x_train_res, x_val_res, y_train_res, y_val_res = train_test(dataset)

    # (estimator, pickle file, progress label printed after the round-trip)
    # NOTE(review): the naive Bayes entry is a GaussianNB although its file
    # name and label say Bernoulli; estimator and label are both kept as they
    # were -- confirm which one was intended.
    specs = [
        (RandomForestClassifier(n_estimators=40, max_depth=10),
         'rf_model.pckl', "RandomForestClassifier"),
        (KNeighborsClassifier(n_neighbors=4),
         'knn_model.pckl', "KNeighborsClassifier"),
        (LogisticRegression(),
         'lr_model.pckl', "LogisticRegression"),
        (GaussianNB(),
         'bnb_model.pckl', "BernoulliNB"),
        (ExtraTreesClassifier(n_estimators=50, random_state=123),
         'extra_tree_model.pckl', "ExtraTreesClassifier"),
    ]

    model = []
    for estimator, filename, label in specs:
        model.append(
            _fit_and_reload(estimator, x_train_res, y_train_res, filename))
        print(label)
    return model
from sklearn import datasets

# `datasets.fetch_mldata` no longer exists in scikit-learn (the mldata.org
# service it downloaded from is gone), so MNIST is fetched from OpenML.
# `as_frame=False` keeps `data`/`target` as NumPy arrays; OpenML serves the
# labels as strings, so they are cast to numbers.
# NOTE(review): the OpenML copy may be ordered differently from the old
# 'MNIST original' dump -- check any code that slices x/y by position.
mnist = datasets.fetch_openml('mnist_784', version=1, as_frame=False)
x, y = mnist.data, mnist.target.astype(float)

# Tree-based models
from sklearn.datasets import load_iris
iris = load_iris()
ix, iy = iris.data, iris.target

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
# Bare expressions: only displayed when run interactively.
model1.feature_importances_
model2.feature_importances_
# prefit=True reuses the already fitted models as feature selectors.
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()

#---
# sklearn cross-validation
# The `sklearn.cross_validation` module was removed from scikit-learn; the
# same helpers are provided by `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
#cross_val_score(model, X, y, cv=10)
from sklearn.model_selection import cross_val_predict
#cross_val_predict(model, X, y, cv=10)
from sklearn.model_selection import LeaveOneOut
class perform_ml():
    """Feature-by-feature search for an ExtraTrees "Buy" classifier.

    Constructing the object runs the whole experiment: a first pass trains on
    every candidate column and ranks the features by importance; later passes
    restart from an empty feature list and add one ranked feature at a time,
    keeping it only when the averaged 2018 test return improves. One summary
    row per pass is appended to the ``current_predictions`` table of
    ``earnings.db``.

    Rows are labelled "Buy" when their '10 Day Change Abnormal' exceeds
    ``buy_cutoff``; the cutoff is raised until fewer than 300 test rows are
    predicted "Buy".
    """

    def __init__(self, df):
        """Run the search on ``df``.

        ``df`` must contain the bookkeeping columns removed below; every other
        column is a candidate feature. ``df`` itself is modified in place by
        ``prepare_data`` (helper columns are added to it).
        """
        self.df = df
        self.conn = sqlite3.connect('earnings.db', timeout=120)
        self.features = list(self.df.columns)
        #print(self.features)
        # Targets and identifiers must never be used as model inputs.
        for remove_me in [
                '5 Day Change', '10 Day Change', '5 Day Change Abnormal',
                '10 Day Change Abnormal', 'Date Reported', 'Time Reported',
                'Symbol', 'Market Cap Text'
        ]:
            self.features.remove(remove_me)
        self.first_run = True
        self.max_means = -90
        self.iterations = 5
        self.start_feature_imp = [0]
        while True:
            self.buy_cutoff = .03
            self.cutoff_found = False
            self.test_df = df
            self.prepare_data()
            self.means = []
            self.num_trades = []
            self.accuracys = []
            self.current_means = []
            self.current_num_trades = []
            self.current_accuracys = []
            # TODO: if we keep a feature, start over again
            print('======================')
            print('using features', self.features)
            for i in range(self.iterations):
                mean = self.find_cutoff()
                # A losing feature set is abandoned early, except on the
                # first pass, whose only purpose is the importance ranking.
                if mean < 0 and not self.first_run:
                    break
                self.prepare_data()
                self.train_model()
                self.predict()
                mean_return, num_trades, accuracy = self.get_results(self.test)
                self.means.append(mean_return)
                self.num_trades.append(num_trades)
                self.accuracys.append(accuracy)
                mean_return, num_trades, accuracy = self.get_results(
                    self.test_2019)
                self.current_means.append(mean_return)
                self.current_num_trades.append(num_trades)
                self.current_accuracys.append(accuracy)
            if self.first_run:
                #print('starting result', this_runs_avg, this_runs_num_trades,self.buy_cutoff, self.means)
                self.store_results()
                # Candidate order for all later passes: 'Before and After'
                # first, then the features by descending importance.
                self.start_feature_imp = list(self.feature_imp.keys())
                self.start_feature_imp.insert(0, 'Before and After')
                self.initial_feature_imp = self.start_feature_imp.copy()
                self.features = []
                self.first_run = False
                self.add_feature()
                continue
            self.store_results()
            #self.add_feature()
            try:
                if self.this_runs_avg > self.max_means:
                    # Improvement: keep the feature and rescan all candidates.
                    self.max_means = self.this_runs_avg
                    self.start_feature_imp = self.initial_feature_imp.copy()
                    self.add_feature()
                else:
                    self.remove_added_feature()
                    self.add_feature()
            except (AttributeError, IndexError, ValueError):
                # IndexError: the candidate list is exhausted (normal end).
                # AttributeError: store_results() failed, so no average
                # exists. ValueError: the feature to drop is already gone.
                # Anything else (e.g. KeyboardInterrupt) now propagates.
                break

    def find_cutoff(self):
        """Raise ``buy_cutoff`` until fewer than 300 test rows are "Buy".

        Returns the mean return of the pass that satisfied the limit, or 1
        when the cutoff was already found earlier in the current pass.
        """
        if self.cutoff_found:
            return 1
        while True:
            self.prepare_data()
            self.train_model()
            self.predict()
            mean_return, num_trades, accuracy = self.get_results(self.test)
            # Step size grows with the number of trades still predicted.
            scaler = int(num_trades / 250) + 1
            #print('finding cutoff', mean, num_trades, self.buy_cutoff, scaler)
            if num_trades < 300:
                print('found cutoff')
                self.cutoff_found = True
                return mean_return
            self.buy_cutoff = round(self.buy_cutoff + (.005 * scaler), 4)

    def store_results(self):
        """Print the averaged results of the pass and append them to SQLite.

        Sums are divided by ``self.iterations`` even when the pass stopped
        early. Best effort: any failure is printed and otherwise ignored, in
        which case ``self.this_runs_avg`` may be missing or stale.
        """
        try:
            self.this_runs_avg = sum(self.means) / self.iterations
            this_runs_num_trades = sum(self.num_trades) / self.iterations
            accuracy = sum(self.accuracys) / self.iterations
            stddev = np.std(self.means)
            current_avg = sum(self.current_means) / self.iterations
            current_num_trades = sum(self.current_num_trades) / self.iterations
            current_accuracy = sum(self.current_accuracys) / self.iterations
            current_stddev = np.std(self.current_means)
            #print(self.this_runs_avg, this_runs_num_trades, self.buy_cutoff, self.means, stddev, self.this_years_avg, accuracy)
            out_df = pd.DataFrame([[
                self.this_runs_avg, stddev, this_runs_num_trades, accuracy,
                current_avg, current_stddev, current_num_trades,
                current_accuracy, self.buy_cutoff,
                str(self.means),
                str(self.num_trades),
                str(self.features)
            ]])
            out_df.columns = [
                'Avg Return', 'Std Dev', 'Avg Num Trades', 'Accuracy',
                'Current Avg Return', 'Current Std Dev',
                'Current Avg Num Trades', 'Current Accuracy', 'Buy Cutoff',
                'Returns', 'Num Trades', 'Features'
            ]
            print(out_df)
            print(self.max_means)
            #self.test.to_csv('test.csv')
            #input()
            out_df.to_sql('current_predictions', self.conn, if_exists='append')
        except Exception as e:
            print(e)

    def remove_added_feature(self):
        """Drop the most recently added feature from the active list."""
        print('removing added feature ', self.feature_added)
        self.features.remove(self.feature_added)

    # TODO: add two features at a time
    def add_feature(self):
        """Append the next candidate feature that is not already active.

        Raises IndexError once the candidate list is exhausted; ``__init__``
        uses that to end the search.
        """
        self.feature_added = self.start_feature_imp.pop(0)
        while self.feature_added in self.features:
            print('not adding feature', self.feature_added,
                  'as it already exists')
            self.feature_added = self.start_feature_imp.pop(0)
        print('adding feature', self.feature_added)
        self.features.append(self.feature_added)

    def prepare_data(self):
        """Label ``self.test_df``, clean it and split it by report date.

        Produces ``train`` (everything else), ``test`` (2018), ``train_2019``
        (train + 2018) and ``test_2019`` (2019 onwards).
        """
        frame = self.test_df
        dates = frame['Date Reported']
        in_2018 = ((dates >= datetime.strptime('2018-01-01', '%Y-%m-%d')) &
                   (dates <= datetime.strptime('2018-12-31', '%Y-%m-%d')))
        from_2019 = dates >= datetime.strptime('2019-01-01', '%Y-%m-%d')
        above_cutoff = (
            frame['10 Day Change Abnormal'].to_numpy() > self.buy_cutoff)

        # Write through .loc with positional boolean masks. Assigning into
        # `column.values[mask]` relies on .values being a writable view,
        # which pandas copy-on-write does not allow.
        frame['is_train'] = 'Train'
        frame.loc[in_2018.to_numpy(), 'is_train'] = 'Test 2018'
        frame.loc[from_2019.to_numpy(), 'is_train'] = 'Test 2019'
        frame['Action'] = 'None'
        frame.loc[above_cutoff, 'Action'] = "Buy"
        frame['Action'] = frame['Action'].astype('category')
        # Remember which integer code stands for which label, so predict()
        # stays correct even when only one of the two labels occurs.
        self.action_labels = dict(enumerate(frame['Action'].cat.categories))
        frame["Action Code"] = frame["Action"].cat.codes

        frame = frame[self.features + [
            'Action', 'Action Code', 'is_train', '10 Day Change Abnormal',
            '10 Day Change', 'Date Reported', 'Symbol'
        ]]
        # '-' placeholders and infinities count as missing; such rows go.
        frame = frame.replace('-', np.nan)
        frame = frame.replace([np.inf, -np.inf], np.nan)
        frame = frame.dropna()
        self.test_df = frame

        split = frame['is_train']
        # Copies: predict() adds a 'Predicted' column to the test frames.
        self.train = frame[split == 'Train'].copy()
        self.test = frame[split == 'Test 2018'].copy()
        self.train_2019 = pd.concat(
            [frame[split == 'Train'], frame[split == 'Test 2018']])
        self.test_2019 = frame[split == 'Test 2019'].copy()

    def train_model(self, fast=False):
        """Fit one forest on ``train`` and one on ``train_2019``.

        ``fast`` is accepted for compatibility and currently unused.
        """
        self.clf = ExtraTreesClassifier(n_jobs=-1, n_estimators=500)
        #self.clf = RandomForestClassifier(n_jobs=-1)
        self.clf.fit(self.train[self.features], self.train['Action Code'])

        self.clf_2019 = ExtraTreesClassifier(n_jobs=-1, n_estimators=500)
        #self.clf = RandomForestClassifier(n_jobs=-1)
        self.clf_2019.fit(self.train_2019[self.features],
                          self.train_2019['Action Code'])

    def _label_predictions(self, clf, frame):
        """Return ``clf``'s predictions for ``frame`` as action labels."""
        codes = clf.predict(frame[self.features])
        return [self.action_labels[int(code)] for code in codes]

    def predict(self):
        """Add a 'Predicted' ("Buy"/"None") column to both test frames."""
        self.test['Predicted'] = self._label_predictions(self.clf, self.test)
        self.test_2019['Predicted'] = self._label_predictions(
            self.clf_2019, self.test_2019)

    def get_results(self, test_data):
        """Score the rows of ``test_data`` that were predicted "Buy".

        Also refreshes ``self.feature_imp`` from ``self.clf``.

        Returns:
            tuple: (mean '10 Day Change' in percent rounded to 4 places,
            number of "Buy" rows, share of those rows with a positive
            change). With no "Buy" rows the mean and the share are NaN
            instead of raising ZeroDivisionError.
        """
        self.feature_imp = pd.Series(
            self.clf.feature_importances_,
            index=self.features).sort_values(ascending=False)
        if self.first_run:
            print(self.feature_imp)
        chosen = test_data[test_data['Predicted'] == 'Buy']
        num_chosen = len(chosen)
        mean_return = round(chosen['10 Day Change'].mean() * 100, 4)
        if num_chosen:
            winners = len(chosen[chosen['10 Day Change'] > 0])
            accuracy = winners / float(num_chosen)
        else:
            accuracy = float('nan')
        return mean_return, num_chosen, accuracy
# Draw the correlation matrix as a heatmap on the already created figure.
plt.matshow(corrmat, fignum=figure.number)
plt.colorbar().ax.tick_params(labelsize=20, length=10)
# plt.title(f"Correlations at {window_ms}ms windows and {stride_ms}ms overlap", fontsize=30)
# One x tick per data column, labelled with its index. The labels are derived
# from the column count (instead of a fixed range(22)) so ticks and labels
# always have the same length.
n_columns = data.shape[1]
plt.xticks(range(n_columns), list(range(n_columns)), fontsize=20)
plt.tick_params(length=10, bottom=False)
plt.clim(-1, 1)
# Add feature names as y-axis labels; the extra empty ticks at -0.5 and
# n_columns - 0.5 sit on the outer edges of the first and last row.
plt.yticks([-0.5] + list(range(n_columns)) + [n_columns - 0.5],
           [""] + column_labels + [""],
           fontsize=25)
plt.savefig('heatmap.png', bbox_inches='tight')
plt.show()

# Feature importance: fit an ExtraTrees forest and plot the 15 largest
# importances, labelled with the human-readable attribute descriptions.
indices = [
    datapoint_attribute_descriptions[label] for label in datapoint_features
]
classifier = ExtraTreesClassifier(n_estimators=250)
classifier.fit(X, y)
importance = pd.Series(classifier.feature_importances_, index=indices)
importance.nlargest(15).plot(kind='barh')
plt.show()

# Feature durations: the two non-feature columns are dropped before plotting.
durations_path = f"data\\feature\\{conf.imp_type}\\{conf.dos_type}\\mixed_validation_time_100ms_100ms.csv"
feature_times = datareader_csv.load_feature_durations(durations_path)
del feature_times['time_ms']
del feature_times['class_label']
feature_plotting.plot_feature_barcharts(feature_times)
pred_gbt = pred_gbt + list(model_gbt.fit(X[indxs_to_fit[:]], y[indxs_to_fit[:]]).predict_proba(X[indxs,:])[:,1]) new_Y = new_Y + list(y[indxs[:]]) new_X = np.hstack((np.array(pred_ridge).reshape(len(pred_ridge), 1), np.array(pred_randomforest).reshape(len(pred_randomforest), 1), np.array(pred_lasso).reshape(len(pred_lasso), 1), np.array(pred_gbt).reshape(len(pred_gbt), 1))) print new_X new_Y = np.array(new_Y).reshape(len(new_Y), 1) # <codecell> #model_stacker = lm.LogisticRegression() model_stacker = ExtraTreesClassifier(n_estimators=250, random_state=0) print np.mean(cross_val_score(model_stacker, new_X, new_Y.reshape(new_Y.shape[0]), cv=5)) model_stacker.fit(new_X, new_Y.reshape(new_Y.shape[0])) #save model to disk filename = 'blendedmodel.sav' pickle.dump(model_stacker, open(filename, 'wb')) print "all done Teerth" importances = model_stacker.feature_importances_ std = np.std([tree.feature_importances_ for tree in model_stacker.estimators_], axis=0) indices = np.argsort(importances)[::-1] # Print the feature ranking print("Feature ranking:") for f in range(X.shape[1]): print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Linear and polynomial-kernel support vector classifiers.
SVM = LinearSVC(random_state=42, loss="hinge")
# NOTE(review): this rebinds the name `SVC` from the class to an instance, so
# running this statement a second time (or constructing another SVC later)
# will fail -- confirm whether a different variable name was intended.
SVC = SVC(random_state=42, kernel = "poly", degree = 3, C=67)
# (name, estimator) pairs; presumably meant for a voting ensemble defined in
# a later cell -- verify against the rest of the notebook.
named_estimators = [ ("random_forest_clf", random_forest_clf), ("extra_trees_clf", extra_trees_clf), ("SVM", SVM) ]
# Degree-3 polynomial feature expansion followed by a linear SVM.
poly= Pipeline([ ("polyfeat", PolynomialFeatures(degree=3)), ("svm_clf", LinearSVC(C=67, loss="hinge")) ])
# Fit the extra-trees model alone and score it on the test split.
extra_trees_clf.fit(X_train, y_train)
y_pred = extra_trees_clf.predict(X_test)
# Bare expression: the accuracy is only displayed when run as a notebook cell.
accuracy_score(y_test, y_pred)
# In[ ]:
# Train every individual estimator on the training split
# (extra_trees_clf is fitted a second time here).
estimators = [random_forest_clf, extra_trees_clf, SVM]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)
# In[ ]:
# Bare expressions below are only displayed when run interactively.
tar_test.shape
tar_train.describe() # 1 (positive) more often -> always predict positive
tar_test.describe() # 0.56

# Baseline random forest with 25 trees.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=25)
classifier = classifier.fit(pred_train, tar_train)
predictions = classifier.predict(pred_test)
sklearn.metrics.confusion_matrix(tar_test, predictions)
sklearn.metrics.accuracy_score(tar_test, predictions) #0.57

# Feature importances from an extra-trees model, stored as a one-row frame
# ('Imp') whose columns are the predictor names.
model = ExtraTreesClassifier()
model.fit(pred_train, tar_train)
var_name = (pred_train.columns.tolist())
var_sig = (list(model.feature_importances_))
var_imp = DataFrame(columns=var_name)
# One importance per predictor column, whatever their number (previously
# hard-coded to the first 7).
var_imp.loc['Imp'] = var_sig
# `.ix` was removed from pandas; `.loc` performs the same label lookup here.
# NOTE(review): the result of this expression is discarded outside an
# interactive session -- confirm whether it should be assigned or printed.
var_imp[var_imp.loc[var_imp.last_valid_index()].argsort()[::-1]]

# Refit the random forest with 1..25 trees.
trees = range(25)
accuracy = np.zeros(25)
for idx in range(len(trees)):
    classifier = RandomForestClassifier(n_estimators=idx + 1)
    classifier = classifier.fit(pred_train, tar_train)
class ExtraTreesClassifier(IterativeComponentWithSampleWeight,
                           BaseClassificationModel):
    """Incrementally fitted wrapper around scikit-learn's extra-trees classifier.

    The forest is grown ``n_iter`` trees at a time through ``warm_start`` and
    is capped at ``get_max_iter()`` trees.
    """

    def __init__(self, criterion, min_samples_leaf, min_samples_split,
                 max_features, bootstrap, max_leaf_nodes, max_depth,
                 min_weight_fraction_leaf, min_impurity_decrease,
                 oob_score=False, n_jobs=1, random_state=None, verbose=0,
                 class_weight=None):
        """Validate and coerce the hyperparameter values.

        Raises:
            ValueError: if ``criterion`` is neither "gini" nor "entropy".
        """
        self.n_estimators = self.get_max_iter()
        if criterion not in ("gini", "entropy"):
            raise ValueError(
                "'criterion' is not in ('gini', 'entropy'): %s" % criterion)
        self.criterion = criterion

        # check_none decides whether the configured value means "no limit".
        self.max_depth = None if check_none(max_depth) else int(max_depth)
        self.max_leaf_nodes = (None if check_none(max_leaf_nodes)
                               else int(max_leaf_nodes))

        self.min_samples_leaf = int(min_samples_leaf)
        self.min_samples_split = int(min_samples_split)
        self.max_features = float(max_features)
        self.bootstrap = check_for_bool(bootstrap)
        self.min_weight_fraction_leaf = float(min_weight_fraction_leaf)
        self.min_impurity_decrease = float(min_impurity_decrease)
        self.oob_score = oob_score
        self.n_jobs = int(n_jobs)
        self.random_state = random_state
        self.verbose = int(verbose)
        self.class_weight = class_weight
        # Created lazily by the first iterative_fit call.
        self.estimator = None

    @staticmethod
    def get_max_iter():
        """Return the maximum number of trees the forest may grow to."""
        return 512

    def get_current_iter(self):
        """Return the number of trees currently requested from the forest."""
        return self.estimator.n_estimators

    def iterative_fit(self, X, y, sample_weight=None, n_iter=1, refit=False):
        """Fit the forest, adding ``n_iter`` trees to an existing one.

        With ``refit=True`` (or on the first call) a new forest of ``n_iter``
        trees is created; otherwise the existing forest is extended, never
        beyond ``self.n_estimators`` trees. Returns ``self``.
        """
        from sklearn.ensemble import ExtraTreesClassifier as ETC

        if refit:
            self.estimator = None

        forest = self.estimator
        if forest is not None:
            # warm_start keeps the fitted trees; only the increment is grown.
            forest.n_estimators = min(forest.n_estimators + n_iter,
                                      self.n_estimators)
        else:
            # max_features is an exponent applied to the number of columns.
            n_candidate_features = int(
                X.shape[1] ** float(self.max_features))
            forest_kwargs = dict(
                n_estimators=n_iter,
                criterion=self.criterion,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                max_features=n_candidate_features,
                max_leaf_nodes=self.max_leaf_nodes,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                min_impurity_decrease=self.min_impurity_decrease,
                oob_score=self.oob_score,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                random_state=self.random_state,
                class_weight=self.class_weight,
                warm_start=True,
            )
            self.estimator = ETC(**forest_kwargs)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def configuration_fully_fitted(self):
        """Return True once the forest holds the maximum number of trees."""
        if self.estimator is None:
            return False
        return len(self.estimator.estimators_) >= self.n_estimators

    def predict(self, X):
        """Predict labels for ``X``; NotImplementedError before any fit."""
        if self.estimator is None:
            raise NotImplementedError
        return self.estimator.predict(X)

    def predict_proba(self, X):
        """Predict class probabilities; NotImplementedError before any fit."""
        if self.estimator is None:
            raise NotImplementedError()
        raw_probas = self.estimator.predict_proba(X)
        return convert_multioutput_multiclass_to_multilabel(raw_probas)

    @staticmethod
    def get_properties(dataset_properties=None):
        """Describe the capabilities of this component."""
        return {
            'shortname': 'ET',
            'name': 'Extra Trees Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': True,
            'handles_multioutput': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, ),
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        """Build the configuration space searched for this component."""
        cs = ConfigurationSpace()

        criterion = CategoricalHyperparameter(
            "criterion", ["gini", "entropy"], default_value="gini")

        # The maximum number of features used in the forest is calculated as
        # m^max_features, where m is the total number of features, and
        # max_features is the hyperparameter specified below. The default is
        # 0.5, which yields sqrt(m) features as max_features in the
        # estimator. This corresponds with Geurts' heuristic.
        max_features = UniformFloatHyperparameter(
            "max_features", 0., 1., default_value=0.5)

        max_depth = UnParametrizedHyperparameter(name="max_depth",
                                                 value="None")
        min_samples_split = UniformIntegerHyperparameter(
            "min_samples_split", 2, 20, default_value=2)
        min_samples_leaf = UniformIntegerHyperparameter(
            "min_samples_leaf", 1, 20, default_value=1)
        min_weight_fraction_leaf = UnParametrizedHyperparameter(
            'min_weight_fraction_leaf', 0.)
        max_leaf_nodes = UnParametrizedHyperparameter("max_leaf_nodes",
                                                      "None")
        min_impurity_decrease = UnParametrizedHyperparameter(
            'min_impurity_decrease', 0.0)
        bootstrap = CategoricalHyperparameter(
            "bootstrap", ["True", "False"], default_value="False")

        hyperparameters = [
            criterion, max_features, max_depth, min_samples_split,
            min_samples_leaf, min_weight_fraction_leaf, max_leaf_nodes,
            min_impurity_decrease, bootstrap
        ]
        cs.add_hyperparameters(hyperparameters)
        return cs
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA

# Feature matrix; its first `train_length` rows are paired with the labels
# read from train.csv.
pif = np.loadtxt('processed_imputed_features.txt')
complaint_status = pd.read_csv('train.csv')['Complaint-Status']
train_length = len(complaint_status)

# Encode the status strings as consecutive integers.
le = LabelEncoder()
y = le.fit_transform(complaint_status)

# Importance of every feature column according to an extra-trees forest.
train_features = pif[:train_length, :]
etc = ExtraTreesClassifier()
etc.fit(train_features, y)
print(etc.feature_importances_)

# Project the first three feature columns onto two principal components and
# scatter the labelled rows, coloured by encoded class.
pca = PCA(n_components=2)
pcad = pca.fit_transform(pif[:, :3])
colors = ['red', 'green', 'blue', 'purple', 'yellow']
class_cmap = matplotlib.colors.ListedColormap(colors)
plt.scatter(pcad[:train_length, 0], pcad[:train_length, 1],
            c=y, cmap=class_cmap)
plt.show()
print(XGBClassifier_accy) # 0.816 # AdaBoost Classifier from sklearn.ensemble import AdaBoostClassifier adaboost = AdaBoostClassifier() adaboost.fit(x_train, y_train) y_pred = adaboost.predict(x_test) adaboost_accy = round(accuracy_score(y_pred, y_test), 3) print(adaboost_accy) # 0.786 # Extra Trees Classifier from sklearn.ensemble import ExtraTreesClassifier ExtraTreesClassifier = ExtraTreesClassifier() ExtraTreesClassifier.fit(x_train, y_train) y_pred = ExtraTreesClassifier.predict(x_test) extraTree_accy = round(accuracy_score(y_pred, y_test), 3) print(extraTree_accy) # 0.786 # Gaussian Process Classifier from sklearn.gaussian_process import GaussianProcessClassifier GaussianProcessClassifier = GaussianProcessClassifier() GaussianProcessClassifier.fit(x_train, y_train) y_pred = GaussianProcessClassifier.predict(x_test) gau_pro_accy = round(accuracy_score(y_pred, y_test), 3) print(gau_pro_accy) # 0.786 # 投票法 from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

# --- Training data: eight predictors followed by the class column ---------
filename = '../../datasets/pima-indians_classification_train.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
inputx = array[:, :8]
outputy = array[:, 8]

# Mean 10-fold cross-validated score of a 100-tree extra-trees model,
# followed by a fit on the complete training set.
num_folds = 10
kfold = KFold(n_splits=num_folds, random_state=None)
model = ExtraTreesClassifier(n_estimators=100)
results = cross_val_score(model, inputx, outputy, cv=kfold)
print(results.mean())
model.fit(inputx, outputy)

# --- Unlabelled data: the same eight predictors, no class column ----------
filename = '../../datasets/pima-indians_classification_test.csv'
names = names[:-1]
newdataframe = read_csv(filename, names=names)
array = newdataframe.values
inputx = array[:, :8]
print(inputx)

# Predict once, show the raw predictions, then one message per row.
results = model.predict(inputx)
print(results)
for val in results:
    message = ("diabetes not probable" if val == 0
               else "probability of getting diabetes")
    print(message, end=" ")
print()
#####KNN #X=np.vstack((ca3,cb3)) X=c5.drop('Class',axis=1) #x_n4=X[0:int(len(X)/4)] #X=x_n4 #label=np.zeros(len(ca3)+len(cb3)) #label[0:200]=1 #label[200:len(label)]=2 label=c5['Class'] y=label # fit an Extra Trees model to the data model = ExtraTreesClassifier() model.fit(X_res, y_res) # display the relative importance of each attribute print(model.feature_importances_) a=model.feature_importances_ yy=pd.DataFrame(label) yn=(yy == 0).astype(int).sum() yp=(yy == 1).astype(int).sum() ## SMOTE from collections import Counter from sklearn.datasets import make_classification from imblearn.over_sampling import SMOTE