# Classifier-comparison setup; imports restored so the snippet is self-contained
import numpy as np
from sklearn.datasets import make_classification, make_moons, make_circles
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
         "Random Forest", "AdaBoost", "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [make_moons(noise=0.3, random_state=0),
            make_circles(noise=0.2, factor=0.5, random_state=1),
            linearly_separable]
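# The lists above are never consumed in this excerpt; a minimal sketch
# (assumed, not from the original) of the usual evaluation loop over the
# three datasets and seven classifiers:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

for ds_X, ds_y in datasets:
    ds_X = StandardScaler().fit_transform(ds_X)
    X_tr, X_te, y_tr, y_te = train_test_split(ds_X, ds_y, test_size=.4,
                                              random_state=42)
    for name, clf in zip(names, classifiers):
        clf.fit(X_tr, y_tr)
        print(name, clf.score(X_te, y_te))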
if feat_select == 1:
    '''Three steps:
       1) Run Feature Selection
       2) Get lists of selected and non-selected features
       3) Filter columns from original dataset
    '''
    print('--FEATURE SELECTION ON--', '\n')

    ## 1) Run Feature Selection #######
    if fs_type == 1:
        # Stepwise Recursive Backwards Feature removal
        if binning == 1:
            clf = RandomForestClassifier(n_estimators=200, max_depth=None,
                                         min_samples_split=3,
                                         criterion='entropy',
                                         random_state=rand_st)
            sel = RFE(clf, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Random Forest: ')
        if binning == 0:
            # 'mse' was renamed to 'squared_error' in scikit-learn 1.0
            rgr = RandomForestRegressor(n_estimators=500, max_depth=None,
                                        min_samples_split=3,
                                        criterion='squared_error',
                                        random_state=rand_st)
            sel = RFE(rgr, n_features_to_select=k_cnt, step=.1)
            print('Stepwise Recursive Backwards - Random Forest: ')

        fit_mod = sel.fit(data_np, target_np)
        print(sel.ranking_)
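    # Steps 2) and 3) are described in the docstring but missing from this
    # excerpt; a minimal sketch (assumed, not from the original) using the
    # fitted selector's boolean support mask:
    ## 2) Get lists of selected and non-selected features #######
    support = sel.get_support()
    selected = [i for i, keep in enumerate(support) if keep]
    removed = [i for i, keep in enumerate(support) if not keep]
    print('Selected:', selected, ' Removed:', removed)
    ## 3) Filter columns from original dataset #######
    data_np = data_np[:, support]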
def mcode(ite):
    R = 0.5
    e11 = []
    e12 = []
    e21 = []
    e22 = []
    e31 = []
    e32 = []
    e41 = []
    e42 = []
    e51 = []
    e52 = []
    e8 = []
    elaterf = []
    elaterfdis = []

    # data reading: each branch loads one cohort, concatenates its five
    # feature files column-wise, and slices the result into five views
    if ite == 0:
        ss = "lowGrade"
        url = '../lowGrade/text_lg_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../lowGrade/label_lowGrade.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)
        for i in range(4):
            url = '../lowGrade/text_lg_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)
        Xnew1 = X[:10, 0:1680]
        Xnew2 = X[:10, 1680:3360]
        Xnew3 = X[:10, 3360:5040]
        Xnew4 = X[:10, 5040:6720]
        Xnew5 = X[:10, 6720:6745]
    elif ite == 1:
        ss = "IDHCodel"
        url = '../IDHCodel/text_pr_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../IDHCodel/label_IDHCodel.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)
        Y = Y[:10]
        for i in range(4):
            url = '../IDHCodel/text_pr_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)
        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]
    elif ite == 2:
        ss = "nonIDH1"
        url = '../nonIDH1/text_nonIDH1_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../nonIDH1/label_nonIDH1.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)
        for i in range(4):
            url = '../nonIDH1/text_nonIDH1_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)
        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]
    else:
        ss = "progression"
        url = '../progression/text_pr_1.csv'
        dataframe = pandas.read_csv(url, header=None)
        array = dataframe.values
        X = array
        Y = pandas.read_csv('../progression/label_progression.csv', header=None)
        Y = Y.values
        Y = np.ravel(Y)
        print(Y.shape)
        for i in range(4):
            url = '../progression/text_pr_' + str(i + 2) + '.csv'
            dataframe = pandas.read_csv(url, header=None)
            array = dataframe.values
            X1 = array
            print(X1.shape)
            X = np.concatenate((X, X1), axis=1)
        Xnew1 = X[:, 0:1680]
        Xnew2 = X[:, 1680:3360]
        Xnew3 = X[:, 3360:5040]
        Xnew4 = X[:, 5040:6720]
        Xnew5 = X[:, 6720:6745]

    testfile = open(("RR" + ss + "%f_%f.txt" % (R, ite)), 'w')
    erfsvm = []
    for ii in range(1):
        seed = 1000 + ii
        train_indices, test_indices = splitdata(X=X[:10, :], Y=Y, ratio=R,
                                                seed=seed)
        print("Start rest")

        # view 1
        X_features_train1, X_features_test1, w1, pred1 = RR_rf_dis(
            n_trees=10, X=Xnew1, Y=Y, train_indices=train_indices,
            test_indices=test_indices, seed=seed)
        m12 = RandomForestClassifier(n_estimators=500, random_state=seed,
                                     oob_score=True, n_jobs=1).fit(
                                         X_features_train1, Y[train_indices])
        pre1 = m12.predict(X_features_test1)
        print("finished view1")
        #e12.append(m12.score(X_features_test1, Y[test_indices]))
        #e11.append(w1)

        # view 2
        X_features_train2, X_features_test2, w2, pred2 = RR_rf_dis(
            n_trees=500, X=Xnew2, Y=Y, train_indices=train_indices,
            test_indices=test_indices, seed=seed)
        m22 = RandomForestClassifier(n_estimators=500, random_state=seed,
                                     oob_score=True, n_jobs=1).fit(
                                         X_features_train2, Y[train_indices])
        pre2 = m22.predict(X_features_test2)
        #e22.append(m22.score(X_features_test2, Y[test_indices]))
        #e21.append(w2)

        # view 3
        X_features_train3, X_features_test3, w3, pred3 = RR_rf_dis(
            n_trees=500, X=Xnew3, Y=Y, train_indices=train_indices,
            test_indices=test_indices, seed=seed)
        m32 = RandomForestClassifier(n_estimators=500, random_state=seed,
                                     oob_score=True, n_jobs=1).fit(
                                         X_features_train3, Y[train_indices])
        pre3 = m32.predict(X_features_test3)
        #e32.append(m32.score(X_features_test3, Y[test_indices]))
        #e31.append(w3)

        # view 4
        X_features_train4, X_features_test4, w4, pred4 = RR_rf_dis(
            n_trees=500, X=Xnew4, Y=Y, train_indices=train_indices,
            test_indices=test_indices, seed=seed)
        m42 = RandomForestClassifier(n_estimators=500, random_state=seed,
                                     oob_score=True, n_jobs=1).fit(
                                         X_features_train4, Y[train_indices])
        pre4 = m42.predict(X_features_test4)
        #e42.append(m42.score(X_features_test4, Y[test_indices]))
        #e41.append(w4)

        # view 5
        X_features_train5, X_features_test5, w5, pred5 = RR_rf_dis(
            n_trees=500, X=Xnew5, Y=Y, train_indices=train_indices,
            test_indices=test_indices, seed=seed)
        m52 = RandomForestClassifier(n_estimators=500, random_state=seed,
                                     oob_score=True, n_jobs=1).fit(
                                         X_features_train5, Y[train_indices])
        pre5 = m52.predict(X_features_test5)
        #e52.append(m52.score(X_features_test5, Y[test_indices]))
        #e51.append(w5)

        # Late RF: majority vote over the per-view random-forest predictions
        resall1 = np.column_stack((pred1, pred2, pred3, pred4, pred5))
        Laterf = list(range(len(test_indices)))
        for i in range(len(test_indices)):
            Laterf[i], empty = Counter(resall1[i]).most_common()[0]
        LRF = accuracy_score(Y[test_indices], Laterf)
        elaterf.append(LRF)

        # Late RF dis: majority vote over the dissimilarity-based predictions
        resall = np.column_stack((pre1, pre2, pre3, pre4, pre5))
        LSVTres = list(range(len(test_indices)))
        for i in range(len(test_indices)):
            LSVTres[i], empty = Counter(resall[i]).most_common()[0]
        LSVTscore = accuracy_score(Y[test_indices], LSVTres)
        elaterfdis.append(LSVTscore)

        # multi view: average the five feature representations
        X_features_trainm = (X_features_train1 + X_features_train2 +
                             X_features_train3 + X_features_train4 +
                             X_features_train5) / 5
        X_features_testm = (X_features_test1 + X_features_test2 +
                            X_features_test3 + X_features_test4 +
                            X_features_test5) / 5
        mv = RandomForestClassifier(n_estimators=500, random_state=seed,
                                    oob_score=True, n_jobs=1).fit(
                                        X_features_trainm, Y[train_indices])
        e8.append(mv.score(X_features_testm, Y[test_indices]))

        # RFSVM: SVM with a precomputed kernel on the averaged features
        c = nLsvm_patatune(train_x=X_features_trainm,
                           train_y=Y[train_indices],
                           test_x=X_features_testm,
                           test_y=Y[test_indices])
        clf = SVC(C=c, kernel='precomputed')
        clf.fit(X_features_trainm, Y[train_indices])
        erfsvm.append(clf.score(X_features_testm, Y[test_indices]))

    testfile.write("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2),
                                         floored_percentage(np.std(erfsvm), 2)) + '\n')
    testfile.write("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                          floored_percentage(np.std(e8), 2)) + '\n')
    testfile.write(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2),
                                          floored_percentage(np.std(elaterf), 2)) + '\n')
    testfile.write(" LATERFDIS&%s pm%s & " % (floored_percentage(np.mean(elaterfdis), 2),
                                              floored_percentage(np.std(elaterfdis), 2)) + '\n')
    print(ss)
    print("RFSVM&%s pm%s & " % (floored_percentage(np.mean(erfsvm), 2),
                                floored_percentage(np.std(erfsvm), 2)) + '\n')
    print("RFDIS &%s pm%s & " % (floored_percentage(np.mean(e8), 2),
                                 floored_percentage(np.std(e8), 2)) + '\n')
    print(" LATERF&%s pm%s &" % (floored_percentage(np.mean(elaterf), 2),
                                 floored_percentage(np.std(elaterf), 2)) + '\n')
    print(" LATERFDIS&%s pm%s & " % (floored_percentage(np.mean(elaterfdis), 2),
                                     floored_percentage(np.std(elaterfdis), 2)) + '\n')
X = df.drop(['ticker'], axis=1).iloc[1:, :]

# Several targets were tried; each assignment below overwrites the last.
y = df.pct_chg[1:] > 0.01      # classification target: did the stock move > 1%?
#y = df.pct_chg.shift(-1)
y = df.pct_chg
X_train, X_test, y_train, y_test = train_test_split(X, y[:-1], test_size=0.02,
                                                    random_state=42)
y = df.adjclose                # regression target: adjusted close price
X_train, X_test, y_train, y_test = train_test_split(X.drop(['close'], axis=1),
                                                    y, test_size=0.02,
                                                    random_state=42)

# Final split: hold out the last 30 rows as a test window
X_train = X.drop(['close'], axis=1).iloc[:-30, :]
X_test = X.drop(['close'], axis=1).iloc[-30:, :]
y_train = y[:-31]
y_test = y[-30:]

# Instantiate model with 1000 decision trees; the regressor overwrites the
# classifier, so only the regressor is actually trained
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf = RandomForestRegressor(n_estimators=1000, random_state=42)

# Train the model on training data
rf.fit(X_train, y_train)

# Predict from the held-out features; the original predicted from
# y_test.values, which is a bug (the model expects feature columns)
pred_ = pd.Series(rf.predict(X_test))
pred_.index = y_test.index  # set_axis(..., inplace=True) is deprecated

errors_ = pd.DataFrame(deepcopy(y_test))
errors_['preds'] = pred_.values
confusion_matrix = pd.crosstab(errors_['pct_chg'], errors_['preds'],
                               rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix)
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV

dados_dengue = pd.read_csv('dados/caso-dengue2018_C.csv', delimiter=';',
                           low_memory=False)
X = dados_dengue.drop(['tp_sexo', 'tp_classificacao_final',
                       'tp_criterio_confirmacao', 'resultado'], axis=1)
y = dados_dengue['resultado']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

rfc = RandomForestClassifier(bootstrap=True, class_weight=None,
                             criterion='gini', max_depth=10,
                             max_features='auto', max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             min_impurity_split=None, min_samples_leaf=1,
                             min_samples_split=2,
                             min_weight_fraction_leaf=0.0, n_estimators=500,
                             n_jobs=None, oob_score=False, random_state=0,
                             verbose=0, warm_start=False)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)

param_grid = [
    {'n_estimators': [100, 250, 500],
     'max_features': [5, 10, 'auto'],
     'max_depth': [10, 50, None],
     'bootstrap': [True, False]}
]
grid_search_forest = GridSearchCV(rfc, param_grid, cv=10, scoring='roc_auc')
grid_search_forest.fit(X_train, y_train)
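# The metrics imported above are never used in this excerpt; a minimal
# evaluation sketch (assumed, not from the original) for the tuned model,
# assuming a binary 'resultado' label:
best_rfc = grid_search_forest.best_estimator_
print(grid_search_forest.best_params_)
print(classification_report(y_test, best_rfc.predict(X_test)))
print('ROC AUC:', roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:, 1]))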
''' logistic regression '''
lr = LogisticRegression().fit(train, yTrain)
yhat = lr.predict(test)
result = gen_result(test_id, yhat)
result.to_csv('./data/submission2.csv', index=False)
print('logistic regression finished!')

''' random forest '''
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy',
                                    random_state=42)
classifier.fit(train, yTrain)
y_pred = classifier.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission3.csv', index=False)
print('random forest finished!')

''' Ada boost '''
ada_params = {
    'n_estimators': 200,
    'learning_rate': 0.75
}
clf = AdaBoostClassifier(**ada_params)
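# The AdaBoost classifier is constructed but never trained in this excerpt;
# a sketch following the pattern of the two models above (the submission
# filename is an assumption):
clf.fit(train, yTrain)
y_pred = clf.predict(test)
result = gen_result(test_id, y_pred)
result.to_csv('./data/submission4.csv', index=False)
print('Ada boost finished!')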
# Not sure which one is used by mljar
# Based on trial/error, chose 2016 for the constructor and cv_state for the
# train_test_split
# Documented in diary/_posts/2017-06-29....md
random_seed = [
    2016,
    clf_mlj.selected_algorithm.params['random_seed'],
    clf_mlj.selected_algorithm.params['train_params']['cv_state'],
    None
]

########################
print("Random forest with same params")
i = 0
j = 2
clf_skl = RandomForestClassifier(
    n_estimators=5,
    criterion=mljar_fit_params['criterion'],
    max_features=mljar_fit_params['max_features'],
    min_samples_split=mljar_fit_params['min_samples_split'],
    min_samples_leaf=mljar_fit_params['min_samples_leaf'],
    random_state=random_seed[i])

# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
# skf = 5
skf = StratifiedKFold(n_splits=validation_kfolds, shuffle=True,
                      random_state=random_seed[j])

# http://scikit-learn.org/stable/modules/generated/sklearn.calibration.CalibratedClassifierCV.html
clf_skl_sig = CalibratedClassifierCV(clf_skl, cv=skf, method='isotonic')  # or 'sigmoid'
clf_skl_sig.fit(X, y)
# scores = ['precision', 'recall']
# from sklearn.model_selection import GridSearchCV
# for score in scores:
#     #model = GridSearchCV(SVC(), tuned_parameters, cv=5, scoring='%s_macro' % score)
#     model = GridSearchCV(RandomForestClassifier(), tuned_parameters,
#                          scoring='%s_macro' % score)
#     model.fit(xtrain, ytrain)
#     test_pred = model.predict(xtest)
#     train_pred = model.predict(xtrain)
#     from sklearn.metrics import confusion_matrix
#     cfmatrix1 = confusion_matrix(ytest, test_pred)
#     cfmatrix2 = confusion_matrix(ytrain, train_pred)
#     print(cfmatrix1)
#     print(cfmatrix2)
#     print("Best parameters set found on development set:")
#     print(model.best_params_)

#model = SVC(kernel='rbf', C=100, gamma=0.0001)
#model = SVC(kernel='linear', C=10)
model = RandomForestClassifier(n_estimators=10)
#model = RandomForestClassifier()
model.fit(xtrain, ytrain)
test_pred = model.predict(xtest)
train_pred = model.predict(xtrain)

from sklearn.metrics import confusion_matrix
cfmatrix1 = confusion_matrix(ytest, test_pred)
cfmatrix2 = confusion_matrix(ytrain, train_pred)
# the original used Python 2 print statements; print is a function in Python 3
print(cfmatrix1)
print(cfmatrix2)
def random_forest(X, y):
    model_tree = RandomForestClassifier(random_state=100, n_estimators=50)
    sel_rfe_tree = RFE(estimator=model_tree, n_features_to_select=8, step=1)
    X_train_rfe_tree = sel_rfe_tree.fit_transform(X, y)
    return sel_rfe_tree.get_support()
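# A minimal usage sketch (assumed, not from the original), on synthetic data:
# the returned boolean mask keeps only the 8 selected feature columns.
from sklearn.datasets import make_classification
X_demo, y_demo = make_classification(n_samples=200, n_features=20,
                                     random_state=0)
support = random_forest(X_demo, y_demo)
X_selected = X_demo[:, support]
print(X_selected.shape)  # (200, 8)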
#Data for training
from sklearn.model_selection import train_test_split
y = df['Diabetes']
X = df.drop('Diabetes', axis=1)

# SMOTE(kind='borderline2') is the old imbalanced-learn API; recent versions
# expose BorderlineSMOTE and fit_resample instead of fit_sample
from imblearn.over_sampling import BorderlineSMOTE
X_resampled, y_resampled = BorderlineSMOTE(kind='borderline-2').fit_resample(X, y)
YR = pd.Series(y_resampled)
XR = pd.DataFrame(X_resampled)
X_train, X_test, y_train, y_test = train_test_split(XR, YR, random_state=0)

#Model creation:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=15, max_depth=None,
                             min_samples_split=4, random_state=0)

#Training
clf = clf.fit(X_train, y_train)

# sklearn.cross_validation was removed; cross_val_score now lives in
# sklearn.model_selection
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_test, y_test)

from sklearn.metrics import log_loss, f1_score, precision_score, accuracy_score, confusion_matrix, roc_curve, auc
yrdn_pre = clf.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

Result = pd.DataFrame()
Result['Test'] = ["Logloss", "F1 Score", "Precision", "Accuracy", 'ROC AUC']
# the original was truncated after f1_score; the remaining entries follow
# from the 'Test' labels above
Result['Random F'] = [
    log_loss(y_test, yrdn_pre),
    f1_score(y_test, yrdn_pre),
    precision_score(y_test, yrdn_pre),
    accuracy_score(y_test, yrdn_pre),
    roc_auc,
]
def third_generation(X, y, size=200, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ), momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neighbors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neighbors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neighbors_number, ['uniform', 'distance'])
    ]

    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_params = list(itertools.product(['gini', 'entropy'],
                                       [1, 2, 3, 4, 5, None],
                                       [None, 'sqrt', 'log2'],
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c, max_depth=d, max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]
    et_clf = [
        ExtraTreeClassifier(criterion=c, max_depth=d, max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)],
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]
    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]  # the original listed bag_dt here, a copy-paste bug
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(nb_bag_stumps))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + \
        ada_dt_clf + ada_et_clf + dt_bag_clf + et_bag_clf + stump_bag_clf + \
        bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + \
        dt_name + et_name + ada_dt_name + ada_et_name + dt_bag_name + \
        et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
        bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]
    # return the names of the sampled estimators in matching order (the
    # original returned the full, unshuffled pool_name list)
    estimator_names = [pool_name[i] for i in order[:size]]
    return estimators, estimator_names
def createModel(data, scoring='precision', drop_backers_count=False,
                drop_staff_pick=True):
    data = featureEngineering.prepDataFrameForPreprocessor(
        data, drop_backers_count=drop_backers_count,
        drop_staff_pick=drop_staff_pick)
    print(data.columns)
    preprocessor = featureEngineering.fitPreprocessor(data)
    X = data.drop("state", axis=1)
    y = data["state"]
    print("before preprocessing")
    X = preprocessor.transform(X)
    print("features engineered")
    print("X", X.shape)
    print("y", y.shape)

    rf_model = RandomForestClassifier()
    # every entry holds a single value, so this "grid" cross-validates
    # exactly one configuration
    param_rf = {
        "n_estimators": [1000],
        "criterion": ['entropy'],
        "max_depth": [None],
        "min_samples_split": [2],
        "min_samples_leaf": [1],
        "min_weight_fraction_leaf": [0.0],
        "max_features": ['auto'],
        "max_leaf_nodes": [None],
        "min_impurity_decrease": [0.0],
        "min_impurity_split": [None],
        "bootstrap": [True],
        "oob_score": [False],
        "n_jobs": [None],
        "random_state": [RSEED],
        "verbose": [0],
        "warm_start": [False],
        "class_weight": [None],
        "ccp_alpha": [0.0],
        "max_samples": [None],
    }
    grid_rf = GridSearchCV(rf_model, param_grid=param_rf, cv=5,
                           scoring=scoring, verbose=5, n_jobs=-1)
    grid_rf.fit(X, y)
    print("model trained")
    rf_model = grid_rf.best_estimator_
    #y_log_pred_test = lg_model.predict(X_test)

    filename = './models/modelBackerRF.sav'
    pickle.dump(rf_model, open(filename, 'wb'))
    print("model saved")
    filename = './models/preprocessorBackerRF.sav'
    pickle.dump(preprocessor, open(filename, 'wb'))
    print("preprocessor saved")
# imports assumed from the surrounding script
import argparse
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", type=str, default="knn",
                help="type of python machine learning model to use")
args = vars(ap.parse_args())

# define the dictionary of models our script can use, where the key
# to the dictionary is the name of the model (supplied via command
# line argument) and the value is the model itself
models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="rbf", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(),
    "random_forest": RandomForestClassifier(n_estimators=100),
    "mlp": MLPClassifier()
}

# load the Iris dataset and perform a training and testing split,
# using 75% of the data for training and 25% for evaluation
print("[INFO] loading data...")
dataset = load_iris()
(trainX, testX, trainY, testY) = train_test_split(dataset.data,
                                                  dataset.target,
                                                  random_state=3,
                                                  test_size=0.25)

# train the model
print("[INFO] using '{}' model".format(args["model"]))
model = models[args["model"]]
model.fit(trainX, trainY)
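# a minimal evaluation sketch (assumed, not from the original) for the
# trained model on the held-out 25%:
from sklearn.metrics import classification_report
predictions = model.predict(testX)
print(classification_report(testY, predictions,
                            target_names=dataset.target_names))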
dataset = pd.DataFrame(X)
dataset['Label'] = Y
print(dataset['Label'].unique())
print(dataset['Label'].value_counts())

## If we do not want to include pixels with value 0,
## e.g. sometimes unlabeled pixels may be given a value 0.
dataset = dataset[dataset['Label'] != 0]

# Redefine X and Y for Random Forest
X_for_RF = dataset.drop(labels=['Label'], axis=1)
Y_for_RF = dataset['Label']

# RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=30, random_state=42)

# Train the model on training data
model.fit(X_for_RF, Y_for_RF)

#############################################
# Save model for future use
filename = 'RF_model.sav'
pickle.dump(model, open(filename, 'wb'))

# Load model....
loaded_model = pickle.load(open(filename, 'rb'))

# Test on a different image
# READ EXTERNAL IMAGE...
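# A minimal sanity-check sketch (assumed, not from the original): the
# reloaded model should reproduce the in-memory model's predictions before
# it is applied to an external image.
import numpy as np
assert np.array_equal(loaded_model.predict(X_for_RF),
                      model.predict(X_for_RF))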
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Model Building #####
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state=0, n_estimators=100,
                                    criterion='entropy')
classifier.fit(X_train, y_train)

# Predicting Test Set
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# the original was truncated here; a plausible completion (assumed)
# collecting the computed metrics into a results table
model_results = pd.DataFrame(
    [['Random Forest (entropy)', acc, prec, rec, f1]],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
a[1].title.set_text('relative frequency')
sb.heatmap(m2, annot=True, ax=a[1])
plt.show()
print(m1)
print(m2)

# In[107]:

print(classification_report(yte, ypr))
print(accuracy_score(yte, ypr))

# # Random Forest

# In[111]:

rnf_cls = RandomForestClassifier(criterion='entropy', random_state=0)
rnf_cls.fit(Xtr.toarray(), ytr)

# In[112]:

ypr = rnf_cls.predict(Xte.toarray())

# In[113]:

f, a = plt.subplots(1, 2, figsize=(20, 8))
m1, m2 = confusion_matrix(yte, ypr), confusion_matrix(yte, ypr,
                                                      normalize='true')
a[0].title.set_text('absolute frequency')
sb.heatmap(m1, annot=True, ax=a[0])
a[1].title.set_text('relative frequency')
knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy:', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy:', knn.score(X_test_std[:, k3], y_test))

# # Assessing feature importance with Random Forests

feat_labels = df_wine.columns[1:]
forest = RandomForestClassifier(n_estimators=500, random_state=1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]],
                            importances[indices[f]]))

plt.title('Feature Importance')
plt.bar(range(X_train.shape[1]), importances[indices], align='center')
n=labels.size, classes=labels))

# Print the matrix size of the image and the training data
print('Our img matrix is sized: {sz}'.format(sz=img.shape))
print('Our roi array is sized: {sz}'.format(sz=roi.shape))

# Initialize the feature and the training data
X = img[:, :, :]
y = roi[roi > 0]

# Extra: cross validation with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=2)

# Initialize our model with 10 trees (the original comment said 500 trees,
# which did not match the code)
rf = RandomForestClassifier(n_estimators=10, oob_score=True)

# Split the feature and training sample into 5
for Train_index, Test_index in kf.split(X):
    X_Train, X_Test = X[Train_index], X[Test_index]
    y_Train, y_Test = y[Train_index], y[Test_index]
    print('Our X_Train is sized: {sz}'.format(sz=X_Train.shape))
    print('Our X_Test is sized: {sz}'.format(sz=X_Test.shape))
    print('Our y_Train is sized: {sz}'.format(sz=y_Train.shape))
    print('Our y_Test is sized: {sz}'.format(sz=y_Test.shape))

    nsamples, nx, ny = X_Train.shape
    d2_X_Train = X_Train.reshape((nsamples, nx * ny))
    print('Our d2_X_Train is sized: {sz}'.format(sz=d2_X_Train.shape))
    nsamples, nx, ny = X_Test.shape
def cross_validation(list_models, X_train, y_train, scoring, cv):
    fitted_models = {key: None for key in list_models}
    for eModel in list_models:
        if eModel == 'SGDClassifier':
            from sklearn.linear_model import SGDClassifier
            clf_sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
            scores = cross_val_score(clf_sgd, X_train, y_train,
                                     scoring=scoring, cv=cv)
            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = clf_sgd
            fitted_models[eModel]['Scores'] = scores
            fitted_models[eModel]['Mean'] = scores.mean()
            fitted_models[eModel]['Standard deviation'] = scores.std()
            y_scores = cross_val_predict(clf_sgd, X_train, y_train, cv=cv,
                                         method='decision_function')
            classification_performance_measure(fitted_models, eModel,
                                               clf_sgd, X_train, y_train,
                                               cv, y_scores)
        if eModel == 'RandomForestClassifier':
            from sklearn.ensemble import RandomForestClassifier
            clf_forest = RandomForestClassifier(n_estimators=100,
                                                random_state=42)
            y_probas_forest = cross_val_predict(clf_forest, X_train, y_train,
                                                cv=cv, method="predict_proba")
            y_scores = y_probas_forest[:, 1]  # Positive class probabilities
            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = clf_forest
            classification_performance_measure(fitted_models, eModel,
                                               clf_forest, X_train, y_train,
                                               cv, y_scores)
        if eModel == 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            train_samples = X_train.shape[0]
            #reg_log = LogisticRegression()
            #reg_log = LogisticRegression(C=50. / train_samples, penalty='l1', solver='saga', tol=0.1)
            reg_log = LogisticRegression(solver='newton-cg')
            y_probas_log = cross_val_predict(reg_log, X_train, y_train,
                                             cv=cv, method="predict_proba")
            y_scores = y_probas_log[:, 1]  # Positive class probabilities
            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = reg_log
            classification_performance_measure(fitted_models, eModel,
                                               reg_log, X_train, y_train,
                                               cv, y_scores)
        if eModel == 'SGDRegressor':
            # the original imported (and instantiated) "SGSRegressor", a typo
            from sklearn.linear_model import SGDRegressor
            reg_sgd = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None,
                                   eta0=0.1)
            print('MISSING FITTING')
            fitted_models[eModel] = {}
            fitted_models[eModel]['model'] = reg_sgd
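# A minimal sketch (assumed, not from the original) of the scoring the
# SGDRegressor branch above marks as MISSING, mirroring the SGDClassifier
# branch with a regression metric:
#            scores = cross_val_score(reg_sgd, X_train, y_train,
#                                     scoring='neg_mean_squared_error', cv=cv)
#            fitted_models[eModel]['Scores'] = scores
#            fitted_models[eModel]['Mean'] = scores.mean()
#            fitted_models[eModel]['Standard deviation'] = scores.std()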
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from vecstack import stacking
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from drop_highlycorelated import clf, xtrain, ytrain, xtest, ytest, X_important_train, X_important_test

models = [
    svm.SVC(kernel='linear', C=1),
    RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=1000,
                           max_depth=3),
    BaggingClassifier(svm.SVC(kernel='linear', C=1))
]

S_train, S_test = stacking(
    models,                        # list of models
    X_important_train, ytrain, X_important_test,  # data
    mode='oof_pred_bag',           # oof for train set, predict test set in
                                   # each fold and find mean
    regression=False,              # classification task; the original passed
                                   # True, which contradicts the classifiers
                                   # in `models`
    save_dir=None,                 # do not save result and log (to save in
                                   # current dir - set to '.')
    metric=mean_absolute_error,    # metric: callable
    n_folds=4,                     # number of folds
    shuffle=True,                  # shuffle the data
    random_state=0,                # ensure reproducibility
    verbose=2)
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    ----------
    estimator : str
        Name of scikit-learn estimator.
    random_state : Any number
        Seed to use in randomized components.
    n_jobs : int
        Number of processing cores to use.
    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object
    mode : str
        Flag to indicate whether classifier performs classification
        or regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC": SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR": SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression": LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression": LinearRegression(n_jobs=n_jobs,
                                             fit_intercept=True),
        "SGDClassifier": SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor": SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier": DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor": DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier": RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor": RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier": ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor": ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier": GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor": GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        # note: the two "Hist" keys below fall back to the non-histogram
        # gradient-boosting implementations with the same parameters
        "HistGradientBoostingClassifier": GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingRegressor": GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "MLPClassifier": MLPClassifier(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "MLPRegressor": MLPRegressor(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "GaussianNB": GaussianNB(),
        "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
        "KNeighborsClassifier": KNeighborsClassifier(
            n_neighbors=p["n_neighbors"], weights=p["weights"],
            n_jobs=n_jobs),
        "KNeighborsRegressor": KNeighborsRegressor(
            n_neighbors=p["n_neighbors"], weights=p["weights"],
            n_jobs=n_jobs),
    }

    # define classifier
    model = estimators[estimator]

    # classification or regression
    if estimator in (
            "LogisticRegression", "SGDClassifier", "MLPClassifier",
            "DecisionTreeClassifier", "RandomForestClassifier",
            "ExtraTreesClassifier", "GradientBoostingClassifier",
            "HistGradientBoostingClassifier", "GaussianNB",
            "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis",
            "SVC", "KNeighborsClassifier"):
        mode = "classification"
    else:
        mode = "regression"

    return (model, mode)
data_train = pd.concat(frames)
data_train.info()

bb = data_train.iloc[:, 4:6749]
cc = bb.apply(lambda x: x.fillna(x.mean()), axis=0)
cc['tag'] = data_train.iloc[:, 3:4]

test = dfp.iloc[:, 2:6744].apply(lambda x: x.fillna(x.mean()), axis=0)
Xtest = adalist
x_data_output = dfp.iloc[:, 0:1].values
predictors = adalist

alg = RandomForestClassifier(random_state=1, n_estimators=62,
                             min_samples_split=2, min_samples_leaf=1)
# random_state has no effect (and newer scikit-learn versions raise an
# error) when shuffle=False, so it is dropped here
kf = model_selection.KFold(n_splits=33, shuffle=False)
scores = model_selection.cross_val_score(alg, cc[predictors], cc['tag'],
                                         cv=kf)
print("scores.mean=", scores.mean())

File = open("data/prob_radomforest_features.txt", "w", encoding=u'utf-8',
            errors='ignore')
File.write("id" + ",")
File.write("prob" + "\n")
classifier = alg.fit(cc[predictors], cc['tag'])
predictiontest = classifier.predict_proba(test)
for step in range(len(test)):
    File.write(str(x_data_output[step]) + ",")
    File.write(str(predictiontest[step]) + "\n")
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

# define a dictionary for different classifiers and their parameters
classifiers = {
    "Dummy":         DummyClassifier(strategy='uniform', random_state=2),
    "KNN(3)":        KNeighborsClassifier(3),
    "RBF SVM":       SVC(gamma=2, C=1),
    "Decision Tree": DecisionTreeClassifier(max_depth=7),
    "Random Forest": RandomForestClassifier(max_depth=7, n_estimators=10,
                                            max_features=4),
    "xgboost":       XGBClassifier(),
    "Neural Net":    MLPClassifier(alpha=1),
    "AdaBoost":      AdaBoostClassifier(),
    "Naive Bayes":   GaussianNB(),
    "QDA":           QuadraticDiscriminantAnalysis(),
    "Linear SVC":    LinearSVC(),
    "Linear SVM":    SVC(kernel="linear"),
    "Gaussian Proc": GaussianProcessClassifier(1.0 * RBF(1.0)),
}

from time import time
nfast = 10  # run only the first nfast learners; skip the very slow ones at the end
head = list(classifiers.items())[:nfast]
for name, classifier in head:
    start = time()  # remember training start time
def RandomForest(Number_leaves, X_train, y_train, X_test):
    # define a random forest with the selected minimum leaf size (the
    # original comments said "decision tree")
    clf = RandomForestClassifier(n_estimators=100,
                                 min_samples_leaf=Number_leaves,
                                 class_weight="balanced")
    clf = clf.fit(X_train, y_train)  # train the forest on the training set
    y_pred = clf.predict(X_test)     # predict the values of X_test
    return (clf, y_pred)
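# A minimal usage sketch (assumed, not from the original), on synthetic
# data: sweep the minimum-leaf-size setting and compare test accuracy.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
Xd, yd = make_classification(n_samples=300, random_state=0)
Xd_tr, Xd_te, yd_tr, yd_te = train_test_split(Xd, yd, random_state=0)
for leaves in (1, 5, 10):
    clf, y_pred = RandomForest(leaves, Xd_tr, yd_tr, Xd_te)
    print(leaves, (y_pred == yd_te).mean())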
test_data[test_data[0::, 9] == '', 9] = np.round(np.mean(
    test_data[test_data[0::, 9] != '', 9].astype(float)))

# All the missing prices assume the median of their respective class
# (xrange, np.float, and the print statements below were Python 2;
# ported to Python 3)
for i in range(np.size(test_data[0::, 0])):
    if test_data[i, 7] == '':
        test_data[i, 7] = np.median(
            test_data[(test_data[0::, 7] != '') &
                      (test_data[0::, 0] == test_data[i, 0]), 7].astype(float))

test_data = np.delete(test_data, [1, 6], 1)  # remove the name, cabin and ticket data

# The data is now ready to go. So lets train then test!
print('Training ')
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data[0::, 1::], train_data[0::, 0])

print('Predicting')
output = forest.predict(test_data)

open_file_object = csv.writer(open("../csv/myfirstforest.csv", "w",
                                   newline=''))
test_file_object = csv.reader(open('../csv/test.csv', 'r'))  # load the csv file
next(test_file_object)
i = 0
for row in test_file_object:
    row.insert(0, output[i].astype(np.uint8))
def build_model():
    # text-processing front end followed by a multi-output random forest
    pipeline = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultiOutputClassifier(RandomForestClassifier()))])
    return pipeline
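# A minimal usage sketch (assumed, not from the original): X is an iterable
# of raw text documents and Y a 2-D array of binary labels, one column per
# output category; `tokenize` is the caller's tokenizer.
# model = build_model()
# model.fit(X_train, Y_train)
# Y_pred = model.predict(X_test)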
y = lbl.fit_transform(y)
y  # male is encoded as 1, female as 0

# split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

acc_scores = []
roc_scores = []
clf = RandomForestClassifier(n_estimators=150)
clf.fit(X_train, y_train)
clf.score(X_train, y_train)
y_pred = clf.predict(X_test)
acc_scores.append(accuracy_score(y_test, y_pred))
roc_scores.append(roc_auc_score(y_test, y_pred))
acc_scores[0], roc_scores[0]

import pickle
pickle.dump(clf, open('model.pkl', 'wb'))

# Loading model to compare the results
model = pickle.load(open('model.pkl', 'rb'))
print(model.predict([[
    0.077315503, 0.083829421, 0.036718459, 0.008701057, 0.131908017,
train_X = train[feature_list]
train_y = train['AdoptionSpeed']

random_grid = {'n_estimators': [int(x) for x in np.linspace(start=100, stop=500, num=5)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1, 2, 4],
               'bootstrap': [True, False]}

#skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
# oob_score=True is required below, otherwise oob_score_ is undefined
classifier = RandomForestClassifier(oob_score=True)
param_search = RandomizedSearchCV(estimator=classifier,
                                  param_distributions=random_grid,
                                  n_iter=50, cv=3, verbose=2,
                                  random_state=42, n_jobs=-1)
param_search.fit(train_X, train_y)

fitted_classifier = classifier.fit(train_X, train_y)
fitted_classifier.oob_score_
cross_val_score(fitted_classifier, train_X, train_y, cv=5,
                scoring='f1_macro')

# predict from the features, not the target (the original passed train_y)
predictions = classifier.predict(train_X)
#filename='blood.csv'
#filename='2dplanes.csv'
filename = 'custom_satisfaction.csv'
ns = [1, 2, 4, 8, 16, 32]

for n in ns:
    data = pd.read_csv(filename)
    data = data.drop('ID', axis=1)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    #f = int(math.log(X.shape[1] + 1, 2))

    # In[24]:

    start = time.time()
    clf = RandomForestClassifier(n_estimators=n, random_state=0)
    clf.fit(X, y)
    taken = time.time() - start
    print(taken)
    pd.DataFrame([[filename, taken, n]]).to_csv('rf_sk.txt', mode='a',
                                                index=False, header=False)

    # In[25]:

    start = time.time()
    clf = BaggingClassifier(n_estimators=n, random_state=0)
    clf.fit(X, y)
    taken = time.time() - start
    print(taken)
#plot.target(X)
y = X['Type']  # target we want to predict
X.drop('Type', axis=1, inplace=True)
#print(X.head())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      test_size=0.2,
                                                      random_state=0)

### first model
model = RandomForestClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_valid)
print(confusion_matrix(y_valid, predictions))
print("Standard Random Forest: ", model.score(X_valid, y_valid))

### model with parameter optimization
#### n_estimators
ns = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
scores = []
for n in ns:
    model = RandomForestClassifier(n_estimators=n)
    model.fit(X_train, y_train)
    predictions = model.predict(X_valid)
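    # the excerpt stops here; a plausible completion (assumed) records each
    # validation score so the best n_estimators can be picked afterwards
    scores.append(model.score(X_valid, y_valid))
best_n = ns[scores.index(max(scores))]
print("Best n_estimators: ", best_n)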