# Grid search over random-forest hyper-parameters.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Single-step pipeline; the step name 'clf' is the prefix used in the grid keys.
pipeline = Pipeline([('clf', RandomForestClassifier(criterion='gini'))])

# Candidate values, keyed as "<step>__<parameter>".
parameters = {
    'clf__n_estimators': (1000, 2000, 3000),
    'clf__max_depth': (100, 200, 300),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2)
}

# 5-fold search scored by accuracy, using all available cores.
grid_search = GridSearchCV(pipeline,
                           parameters,
                           n_jobs=-1,
                           cv=5,
                           verbose=1,
                           scoring='accuracy')
grid_search.fit(x_train, y_train)

# Report the best cross-validated score and the winning value of each
# searched parameter.
print(f'Best Training score: {grid_search.best_score_:0.3f}')
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print(f'\t{param_name}: {best_parameters[param_name]!r}')

# Evaluate on the test split.
predictions = grid_search.predict(x_test)
print("Testing accuracy:", round(accuracy_score(y_test, predictions), 4))
print("\nComplete report of Testing data\n",
      classification_report(y_test, predictions))
X_train, X_test, y_train, y_test = train_test_split(select_X, y1, test_size=0.2, random_state=0) print(X_train.shape) print(y_train.shape) print(X_test.shape) print(y_test.shape) #cross validation param_dist = {'n_neighbors': range(1, 30), 'weights': ["uniform", "distance"]} cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0) grid = GridSearchCV(kNN(), param_grid=param_dist, cv=cv) grid.fit(X_train, y_train.values.ravel()) best_estimator = grid.best_estimator_ print(best_estimator) #nach cross validation bekommen wir best_estimator. clf = best_estimator print('the acuracy for all is:') print(clf.score(X_test, y_test.values.ravel())) prediction = clf.predict(X_test) print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, prediction)) print("Classification report:\n %s\n" %
optimizer='adadelta', metrics=['accuracy']) return model dense_size_candidates = [[32], [64], [32, 32], [64, 64]] my_classifier = KerasClassifier(make_model, batch_size=32) validator = GridSearchCV( my_classifier, param_grid={ 'dense_layer_sizes': dense_size_candidates, # nb_epoch is avail for tuning even when not # an argument to model building function 'nb_epoch': [3, 6], 'nb_filters': [8], 'nb_conv': [3], 'nb_pool': [2] }, scoring='neg_log_loss', n_jobs=1) validator.fit(X_train, y_train) print('The parameters of the best model are: ') print(validator.best_params_) # validator.best_estimator_ returns sklearn-wrapped version of best model. # validator.best_estimator_.model returns the (unwrapped) keras model best_model = validator.best_estimator_.model metric_names = best_model.metrics_names
axes[idx].set_title(label) #不能这样加坐标轴标题,因为plt是加到当前绘制的图(最后一个子图)里 #plt.xlabel("Sepal Width [std]") #plt.ylabel("Petal length [std]") plt.text(-3.5, -4.5, s="Sepal Width [std]", ha="center", va="center", fontsize=12) plt.text(-10.5, 4.5, s="Petal Length [std]", ha="center", va="center", fontsize=12, rotation=90) plt.show() # mv_clf.get_params() # 获得当前 estimator的全部参数名称,便于进行 GridSearch # In[4]: from sklearn.model_selection import GridSearchCV params = {'decisiontreeclassifier__max_depth': [1, 2], 'pipeline-1__clf__C': [0.001, 0.1, 100] } grid = GridSearchCV(cv=5, estimator=mv_clf, n_jobs=1, scoring="roc_auc", param_grid=params) grid.fit(X_train, y_train) import pandas as pd gridSearchResult = pd.DataFrame(grid.cv_results_) gridSearchResult[["mean_test_score", "params"]].head(5) print("best score: %0.3f ; best params: %s" % (grid.best_score_, grid.best_params_))
descriptor__k=[10], classify__kernel=["rbf"], classify__gamma= [.002], classify__C=[1]) ''' params = dict(descriptor__descType=["SpatialPyramids"], descriptor__numFeatures=[512], descriptor__k=[600], classify__kernel=[CodeBook.pyramidMatchKernel], classify__gamma=[0.0001, 0.01, 10], classify__C=[1]) # Cross-validate start = time.time() grid = GridSearchCV(pipe, cv=6, n_jobs=1, param_grid=params) grid.fit(train_images_filenames, train_labels) end = time.time() # save results in a file saveXVal(grid) # print results print(grid.best_params_) print("All done in ", str(end - start), " seconds.") print("Best parameters set found on development set:") print() print(grid.best_params_) print()
"lambda_": [1e-5, 1e-4, 1e-3], } X_train = data_tr y_train = target_tr X_test = data_ts y_test = target_ts # initialise model n_int_fold = 10 # number of folds model = ValentiMLP(**default_params, n_batch=int(len(data_tr) / n_int_fold)) # grid search grid_search = GridSearchCV(model, cv=n_int_fold, n_jobs=-1, param_grid=param_grid, verbose=2, return_train_score=True) grid_search.fit(X_train, y_train) # print results cv_result = pd.DataFrame(grid_search.cv_results_) pprinter = pp.PrettyPrinter(indent=4) print(cv_result[[ "param_eta", "param_alpha", "param_lambda_", "param_n_hidden", "mean_train_score", "std_train_score", "mean_test_score", "std_test_score" ]]) print("Best parameters:") pprinter.pprint(grid_search.best_params_) # refit model over whole training set
def main():
    """Fit a class-weighted random forest on the credit data and write a submission.

    Reads ``cs-training.csv`` and ``cs-test.csv``, holds out a stratified
    third of the training rows for validation, mean-imputes missing values,
    tunes ``max_features`` with a 10-fold ROC-AUC grid search, passes the
    second ``predict_proba`` column for the training and held-out parts to
    ``computeAUC``, and writes the test-file probabilities to
    ``rf_benchmark.csv``.

    Returns:
        None. Results are printed and written to disk.
    """
    # ------------------------------------------------------------------
    # 1. Load training and test data.
    #    Per-column sentinel values are turned into NaN while reading:
    #    0 for age, 98/96 for the three "NOTime*" counters, 'NA' elsewhere.
    # ------------------------------------------------------------------
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59',
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90',
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA',
               [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)

    dftrain = pd.read_csv("cs-training.csv", names=colnames,
                          na_values=col_na_values, skiprows=[0])
    dftrain.pop("ID")  # the ID column must not be used as a feature
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    # `.values` instead of `as_matrix()`: the latter no longer exists in
    # current pandas, while `.values` is available in every version.
    x_train = dftrain.values

    dftest = pd.read_csv("cs-test.csv", names=colnames,
                         na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    dftest.pop("label")  # drop the label column so only features remain
    x_test = dftest.values

    # ------------------------------------------------------------------
    # 2. Split the training data into a new training part and a held-out
    #    validation part, keeping the class ratio (stratified split).
    # ------------------------------------------------------------------
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    # n_splits=1, so the splitter yields exactly one (train, test) index pair.
    train_index, test_index = next(sss.split(x_train, y_train))
    print("TRAIN: %s TEST: %s" % (train_index, test_index))
    x_test_new = x_train[test_index]
    y_test_new = y_train[test_index]
    x_train = x_train[train_index]
    y_train = y_train[train_index]

    # ------------------------------------------------------------------
    # 3. Impute missing values with the column mean learned on the new
    #    training part only.
    # NOTE(review): `Imputer` is gone from recent scikit-learn releases
    # (SimpleImputer replaces it); left untouched because the file's
    # imports are not visible here - confirm against the pinned version.
    # ------------------------------------------------------------------
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    # ------------------------------------------------------------------
    # 4a. Random forest; class imbalance is handled through
    #     class_weight="balanced_subsample" ("balanced" is the alternative).
    # ------------------------------------------------------------------
    rf = RandomForestClassifier(n_estimators=100,
                                oob_score=True,
                                min_samples_split=2,
                                min_samples_leaf=50,
                                n_jobs=-1,
                                class_weight="balanced_subsample",
                                bootstrap=True)

    # ------------------------------------------------------------------
    # 4b. Parameter tuning with cross-validated grid search.
    #     A wider grid that was tried before:
    #       max_features [2,3,4,5], min_samples_leaf [30,40,50,100],
    #       criterion ["gini", "entropy"]
    # NOTE(review): the `iid` argument is not accepted by recent
    # scikit-learn releases - confirm against the pinned version.
    # ------------------------------------------------------------------
    param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
    grid_search = GridSearchCV(rf, cv=10, scoring='roc_auc',
                               param_grid=param_grid, iid=False)

    # ------------------------------------------------------------------
    # 4c. Fit on the new training part and report the best setting.
    #     Single-argument print() calls behave the same on Python 2 and 3.
    # ------------------------------------------------------------------
    grid_search.fit(x_train, y_train)
    print("the best parameter: %s" % (grid_search.best_params_,))
    print("the best score: %s" % (grid_search.best_score_,))

    # How well does the model fit the data it was trained on?
    predicted_probs_train = grid_search.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)

    # How well does it do on the held-out validation part?
    predicted_probs_test_new = grid_search.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    # Predict the real test file and write the submission.
    predicted_probs_test = grid_search.predict_proba(x_test)
    predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
    submission = pd.DataFrame({
        'ID': test_id,
        'Probabilities': predicted_probs_test
    })
    submission.to_csv("rf_benchmark.csv", index=False)
def train(self, datas, labels, model="LGBM", gridsearch=False, parameters=None):
    """Resample, fit and evaluate one of the classifiers in ``self.clf_models``.

    The inputs are vectorised with ``self.get_vec``, labels are mapped to
    integer ids, the data is resampled with SMOTEENN and split 80/20. The
    chosen classifier is then fitted (optionally through a grid search),
    evaluated with ``self.model_metrics``, its ROC curve is plotted, the
    fitted model is dumped with joblib into the current working directory
    and the evaluation is written to ``training_evaluation.txt``.

    Args:
        datas: iterable of items accepted by ``self.get_vec``.
        labels: label container providing ``unique()`` and ``map()``
            (presumably a pandas Series - confirm against callers).
        model: key into ``self.clf_models``; also used in the plot title
            and the dump file name.
        gridsearch: when true, tune the classifier with ``GridSearchCV``.
        parameters: required when ``gridsearch`` is true; a mapping with
            ``'params'`` (the parameter grid) and ``'cv'``.

    Raises:
        ValueError: if ``gridsearch`` is true and ``parameters`` is None.
    """
    if gridsearch and parameters is None:
        raise ValueError(
            "parameters (with 'params' and 'cv') is required when gridsearch=True")

    x_vec = np.array([self.get_vec(i) for i in datas])

    # Map every distinct label to an integer id, in order of first appearance.
    label_uni = labels.unique()
    num_class = label_uni.size
    label_map = {label: ind for ind, label in enumerate(label_uni)}
    print(label_map)
    _labels = labels.map(label_map)
    print("Original dataset shape %s" % Counter(_labels))

    # NOTE(review): newer imbalanced-learn releases name this method
    # `fit_resample`; confirm against the pinned version.
    smote_enn = SMOTEENN(random_state=0)
    x_sample, y_sample = smote_enn.fit_sample(x_vec, _labels)
    print(sorted(Counter(y_sample).items()))
    print('re-sampled dataset shape %s' % Counter(y_sample))

    x_train, x_test, y_train, y_test = train_test_split(x_sample,
                                                        y_sample,
                                                        test_size=0.2,
                                                        random_state=123)
    print("classifier model is:%s" % model)
    # `classes` is passed by keyword so the call works whether or not the
    # installed scikit-learn accepts it positionally.
    y_one_hot = label_binarize(y_test, classes=np.arange(num_class))
    clf_model = self.clf_models[model]

    def _plot_roc(y_score):
        """Plot the ROC curve of ``y_score`` against the binarised test labels."""
        if y_one_hot.shape[1] == 1 and y_score.shape[1] == 2:
            # Two classes: the binarised target is a single column, so it
            # is compared with the probability of the second class instead
            # of ravelling two arrays of different length.
            fpr, tpr, _ = roc_curve(y_one_hot.ravel(), y_score[:, 1])
        else:
            # Multi-class: flatten both matrices into one long curve.
            fpr, tpr, _ = roc_curve(y_one_hot.ravel(), y_score.ravel())
        roc_auc = auc(fpr, tpr)
        plt.figure(figsize=(10, 10))
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=2.0,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2.0, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('%s ROC curve' % model)
        plt.legend(loc="lower right")
        plt.show()

    if gridsearch:
        gsearch = GridSearchCV(clf_model,
                               param_grid=parameters['params'],
                               scoring='accuracy',
                               cv=parameters['cv'],
                               n_jobs=-1)
        gsearch.fit(x_train, y_train)
        print("Best score: %0.3f" % gsearch.best_score_)
        print("Best parameters set:")
        best_parameters = gsearch.best_estimator_.get_params()
        for param_name in sorted(parameters['params'].keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        evaluation = self.model_metrics(gsearch.best_estimator_, x_test, y_test)
        print(evaluation)
        _plot_roc(gsearch.predict_proba(x_test))
        joblib.dump(gsearch.best_estimator_,
                    os.path.join(os.getcwd(), model + "gsearch.m"))
    else:
        clf_model.fit(x_train, y_train)
        joblib.dump(clf_model, os.path.join(os.getcwd(), model + ".m"))
        evaluation = self.model_metrics(clf_model, x_test, y_test)
        print(evaluation)
        y_score = clf_model.predict_proba(x_test)
        print(y_score - y_one_hot)
        _plot_roc(y_score)

    # NOTE(review): the source had lost its indentation; this write is
    # placed after the if/else because `evaluation` is set in both branches.
    with open("training_evaluation.txt", 'w') as f:
        s = model + '\n' + str(evaluation) + str(label_map)
        f.write(s)
# Append the first n_onehot one-hot attribute names to the column list.
col.extend(onehot_attributes[i] for i in range(n_onehot))

df_imputed_scaled = pd.DataFrame(data_array, columns=col)
print(df_imputed_scaled.shape)

# Features: every column except the target.
x = df_imputed_scaled.drop(['good_bad'], axis=1)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
# NOTE(review): the target is taken from `df`, not from df_imputed_scaled -
# presumably the un-scaled labels are wanted; confirm row order matches.
y = df['good_bad']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import tree

# 10-fold search over the tree depth (4..19), four parallel jobs.
parameters = {'max_depth': range(4, 20)}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4, cv=10)
clf.fit(X=x, y=y)

# Predictions are made on the same rows the search was fitted on, so the
# figures printed below are computed on the training data.
predictions = clf.predict(x)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)
print("Accuracy")
print(accuracy_score(y, predictions))
mat = confusion_matrix(y, predictions)
print(classification_report(y, predictions))

features = list(x.columns.values)
print(features)

from IPython.core.display import display, Image
print("To display the tree")
#Nous initialisons ensuite un DecisionTreeClassifierobjet avec deux arguments. decTree = DecisionTreeClassifier(criterion='gini', random_state=50) #Enfin, nous ajustons le modèle sur les données d’entraînement decTree.fit(X_train, y_train) # évaluons sa précision sur les données de test. decTree.score(X_test, y_test) y_pred = decTree.predict(X_test) # Evalution avec Matrice de Confusion cm = confusion_matrix(y_test, y_pred) #Creation de grille des differents hyperparameters grid_params = { 'max_depth': [1, 2, 3, 4, 5, 6], 'min_samples_leaf': [0.02, 0.04, 0.06, 0.08] } #ous créons un GridSearchCVobjet avec le classifieur de l’arbre de décision comme estimateur grid_object = GridSearchCV(estimator=decTree, param_grid=grid_params, scoring='accuracy', cv=10) #Nous ajustons ensuite cet objet de grille aux données d'apprentissage grid_object.fit(X_train, y_train) #Extraction des meilleures parametres grid_object.best_params_
1,树的个数 2,树的最大深度 3,内部节点最少样本数与叶节点最少样本数 4,特征个数 此外,调参过程中选择的误差函数是均值误差,5倍折叠 ''' X, y = trainData[numFeatures2], trainData['rec_rate'] ''' 网格搜索参数 ''' param_test1 = {'n_estimators': range(10, 80, 5)} #从10-80每5格取一个值 gsearch1 = GridSearchCV(estimator=RandomForestRegressor(min_samples_split=50, min_samples_leaf=10, max_depth=8, max_features='sqrt', random_state=10), param_grid=param_test1, scoring='neg_mean_squared_error', cv=5) gsearch1.fit(X, y) print(gsearch1.best_params_, gsearch1.best_score_) best_n_estimators = gsearch1.best_params_['n_estimators'] #估计出的最佳数个数 param_test2 = { 'max_depth': range(3, 21), 'min_samples_split': range(10, 100, 10) } gsearch2 = GridSearchCV(estimator=RandomForestRegressor( n_estimators=best_n_estimators, min_samples_leaf=10, max_features='sqrt',
#Check model performance on test data y.value_counts() y_pred = dtClassifier.predict(X_test) from sklearn import metrics metrics.roc_auc_score(y_test, y_pred) #GridSearchCV to find optimal max_depth with Gini index as splitting criteria from sklearn.model_selection import GridSearchCV params_grid = {'criterion': ['gini'], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]} classifier = DecisionTreeClassifier() clf = GridSearchCV(estimator=classifier, param_grid=params_grid, cv=10, scoring='roc_auc') clf.fit(X_train, y_train) clf.best_params_ clf.best_score_ #Optimal max_depth = 4 dtClassifierOpt = DecisionTreeClassifier(max_depth=4, criterion='gini') dtClassifierOpt.fit(X_train, y_train) #Displaying the decision tree (Meed GraphViz software installed on machine) from sklearn.tree import export_graphviz import pydotplus as pdot from IPython.display import Image #Export the tree into an odt file
def dcv_clf(X, y, model, param_grid, niter):
    """
    Double (nested) cross validation for a binary classifier.

    For each of ``niter`` repetitions the data is split into 3 shuffled
    outer folds; on every outer training part a 3-fold ``GridSearchCV``
    selects the hyper-parameters, and the refitted search predicts the
    outer test part. The pooled outer predictions give one confusion
    matrix and one accuracy per repetition; their mean and standard
    deviation over all repetitions are printed.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data (indexed with integer index arrays)
    y : array-like, shape = [n_samples]
        y training+test data
    model : estimator object
        This is assumed to implement the scikit-learn estimator interface.
    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored.
    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    n_inner = 3  # folds used for hyper-parameter selection
    n_outer = 3  # folds used for estimating the generalization error

    # One row per repetition: tp, fp, fn, tn, accuracy.
    scores = np.zeros((niter, 5))
    for rep in range(niter):
        # Seeded with an empty array so the pooled result has the same dtype
        # as repeated np.append calls on an empty array would give.
        pooled_pred = [np.array([])]
        pooled_true = [np.array([])]

        outer_cv = KFold(n_splits=n_outer, shuffle=True)
        for train_index, test_index in outer_cv.split(X):
            # Inner loop: choose hyper-parameters on the outer training part.
            inner_cv = KFold(n_splits=n_inner, shuffle=True)
            search = GridSearchCV(model, param_grid, cv=inner_cv)
            search.fit(X[train_index], y[train_index])

            # Outer loop: predict the part the search has never seen.
            pooled_pred.append(np.ravel(search.predict(X[test_index])))
            pooled_true.append(np.ravel(y[test_index]))

        ypreds = np.concatenate(pooled_pred)
        ytests = np.concatenate(pooled_true)

        # Unpacking into four values only works for a 2x2 confusion matrix.
        tn, fp, fn, tp = confusion_matrix(ytests, ypreds).ravel()
        scores[rep, :] = np.array(
            [tp, fp, fn, tn, accuracy_score(ytests, ypreds)])

    means = np.mean(scores, axis=0)
    stds = np.std(scores, axis=0)

    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    report_lines = (
        'TP DCV: {:.3f} (+/-{:.3f})',
        'FP DCV: {:.3f} (+/-{:.3f})',
        'FN DCV: {:.3f} (+/-{:.3f})',
        'TN DCV: {:.3f} (+/-{:.3f})',
        'Acc. DCV: {:.3f} (+/-{:.3f})',
    )
    for template, mean, std in zip(report_lines, means, stds):
        print(template.format(mean, std))
def dcv_rgr(X, y, model, param_grid, niter):
    """
    Double (nested) cross validation for a regressor.

    For each of ``niter`` repetitions the data is split into 3 shuffled
    outer folds; on every outer training part a 3-fold ``GridSearchCV``
    selects the hyper-parameters, and the refitted search predicts the
    outer test part. RMSE, MAE and R^2 are computed once per repetition
    from the pooled outer predictions; their mean and standard deviation
    over all repetitions are printed.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data (indexed with integer index arrays)
    y : array-like, shape = [n_samples]
        y training+test data
    model : machine learning model (scikit-learn)
    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored.
    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    n_inner = 3  # folds used for hyper-parameter selection
    n_outer = 3  # folds used for estimating the generalization error

    # One row per repetition: rmse, mae, r2.
    scores = np.zeros((niter, 3))
    for rep in range(niter):
        # Seeded with an empty array so the pooled result has the same dtype
        # as repeated np.append calls on an empty array would give.
        pooled_pred = [np.array([])]
        pooled_true = [np.array([])]

        outer_cv = KFold(n_splits=n_outer, shuffle=True)
        for train_index, test_index in outer_cv.split(X):
            # Inner loop: choose hyper-parameters on the outer training part.
            inner_cv = KFold(n_splits=n_inner, shuffle=True)
            search = GridSearchCV(model, param_grid, cv=inner_cv)
            search.fit(X[train_index], y[train_index])

            # Outer loop: predict the part the search has never seen.
            pooled_pred.append(np.ravel(search.predict(X[test_index])))
            pooled_true.append(np.ravel(y[test_index]))

        ypreds = np.concatenate(pooled_pred)
        ytests = np.concatenate(pooled_true)

        rmse = np.sqrt(mean_squared_error(ytests, ypreds))
        mae = mean_absolute_error(ytests, ypreds)
        r2 = r2_score(ytests, ypreds)
        scores[rep, :] = np.array([rmse, mae, r2])

    means = np.mean(scores, axis=0)
    stds = np.std(scores, axis=0)

    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('DCV:RMSE, MAE, R^2 = {:6.3f}, {:6.3f}, {:6.3f} (ave)'.format(
        *means[:3]))
    print('DCV:RMSE, MAE, R^2 = {:6.3f}, {:6.3f}, {:6.3f} (std)'.format(
        *stds[:3]))
# Load data iris = datasets.load_iris() features = iris.data target = iris.target # Create logistic regression logistic = linear_model.LogisticRegression() # Create range of 20 candidate values for C C = np.logspace(0, 4, 20) # Create hyperparameter options hyperparameters = dict(C=C) # Create grid search gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, n_jobs=-1, verbose=0) # Conduct nested cross-validation and outut the average score cross_val_score(gridsearch, features, target).mean() gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1) best_model = gridsearch.fit(features, target) scores = cross_val_score(gridsearch, features, target)
def main_clf(metric_,
             clf_,
             grid_,
             range_=(2, 7),
             cv_=5,
             verb_=False,
             graphs=False):
    """Tune ``clf_`` on every ``denue_wide_{k}.csv`` and evaluate the best one.

    For each ``k`` in ``range(*range_)`` the features are built from
    ``summary/Count/denue_wide_{k}.csv`` merged with
    ``rezago_social/rezago_social.csv`` (counts per 1000 inhabitants plus
    LAT/LON), a scaler+classifier pipeline is grid-searched on an 80 %
    stratified split, and the ``k`` with the highest best CV score is kept.
    The winning configuration is refitted, scored on train/test, and
    cross-validated on the full data. With ``graphs`` enabled, the 2016 and
    2017 snapshots are predicted, ``predictions.csv`` is written and the
    ROC / precision-recall / confusion-matrix / map figures are shown.

    Args:
        metric_: scoring passed to ``GridSearchCV`` and ``cross_val_score``.
        clf_: classifier instance; its parameters are overwritten in place
            with the best values found.
        grid_: parameter grid whose keys carry the ``clf__`` prefix.
        range_: arguments for ``range`` giving the ``k`` values to try.
        cv_: cross-validation setting used throughout.
        verb_: verbosity passed to ``GridSearchCV``.
        graphs: when true, produce the predictions file and all figures.

    Returns:
        The array of ``metric_`` cross-validation scores of the best pipeline.

    Raises:
        ValueError: if no candidate ``k`` produced a usable score.
    """
    pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf_)])

    # The social-lag table does not depend on k, so it is read once.
    # .copy() makes it an independent frame: prediction columns are added to
    # it further down.
    rezago_df = pd.read_csv("rezago_social/rezago_social.csv")
    rezago_social = rezago_df[[
        "lgc00_15cl3_2", "Key", "POB_TOTAL", "LAT", "LON"
    ]].copy()

    def _build_features(denue):
        """Return counts per 1000 inhabitants plus LAT/LON for one snapshot."""
        merged = pd.merge(rezago_social, denue, on=['Key'])
        merged.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"],
                    axis=1,
                    inplace=True)
        # Normalise by the population column of *this* merged frame.
        feats = merged.div(merged.POB_TOTAL, axis=0) * 1000
        feats.drop(["POB_TOTAL"], axis=1, inplace=True)
        # NOTE(review): LAT/LON (and the target below) are aligned with the
        # merged rows by index only; this presumably relies on the merge
        # keeping every row of rezago_social in order - confirm.
        feats["LAT"] = rezago_social["LAT"]
        feats["LON"] = rezago_social["LON"]
        return feats

    def _relabel_legend(ax):
        """Replace class numbers 1/2/3 by A/M/B in the legend of ``ax``."""
        handles, labels = ax.get_legend_handles_labels()
        labels = [
            lb.replace(' 1 ', ' A ').replace(' 2 ', ' M ').replace(' 3 ', ' B ')
            for lb in labels
        ]
        ax.legend(handles, labels)

    # Start from -inf so that scorers whose values are negative (for example
    # the neg_* family) can still win the comparison.
    max_scoring = -np.inf
    best_params = None
    for k in range(*range_):
        denue_wide = pd.read_csv(f"summary/Count/denue_wide_{k}.csv")
        X = _build_features(denue_wide)
        y = rezago_social['lgc00_15cl3_2']
        print(f'# CLF {k} {X.shape}')
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            test_size=0.20,
                                                            random_state=0)
        clf_cv = GridSearchCV(pipe,
                              grid_,
                              cv=cv_,
                              scoring=metric_,
                              verbose=verb_)
        clf_cv.fit(X_train, y_train)
        if clf_cv.best_score_ > max_scoring:
            max_scoring = clf_cv.best_score_
            print(f"\t # {k} CLF {clf_cv.best_score_} {clf_cv.best_params_}")
            best_params = clf_cv.best_params_
            best_k = k
            Xtrain, ytrain = X_train, y_train
            Xtest, ytest = X_test, y_test
            X_, y_ = X, y

    if best_params is None:
        raise ValueError(
            "no candidate in range_ produced a usable cross-validation score")

    # Strip the 5-character 'clf__' step prefix so the values can be set
    # directly on the classifier.
    best_params_ = {name[5:]: v for name, v in best_params.items()}
    best_clf = clf_.set_params(**best_params_)
    best_pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', best_clf)])
    print('#BEST', best_pipe, max_scoring)
    best_pipe.fit(Xtrain, ytrain)
    print(f"# {best_k}: Train:{best_pipe.score(Xtrain, ytrain) * 100}")
    print(f"# {best_k}: Test:{best_pipe.score(Xtest, ytest) * 100}")

    scores = cross_val_score(best_pipe,
                             X_,
                             y_,
                             cv=cv_,
                             n_jobs=-1,
                             scoring='accuracy')
    print(f"# {best_k}: Accuracy CV5:{np.mean(scores)} +/- {np.std(scores)}")
    scores_ = cross_val_score(best_pipe,
                              X_,
                              y_,
                              cv=cv_,
                              n_jobs=-1,
                              scoring=metric_)
    print(
        f"# {best_k}: {metric_} CV5:{np.mean(scores_)} +/- {np.std(scores_)}")

    y_pred = cross_val_predict(best_pipe, X_, y_, cv=cv_)
    print(classification_report(y_, y_pred, digits=3))
    print(np.unique(np.array(y_pred), return_counts=True))

    # NOTE(review): the source had lost its indentation; everything from
    # here to the final return is assumed to belong to `if graphs:`.
    if graphs:
        # ROC and precision-recall curves from cross-validated probabilities.
        probas = cross_val_predict(best_pipe,
                                   X_,
                                   y_,
                                   cv=cv_,
                                   method='predict_proba')
        fig, (ax1, ax2) = plt.subplots(1, 2)
        skplt.metrics.plot_roc(y_, probas, ax=ax1, title='')
        _relabel_legend(ax1)
        ax1.set_xlabel('TFP\n(A)')
        skplt.metrics.plot_precision_recall(y_, probas, ax=ax2, title='')
        _relabel_legend(ax2)
        ax2.set_xlabel('S\n(B)')
        plt.show()

        # Predictions for the 2016 and 2017 snapshots. Later snapshots
        # (summary/201811, summary/201911, summary/202011) can be scored
        # the same way.
        X_2016 = _build_features(
            pd.read_csv(f"summary/201610/denue_wide_{best_k}.csv"))
        print(X_2016.columns)
        y_pred_2016 = best_pipe.predict(X_2016)

        X_2017 = _build_features(
            pd.read_csv(f"summary/201711/denue_wide_{best_k}.csv"))
        y_pred_2017 = best_pipe.predict(X_2017)

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_,
                                            y_pred,
                                            normalize=True,
                                            title=" ")
        plt.xticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.yticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.xlabel('Clases predichas')
        plt.ylabel('Clases verdaderas')
        plt.show()

        # Map data: store the predictions and join them to the shapes.
        rezago_social['Pred'] = y_pred
        rezago_social['Pred_2016'] = y_pred_2016
        rezago_social['Pred_2017'] = y_pred_2017
        rezago_social.to_csv('predictions.csv')
        rezago_social['Key_'] = rezago_social['Key'].astype(str).str.zfill(5)
        gdf = gpd.read_file('municipios/areas_geoestadisticas_municipales.shp')
        gdf['Key_'] = gdf['CVE_ENT'] + gdf['CVE_MUN']
        gdf = gdf.merge(rezago_social, on='Key_')

        legend_elements = [
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='B',
                   markerfacecolor='g',
                   markersize=10),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='M',
                   markerfacecolor='yellow',
                   markersize=10),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='A',
                   markerfacecolor='r',
                   markersize=10)
        ]
        csfont = {'fontname': 'Times New Roman'}
        font = font_manager.FontProperties(family='Times New Roman',
                                           weight='normal',
                                           style='normal',
                                           size=12)
        class_colors = {3: 'green', 2: 'yellow', 1: 'red'}

        def _plot_map(ax, column, panel, **plot_kwargs):
            """Draw one map panel coloured by the classes in ``column``."""
            gdf.plot(ax=ax, color=gdf[column].map(class_colors), **plot_kwargs)
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_xlabel(panel, **csfont)
            ax.legend(handles=legend_elements, prop=font)

        # Maps: predicted classes for 2016 (A) and 2017 (B).
        fig, (ax1, ax2) = plt.subplots(1, 2)
        _plot_map(ax1, 'Pred_2016', "(A)")
        _plot_map(ax2, 'Pred_2017', "(B)")
        plt.show()

        # Maps: reference classes (A) against cross-validated predictions (B).
        fig, (ax1, ax2) = plt.subplots(1, 2)
        _plot_map(ax1, 'lgc00_15cl3_2', "(A)", legend=True)
        _plot_map(ax2, 'Pred', "(B)")
        plt.show()

        # Per-class and macro-averaged ROC curves.
        y_bin = label_binarize(y_, classes=[1, 2, 3])
        n_classes = y_bin.shape[1]
        y_score = cross_val_predict(best_pipe,
                                    X_,
                                    y_,
                                    cv=cv_,
                                    method='predict_proba')
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Macro average: interpolate every curve on the union of all false
        # positive rates and average the true positive rates.
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        plt.figure()
        plt.plot(fpr["macro"],
                 tpr["macro"],
                 label='ROC macro (AUC = {0:0.3f})'
                 ''.format(roc_auc["macro"]),
                 color='navy',
                 linestyle=':',
                 linewidth=4)
        # NOTE(review): here class 1 is named 'B' and drawn green, whereas
        # the legends and maps above treat class 1 as 'A'/red - kept as in
        # the original; confirm which naming is intended.
        class_names = {1: 'B', 2: 'M', 3: 'A'}
        roc_colors = cycle(['green', 'yellow', 'red'])
        for i, color in zip(range(n_classes), roc_colors):
            plt.plot(fpr[i],
                     tpr[i],
                     color=color,
                     lw=2,
                     label='Clase de rezago {0} (AUC = {1:0.3f})'
                     ''.format(class_names[i + 1], roc_auc[i]))
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('TFP', fontsize=12, **csfont)
        plt.ylabel('TVP', fontsize=12, **csfont)
        plt.legend(loc="lower right", prop=font)
        plt.show()

    return scores_
def _build_sequences(samples, sentences, admits, keys):
    """Convert sampled records into parallel per-sample model inputs.

    For every record ``t`` in ``samples`` the sentences of subject ``t[0]``
    whose admission time (looked up in ``admits`` by ``HADM_ID``) is not
    later than ``t[1]`` are collected, ordered by their time entry and
    flattened.

    Args:
        samples: records indexed as ``t[0]`` subject, ``t[1]`` cut-off time,
            ``t[3]`` label.
        sentences: records indexed as ``s[0]`` subject, ``s[1]`` admission
            id, ``s[2]`` words, ``s[3]`` times.
        admits: admissions DataFrame with ``HADM_ID`` and ``ADMITTIME``.
        keys: vocabulary list; a word is encoded as its position in it.

    Returns:
        Tuple ``(X, V, T, Y)`` of lists with one entry per sample: word
        indexes, raw words, timestamps relative to the first timestamp, and
        the label.
    """
    X, V, T, Y = [], [], [], []
    for count, t in enumerate(samples):
        print(count)
        corpus = [
            [s[2], s[3]] for s in sentences
            if (s[0] == t[0]) and (pd.to_datetime(
                admits[admits['HADM_ID'] == s[1]].ADMITTIME.values[0]) <= t[1])
        ]
        # order the subject's sentences (admissions) by time of entry
        corpus = sorted(corpus, key=lambda x: x[1])
        # transpose so that corpus[0] holds the words and corpus[1] the times
        corpus = list(map(list, zip(*corpus)))
        words = list(chain.from_iterable(corpus[0]))
        t_stamps = list(chain.from_iterable(corpus[1]))
        # express every timestamp as time elapsed since the first entry
        first = t_stamps[0]
        t_stamps = [ii - first for ii in t_stamps]

        X.append(np.array([keys.index(w) for w in words]))
        V.append(np.array(words))
        T.append(np.array(t_stamps))
        Y.append(t[3])
    return X, V, T, Y


def _auc_grid_search(estimator, param_grid):
    """Wrap *estimator* in a 5-fold, ROC-AUC scored GridSearchCV on all cores."""
    return GridSearchCV(estimator=estimator, param_grid=param_grid,
                        scoring='roc_auc', cv=5, n_jobs=-1)


def _report_cv_results(result):
    """Print mean and std test score of every parameter combination tried."""
    means = result.cv_results_['mean_test_score']
    stds = result.cv_results_['std_test_score']
    params = result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        # print the combination of this row, not the whole list of them
        print("%f (%f) with: %r" % (mean, stdev, param))


def _binary_scores(y_true, y_pred):
    """Return accuracy, F1 and AUC as a list of ``(label, value)`` pairs."""
    return [("Accuracy", accuracy_score(y_true, y_pred)),
            ("F1 Score", f1_score(y_true, y_pred)),
            ("AUC Score", roc_auc_score(y_true, y_pred))]


def modeling(conn, sentences, lib, dz):
    """Grid-search and evaluate CNN/LSTM, time-decay and classical models.

    Five sessions are run. In each one, for every entry of ``dz`` as many
    items are sampled from ``d[1]`` as there are in ``d[0]``, the union is
    shuffled and split 80/20, model inputs are built from ``sentences``,
    several grid searches are fitted and reported, and finally a CNN and an
    LSTM are trained and scored on the test split.

    Args:
        conn: database connection handed to ``pd.read_sql`` to load the
            ``admissions`` table.
        sentences: records indexed as ``s[0]`` subject, ``s[1]`` admission
            id, ``s[2]`` words, ``s[3]`` times.
        lib: sequence whose items carry the vocabulary key at index 1.
        dz: iterable of pairs; ``d[1]`` is the pool negatives are sampled
            from (``d[0]`` is presumably the positive records - confirm).

    Returns:
        A list with one dict per session:
        ``{"lstm": [(metric, value), ...], "cnn": [(metric, value), ...]}``.
    """
    np.random.seed(7)
    # sampling and shuffling below use the stdlib generator, so seed it too
    random.seed(7)
    data = []; train = []; test = []
    keys = [k[1] for k in lib]
    admits = pd.read_sql("SELECT * from admissions", conn)

    # NOTE(review): the original indentation of this function was lost; the
    # whole pipeline is assumed to run once per session - confirm. As in the
    # original, train/test are not reset, so they grow from session to session.
    for itr in range(0, 5):
        print("Sess: {0}".format(itr))
        for d in dz:
            neg = random.sample(d[1], len(d[0]))
            temp = d[0] + neg
            random.shuffle(temp)
            t1, t2 = cross_validation.train_test_split(temp, test_size=.2)
            train += t1; test += t2

        # X: raw indexes of feature input; V: raw feature input;
        # t: timestamps relative to the first event; Y: labels
        X_train, V_train, t_train, Y_train = _build_sequences(
            train, sentences, admits, keys)
        print("X_train made.")
        X_test, _V_test, _t_test, Y_test = _build_sequences(
            test, sentences, admits, keys)

        # training normal LSTM and CNN-LSTM
        # (single-element lists so they can be used directly in the grids)
        top_words = [9444]
        max_review_length = [1000]
        embedding_length = [300]
        X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0])
        X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0])

        # build models using KerasClassifier and grid search
        cnn = KerasClassifier(build_fn=cnn_train, verbose=1)
        lstm = KerasClassifier(build_fn=lstm_train, verbose=1)
        d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose=1)
        d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose=1)

        # define the grid search parameters
        batch_size = [32, 64, 128]
        epochs = [20, 50, 100, 200]
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = (10.0**np.arange(-4, -1)).tolist()
        momentum = np.arange(.5, .9, .1).tolist()
        neurons = [50, 100, 200]
        dropout_W = [.1, .2, .5]
        dropout_U = [.1, .2, .5]
        W_regularizer = [l1(.0001), l1(.001), l1(.01),
                         l2(.0001), l2(.001), l2(.01), None]
        U_regularizer = [l1(.0001), l1(.001), l1(.01),
                         l2(.0001), l2(.001), l2(.01), None]
        init_mode = ['uniform', 'normal', 'zero']

        shared_grid = dict(batch_size=batch_size, nb_epoch=epochs,
                           optimizer=optimizer, learn_rate=learn_rate,
                           momentum=momentum, neurons=neurons,
                           dropout_W=dropout_W, dropout_U=dropout_U,
                           W_regularizer=W_regularizer,
                           U_regularizer=U_regularizer, init_mode=init_mode)
        param_grid = dict(top_words=top_words, max_length=max_review_length,
                          embedding_length=embedding_length, **shared_grid)
        d_param_grid = dict(
            input_shape=[(max_review_length[0], embedding_length[0])],
            **shared_grid)
        lr_params = {'C': (10.0**np.arange(-4, 4)).tolist(),
                     'penalty': ('l1', 'l2')}
        sv_params = {'C': (10.0**np.arange(-4, 4)).tolist(),
                     'kernel': ('linear', 'poly', 'rbf', 'sigmoid')}
        rf_params = {'criterion': ['gini', 'entropy']}

        # set up grid searches with cross validation
        cnn_grid = _auc_grid_search(cnn, param_grid)
        lstm_grid = _auc_grid_search(lstm, param_grid)
        d_cnn_grid = _auc_grid_search(d_cnn, d_param_grid)
        d_lstm_grid = _auc_grid_search(d_lstm, d_param_grid)
        # GridSearchCV tunes exactly one estimator, so each classical model
        # gets its own search and the best of them is reported.
        # NOTE(review): LR, SVM and RF are defined outside this function;
        # both estimator classes and ready-made instances are accepted here.
        classic_grids = [
            _auc_grid_search(est() if isinstance(est, type) else est, grid)
            for est, grid in zip((LR, SVM, RF),
                                 (lr_params, sv_params, rf_params))
        ]

        # fit the models
        cnn_result = cnn_grid.fit(X_train, Y_train)
        lstm_result = lstm_grid.fit(X_train, Y_train)
        decay_kwargs = dict(t_stamps=t_train,
                            embedding_length=embedding_length[0],
                            max_review_length=max_review_length[0])
        # both decay networks consume the same input: compute it once
        decayed_seq = decay(x=np.array(V_train), **decay_kwargs)[0]
        d_cnn_result = d_cnn_grid.fit(decayed_seq, Y_train)
        d_lstm_result = d_lstm_grid.fit(decayed_seq, Y_train)
        classic_features = decay(x=V_train, **decay_kwargs)[1]
        classics_result = max(
            (grid.fit(classic_features, Y_train) for grid in classic_grids),
            key=lambda result: result.best_score_)

        # grid search results
        print("CNN Best: %f using %s"
              % (cnn_result.best_score_, cnn_result.best_params_))
        _report_cv_results(cnn_result)
        print("LSTM Best: %f using %s"
              % (lstm_result.best_score_, lstm_result.best_params_))
        _report_cv_results(lstm_result)
        print("Decay CNN Best: %f using %s"
              % (d_cnn_result.best_score_, d_cnn_result.best_params_))
        _report_cv_results(d_cnn_result)
        print("Decay LSTM Best: %f using %s"
              % (d_lstm_result.best_score_, d_lstm_result.best_params_))
        _report_cv_results(d_lstm_result)
        print("Best of Classics: %f using %s, %s"
              % (classics_result.best_score_, classics_result.best_estimator_,
                 classics_result.best_params_))
        _report_cv_results(classics_result)

        ###### TESTING #######
        # the builders take scalars, so unwrap the single-element grid lists
        cnn = cnn_train(top_words=top_words[0],
                        max_length=max_review_length[0],
                        embedding_length=embedding_length[0])
        lstm = lstm_train(top_words=top_words[0],
                          max_length=max_review_length[0],
                          embedding_length=embedding_length[0])

        cnn.fit(X_train, Y_train, validation_split=.2, nb_epoch=100,
                batch_size=128, shuffle=True, verbose=1)
        lstm.fit(X_train, Y_train, validation_split=.2, nb_epoch=100,
                 batch_size=128, shuffle=True, verbose=1)

        predictions_lstm = lstm.predict_classes(X_test)
        predictions_cnn = cnn.predict_classes(X_test)
        scores_lstm = _binary_scores(Y_test, predictions_lstm)
        scores_cnn = _binary_scores(Y_test, predictions_cnn)

        print("LSTM DATA: ")
        for s in scores_lstm:
            print("%s: %.2f" % (s[0], s[1]), end=" ")
        print("")
        print("CNN DATA: ")
        for s in scores_cnn:
            print("%s: %.2f" % (s[0], s[1]), end=" ")

        # keep the scores of this session
        data.append({"lstm": scores_lstm, "cnn": scores_cnn})
    return data
x=PTrain_ad.iloc[:,1:] y=PTrain_ad.iloc[:,1] x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.25,random_state=0) #%% Brenchmark model para_lo=[{'penalty':['l1','l2'], 'C':np.logspace(-1,1,10), 'solver':['liblinear'], 'multi_class':['ovr']}, {'penalty':['l2'], 'C':np.logspace(-1,1,20), 'solver':['lbfgs'], 'multi_class':['ovr','multinomial']}] logcv=GridSearchCV(LogisticRegression(),para_lo,cv=10,scoring='roc_auc') log=logcv.fit(x_train,y_train) yyy=log.predict(x_val) log.coef_ print("Number of defaults in test set: {0}".format(sum(y_val))) print("Number of defaults in train set: {0}".format(sum(y_pred))) print(accuracy_score(y_test,yyy)) print(confusion_matrix(y_test,yyy)) print(classification_report(y_test,yyy,digits=3)) print(clf.best_estimator_) #%% RandomForest para = [{'n_estimators':[110,120], '':['entropy','gini'], #'max_depth':[12,18,24], 'min_samples_split':[40], #'min_weight_fraction_leaf':[0.1,0.3,0.5],
plt.title('Average score: {} and Std score : {}'.format(np.mean(cv_scores),
                                                        np.std(cv_scores)))

# In[4]:

# Hyper-parameter tuning: search the number of estimators and the learning
# rate of an AdaBoost ensemble built on top of `classifier`.
N_E = 200
N_LR = 5
ADB = AdaBoostClassifier(base_estimator=classifier)
parameter_grid = dict(
    n_estimators=np.arange(1, N_E + 20, 20),
    learning_rate=np.linspace(0.1, 2, N_LR),
)
cross_validation = StratifiedKFold(arr_out, n_folds=3)
grid_search = GridSearchCV(ADB,
                           param_grid=parameter_grid,
                           cv=cross_validation)
grid_search.fit(arr_in, arr_out)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# In[5]:

# Arrange the mean test scores as a 2-D array with one row per learning rate
# and one column per n_estimators value, ready for plotting.
learning_rate = np.linspace(0.1, 2, N_LR)
n_estimators = np.arange(1, N_E + 20, 20)
plt.figure()
grid_visualization = np.array([grid_search.cv_results_['mean_test_score']])
grid_visualization.shape = (len(learning_rate), len(n_estimators))
from sklearn.model_selection import GridSearchCV

# Candidate values for every tuned XGBoost hyper-parameter.
params = dict(
    gamma=[0.1, 1],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
    min_child_weight=[1, 100],
    n_estimators=[100, 200],
    subsample=[1.0],
    colsample_bytree=[1.0],
)

# Exhaustive search scored by ROC AUC, using every available core.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(objective='binary:logistic',
                            nthread=4,
                            random_state=seed,
                            seed=seed),
    param_grid=params,
    scoring='roc_auc',
    n_jobs=-1,
)
gsearch1.fit(X_train, y_train)

print('tuned XGBClassifier')
print(gsearch1)
print('=================================================')
print('=================================================')

# Gini coefficient on the training set, derived from the ROC AUC of the
# predicted probability of the second class column.
fpr, tpr, thresholds = metrics.roc_curve(
    y_train,
    gsearch1.predict_proba(X_train)[:, 1],
)
print('gini_train', 2 * metrics.auc(fpr, tpr) - 1)
# Fit a first, hand-configured regression tree on the training data
X = train_df_new
y = y_train.total_count.values.reshape(-1, 1)
dtr = DecisionTreeRegressor(max_depth=4,
                            min_samples_split=5,
                            max_leaf_nodes=10)
dtr.fit(X, y)

# Export the fitted tree through graphviz and store it as a PDF
dot_data = tree.export_graphviz(dtr, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("bike_share.pdf")

# Exhaustive search over the tree hyper-parameters with 5-fold CV
param_grid = dict(
    criterion=["mse", "mae"],
    min_samples_split=[10, 20, 40],
    max_depth=[2, 6, 8],
    min_samples_leaf=[20, 40, 100],
    max_leaf_nodes=[5, 20, 100, 500, 800],
)
grid_cv_dtr = GridSearchCV(dtr, param_grid, cv=5)
grid_cv_dtr.fit(X, y)

# Plot the mean test score against depth, one line per leaf-node limit
df = pd.DataFrame(data=grid_cv_dtr.cv_results_)
fig, ax = plt.subplots()
sn.pointplot(
    data=df[['mean_test_score', 'param_max_leaf_nodes', 'param_max_depth']],
    y='mean_test_score',
    x='param_max_depth',
    hue='param_max_leaf_nodes',
    ax=ax,
)
ax.set(title="Effect of Depth and Leaf Nodes on Model Performance")
fig.savefig("cross_validation_best_model.png")

# Predictions of the best tree on the training data (for the residual plot)
predicted = grid_cv_dtr.best_estimator_.predict(X)
# Standardisation: fit the scaler on the training split only and apply the
# same transformation to the test split
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SVM model configuration
svm_model = svm.SVC(kernel='rbf')

# C in {0.02, 0.2, 2, 200} and gamma in {0.02, 0.2, 2, 200}
Cs = 2 * np.logspace(-2, 0, num=3, base=10)
Cs = np.append(Cs, 200)
Gs = Cs

# Grid search over C and gamma using 5-fold cross-validation
optimo = GridSearchCV(estimator=svm_model,
                      param_grid=dict(C=Cs, gamma=Gs),
                      n_jobs=-1,
                      cv=5)

# Train the optimal model
optimo.fit(X_train, y_train)

# Configuration of the optimal model.
# print() with a single parenthesised argument behaves the same on Python 2
# and Python 3, whereas the former print statement is a syntax error on 3.
print(optimo.best_params_)

# Test score of the optimal model, as a percentage
print(optimo.score(X_test, y_test) * 100)

# Plot the points
plt.figure(1)
plt.clf()
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired)
# Vectorise the test lyrics with the vectoriser fitted on the training set
corpusts = testdf['lyrics_string']
vectorizerts = TfidfVectorizer(stop_words='english')
tfidfts = vectorizertr.transform(corpusts)

# Training predictors/targets and test predictors
predictors_tr, targets_tr = tfidftr, traindf['genre']
predictors_ts = tfidfts

# Model and grid to search. Alternatives that were tried earlier: LinearSVC,
# KNeighborsClassifier, DecisionTreeClassifier and RandomForestClassifier;
# a neural network took too long.
parameters = dict(C=[1, 10])
clf = LogisticRegression()

# Tune on the training data, then label the test set
classifier = GridSearchCV(clf, parameters).fit(predictors_tr, targets_tr)
predictions = classifier.predict(predictors_ts)
testdf['genre'] = predictions

# Write the submission file
submission_columns = ['id', 'lyrics_clean_string', 'genre']
testdf[submission_columns].to_csv("submission.csv")
# Principal component analysis: fit on the training images and keep the
# components reshaped to image size
pca = PCA(n_components=n_components, whiten=True)
pca.fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))

print("根据主成分进行降维开始")
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)
print("降维结束")

###############################################################################
# Train the SVM
print("训练SVM分类模型开始")
t0 = time()
# Search grid: 5 values of C x 6 values of gamma = 30 combinations
param_grid = dict(
    C=[1e3, 5e3, 1e4, 5e4, 1e5],
    gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
)
# RBF kernel for the images, class weights balanced automatically
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("SVM训练结束,结果如下:" "SVM训练用时 %0.3fs" % (time() - t0))
print(clf.best_estimator_)

###############################################################################
# Evaluate on the test set
print("测试集SVM分类模型开始")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("测试集用时 %0.3fs" % (time() - t0))

print("误差衡量")
# With a = number of actual 1s, b = number of predicted 1s and
# c = number of correctly predicted 1s:
#   precision = c / b
#   recall    = c / a
def __init__(self, name, construct, skip_methods=(), fit_args=make_classification()): self.name = name self.construct = construct self.fit_args = fit_args self.skip_methods = skip_methods DELEGATING_METAESTIMATORS = [ DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])), DelegatorData( 'GridSearchCV', lambda est: GridSearchCV(est, param_grid={'param': [5]}, cv=2), skip_methods=['score']), DelegatorData('RandomizedSearchCV', lambda est: RandomizedSearchCV( est, param_distributions={'param': [5]}, cv=2, n_iter=1), skip_methods=['score']), DelegatorData('RFE', RFE, skip_methods=['transform', 'inverse_transform', 'score']), DelegatorData('RFECV', RFECV, skip_methods=['transform', 'inverse_transform', 'score']), DelegatorData('BaggingClassifier', BaggingClassifier, skip_methods=[ 'transform', 'inverse_transform', 'score',
def knncls():
    """
    Predict user check-in locations with k-nearest neighbours.

    Reads ``./data/FBlocation/train.csv``, keeps a small x/y window, derives
    day/hour/weekday features from the ``time`` column, drops places with
    three or fewer check-ins, standardises the features and runs a 2-fold
    grid search over ``n_neighbors``, printing the scores and search results.

    :return: None
    """
    # Load the data
    data = pd.read_csv("./data/FBlocation/train.csv")

    print(data.head(10))

    # Pre-processing
    # 1. Shrink the data set to a small spatial window. The explicit copy
    #    gives an independent frame, so the column assignments below do not
    #    act on a filtered selection (pandas SettingWithCopyWarning).
    data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75").copy()

    # Convert the timestamp (seconds) to datetimes
    time_value = pd.to_datetime(data['time'], unit='s')

    print(time_value)

    # Wrap in a DatetimeIndex to get at the calendar fields
    time_value = pd.DatetimeIndex(time_value)

    # Derive calendar features
    data['day'] = time_value.day
    data['hour'] = time_value.hour
    data['weekday'] = time_value.weekday

    # Drop the raw timestamp feature
    data = data.drop(['time'], axis=1)

    print(data)

    # Remove target places that have 3 or fewer check-ins
    place_count = data.groupby('place_id').count()
    tf = place_count[place_count.row_id > 3].reset_index()
    data = data[data['place_id'].isin(tf.place_id)]

    # Separate target and features
    y = data['place_id']
    x = data.drop(['place_id'], axis=1)

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering (standardisation): fit on the training set only
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)

    # The estimator is handed to the grid search unfitted: GridSearchCV
    # clones it for every candidate, so a separate fit/predict beforehand
    # would only repeat work whose result is never used.
    knn = KNeighborsClassifier()

    # Candidate hyper-parameter values
    param = {"n_neighbors": [3, 5, 10]}

    # Run the grid search
    gc = GridSearchCV(knn, param_grid=param, cv=2)
    gc.fit(x_train, y_train)

    # Report accuracy and search results
    print("在测试集上准确率:", gc.score(x_test, y_test))
    print("在交叉验证当中最好的结果:", gc.best_score_)
    print("选择最好的模型是:", gc.best_estimator_)
    print("每个超参数每次交叉验证的结果:", gc.cv_results_)

    return None
print('=> calculating mean and covariance') mean, cov = fit_norm_distribution_param(args, model, train_dataset, channel_idx=channel_idx) ''' 2. Train anomaly score predictor using support vector regression (SVR). (Optional) ''' # An anomaly score predictor is trained # given hidden layer output and the corresponding anomaly score on train dataset. # Predicted anomaly scores on test dataset can be used for the baseline of the adaptive threshold. if args.compensate: print('=> training an SVR as anomaly score predictor') train_score, _, _, hiddens, _ = anomalyScore( args, model, train_dataset, mean, cov, channel_idx=channel_idx) score_predictor = GridSearchCV(SVR(), cv=5, param_grid={ "C": [1e0, 1e1, 1e2], "gamma": np.logspace(-1, 1, 3) }) score_predictor.fit( torch.cat(hiddens, dim=0).numpy(), train_score.cpu().numpy()) else: score_predictor = None ''' 3. Calculate anomaly scores''' # Anomaly scores are calculated on the test dataset # given the mean and the covariance calculated on the train dataset print('=> calculating anomaly scores') score, sorted_prediction, sorted_error, _, predicted_score = anomalyScore( args, model, test_dataset,
n_folds = 6

# Parameter combinations to evaluate
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    gamma=[0.004, 0.001, 0.01, 0.1],
    kernel=['rbf', 'linear', 'poly'],
)

# Combinations are compared by plain accuracy
acc_scorer = make_scorer(accuracy_score)

# Run the grid search on the validation data and time it
start_time = dt.datetime.now()
print('Start grid search at {}'.format(start_time))
grid_search = GridSearchCV(classifier,
                           param_grid,
                           cv=n_folds,
                           scoring=acc_scorer,
                           n_jobs=4)
grid_obj = grid_search.fit(X_val, y_val)

# Full table of search results
print(grid_obj.cv_results_)

# Keep the best estimator found by the search
clf = grid_obj.best_estimator_
print(clf)

end_time = dt.datetime.now()
print('Stop grid search {}'.format(end_time))
elapsed_time = end_time - start_time
print('Elapsed grid search time {}'.format(elapsed_time))

# Restart the clock for fitting the best algorithm to the training data
start_time = dt.datetime.now()
model.add(Dropout(0.2)) model.add( Dense(units=16, activation='relu', kernel_initializer='random_uniform')) model.add(Dense(units=1, activation='sigmoid')) model.compile(optimizer=optimizer, loss=loss, metrics=['binary_accuracy']) return model classifier = KerasClassifier(build_fn=criarRede) params = { 'batch_size': [10, 30], 'epochs': [50, 100], 'optimizer': ['adam', 'sgd'], 'loss': ['binary_crossentropy', 'hinge'], 'kernel_initializer': ['random_uniform', 'normal'], 'activation': ['relu', 'tanh'], 'neurons': [16, 8] } grid_search = GridSearchCV(estimator=classifier, param_grid=params, scoring='accuracy', cv=5) grid_search = grid_search.fit(data_x, data_y) best_params = grid_search.best_params_ best_precision = grid_search.best_score_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3, random_state=101) # Using gridsearch to find the best regularizattion parameter for the linear svc clf = svm.SVC() parameters = [{ 'kernel': ['poly'], 'C': [0.5, 1, 10, 5, 7, 8, 9], 'gamma': [0.5, 1, 10, 'auto', 'scale'], 'degree': [1, 2, 3] }] grid_search = GridSearchCV(estimator=clf, param_grid=parameters, cv=5, n_jobs=-1) grid_search = grid_search.fit(X_train, y_train) best_score = grid_search.best_score_ best_parameters = grid_search.best_params_ """ clf = svm.SVC(kernel = 'linear', C = 0.8) clf.fit(X_train, y_train) #computing the decision boundary x1, x2, xx, yy = computeMesh(X_train[:,0], X_train[:,1], 0.02) xy_mesh = np.c_[x1, x2] # Turn to Nx2 matrix clzmesh = clf.predict(xy_mesh) clzmesh = clzmesh.reshape(xx.shape) fig, ax = plt.subplots()