# Grid Search
Example #1
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

pipeline = Pipeline([('clf', RandomForestClassifier(criterion='gini'))])

parameters = {
    'clf__n_estimators': (1000, 2000, 3000),
    'clf__max_depth': (100, 200, 300),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2)
}
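# The parameter names above follow the Pipeline convention <step>__<parameter>,
# so 'clf__n_estimators' targets the n_estimators argument of the 'clf' step.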

grid_search = GridSearchCV(pipeline,
                           parameters,
                           n_jobs=-1,
                           cv=5,
                           verbose=1,
                           scoring='accuracy')
grid_search.fit(x_train, y_train)

print('Best Training score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

predictions = grid_search.predict(x_test)

print("Testing accuracy:", round(accuracy_score(y_test, predictions), 4))
print("\nComplete report of Testing data\n",
      classification_report(y_test, predictions))
Example #2
X_train, X_test, y_train, y_test = train_test_split(select_X,
                                                    y1,
                                                    test_size=0.2,
                                                    random_state=0)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

#cross validation
param_dist = {'n_neighbors': range(1, 30), 'weights': ["uniform", "distance"]}

cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
grid = GridSearchCV(kNN(), param_grid=param_dist, cv=cv)  # kNN: presumably an alias for KNeighborsClassifier
grid.fit(X_train, y_train.values.ravel())

best_estimator = grid.best_estimator_
print(best_estimator)

# after cross validation we obtain the best_estimator.
clf = best_estimator

print('the accuracy for all is:')
print(clf.score(X_test, y_test.values.ravel()))

prediction = clf.predict(X_test)
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, prediction))

print("Classification report:\n %s\n" %
                  optimizer='adadelta',
                  metrics=['accuracy'])

    return model


dense_size_candidates = [[32], [64], [32, 32], [64, 64]]
my_classifier = KerasClassifier(make_model, batch_size=32)

validator = GridSearchCV(
    my_classifier,
    param_grid={
        'dense_layer_sizes': dense_size_candidates,
        # nb_epoch is available for tuning even when it is not
        # an argument to the model-building function
        'nb_epoch': [3, 6],
        'nb_filters': [8],
        'nb_conv': [3],
        'nb_pool': [2]
    },
    scoring='neg_log_loss',
    n_jobs=1)
validator.fit(X_train, y_train)

print('The parameters of the best model are: ')
print(validator.best_params_)

# validator.best_estimator_ returns sklearn-wrapped version of best model.
# validator.best_estimator_.model returns the (unwrapped) keras model
best_model = validator.best_estimator_.model
metric_names = best_model.metrics_names
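
# A hedged evaluation sketch (added; not in the original snippet): score the
# unwrapped Keras model on held-out data, assuming X_test and y_test exist as
# elsewhere in this example.
for name, value in zip(metric_names, best_model.evaluate(X_test, y_test)):
    print(name, ':', value)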
Example #4
    axes[idx].set_title(label)
# Axis labels can't be added this way: plt would attach them to the currently
# active axes (i.e. the last subplot).
#plt.xlabel("Sepal Width [std]")
#plt.ylabel("Petal length [std]")
plt.text(-3.5, -4.5, s="Sepal Width [std]", ha="center", va="center", fontsize=12)
plt.text(-10.5, 4.5, s="Petal Length [std]", ha="center", va="center", fontsize=12, rotation=90)
plt.show()

# mv_clf.get_params()
# lists every parameter name of the current estimator, which helps when setting up a GridSearch


# In[4]:

from sklearn.model_selection import GridSearchCV
params = {'decisiontreeclassifier__max_depth': [1, 2], 
          'pipeline-1__clf__C': [0.001, 0.1, 100]
         }
grid = GridSearchCV(cv=5, 
                    estimator=mv_clf, 
                    n_jobs=1, 
                    scoring="roc_auc", 
                    param_grid=params)
grid.fit(X_train, y_train)

import pandas as pd
gridSearchResult = pd.DataFrame(grid.cv_results_)
gridSearchResult[["mean_test_score", "params"]].head(5)
print("best score: %0.3f ; best params: %s" % (grid.best_score_, grid.best_params_))

Example #5
					descriptor__k=[10],
					classify__kernel=["rbf"],
					classify__gamma= [.002],
					classify__C=[1])
'''

params = dict(descriptor__descType=["SpatialPyramids"],
              descriptor__numFeatures=[512],
              descriptor__k=[600],
              classify__kernel=[CodeBook.pyramidMatchKernel],
              classify__gamma=[0.0001, 0.01, 10],
              classify__C=[1])

# Cross-validate
start = time.time()
grid = GridSearchCV(pipe, cv=6, n_jobs=1, param_grid=params)
grid.fit(train_images_filenames, train_labels)
end = time.time()

# save results in a file
saveXVal(grid)

# print results
print(grid.best_params_)

print("All done in ", str(end - start), " seconds.")

print("Best parameters set found on development set:")
print()
print(grid.best_params_)
print()
    "lambda_": [1e-5, 1e-4, 1e-3],
}

X_train = data_tr
y_train = target_tr
X_test = data_ts
y_test = target_ts

# initialise model
n_int_fold = 10  # number of folds
model = ValentiMLP(**default_params, n_batch=int(len(data_tr) / n_int_fold))

# grid search
grid_search = GridSearchCV(model,
                           cv=n_int_fold,
                           n_jobs=-1,
                           param_grid=param_grid,
                           verbose=2,
                           return_train_score=True)
grid_search.fit(X_train, y_train)

# print results
cv_result = pd.DataFrame(grid_search.cv_results_)
pprinter = pp.PrettyPrinter(indent=4)
print(cv_result[[
    "param_eta", "param_alpha", "param_lambda_", "param_n_hidden",
    "mean_train_score", "std_train_score", "mean_test_score", "std_test_score"
]])
print("Best parameters:")
pprinter.pprint(grid_search.best_params_)

# refit model over whole training set
Example #7
def main():
    #*************************************************************************************
    #1. load the data (training and test) and preprocess it (replace NA, 98, 96, and 0 (for age) with NaN)
    #   read the data using pandas
    #   replace 98 and 96 with NaN for NOTime30-59, NOTimes90, NOTime60-89
    #   replace 0 with NaN for age
    #*************************************************************************************
    colnames = ['ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', \
                'DebtRatio', 'Income', 'NOCredit', 'NOTimes90', \
                'NORealEstate', 'NOTime60-89', 'NODependents']
    col_nas = ['', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', \
                [98, 96], 'NA', [98, 96], 'NA']
    col_na_values = creatDictKV(colnames, col_nas)
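    # creatDictKV is a helper defined elsewhere in the original script; presumably
    # it zips the two lists into a {column_name: na_values} dict, e.g.:
    #   def creatDictKV(keys, values):
    #       return {k: v for k, v in zip(keys, values)}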

    dftrain = pd.read_csv("cs-training.csv", names=colnames, \
                          na_values=col_na_values, skiprows=[0])
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    x_train = dftrain.as_matrix()

    dftest = pd.read_csv("cs-test.csv", names=colnames, \
                         na_values=col_na_values, skiprows=[0])
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.as_matrix()

    #*************************************************************************************
    #2. split the training data into training_new and test_new (to validate the model)
    #   use StratifiedShuffleSplit so the split keeps the class ratio
    #*************************************************************************************

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    for train_index, test_index in sss.split(x_train, y_train):
        print("TRAIN:", train_index, "TEST:", test_index)
        x_train_new, x_test_new = x_train[train_index], x_train[test_index]
        y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new

    #*****************************************************************************************
    #3.impute the data with imputer: replace MVs with Mean
    #*****************************************************************************************
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    #*****************************************************************************************
    #4.Build RF model using the training_new data:
    #   a. handle imbalanced data distribution by
    #      setting class_weight="balanced"/"balanced_subsample"
    #      n_samples / (n_classes * np.bincount(y))
    #*****************************************************************************************
    #  Initialize the model:
    #*****************************************************************************************
    rf = RandomForestClassifier(n_estimators=100, \
                                oob_score=True, \
                                min_samples_split=2, \
                                min_samples_leaf=50, \
                                n_jobs=-1, \
                                #class_weight="balanced",\
                                class_weight="balanced_subsample", \
                                bootstrap=True\
                                )
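    # A hedged illustration of the class_weight formula quoted above,
    # n_samples / (n_classes * np.bincount(y)), on a made-up label vector:
    y_demo = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
    print(len(y_demo) / (2 * np.bincount(y_demo)))  # [0.625 2.5 ]: the rare class is weighted up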
    #*************************************************************************************
    #   b. perform parameter tuning using grid search with CrossValidation
    #*************************************************************************************

    #param_grid={"max_features": [2,3,4,5],\
    #	 "min_samples_leaf": [30,40,50,100],\
    #	 "criterion": ["gini", "entropy"]}
    param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
    grid_search = GridSearchCV(rf,
                               cv=10,
                               scoring='roc_auc',
                               param_grid=param_grid,
                               iid=False)  # iid was removed in scikit-learn 0.24; drop it on recent versions

    #*************************************************************************************
    #   c. output the best model and make predictions for test data
    #       - Use best parameter to build model with training_new data
    #*************************************************************************************
    grid_search.fit(x_train, y_train)
    print "the best parameter:", grid_search.best_params_
    print "the best score:", grid_search.best_score_
    #print "the parameters used:",grid_search.get_params

    #*************************************************************************************
    #   To see how well the model fits the training_new data
    #       - Use the trained model to make predictions for the train_new data
    #*************************************************************************************

    predicted_probs_train = grid_search.predict_proba(x_train)
    predicted_probs_train = [x[1] for x in predicted_probs_train]
    computeAUC(y_train, predicted_probs_train)

    #*************************************************************************************
    #   To see how well the model performs on the test_new data
    #    - Use the trained model to make predictions for the validation data (test_new)
    #*************************************************************************************
    predicted_probs_test_new = grid_search.predict_proba(x_test_new)
    predicted_probs_test_new = [x[1] for x in predicted_probs_test_new]
    computeAUC(y_test_new, predicted_probs_test_new)

    #*************************************************************************************
    #  use the model to predict for test and output submission file
    #*************************************************************************************
    predicted_probs_test = grid_search.predict_proba(x_test)
    predicted_probs_test = ["%.9f" % x[1] for x in predicted_probs_test]
    submission = pd.DataFrame({
        'ID': test_id,
        'Probabilities': predicted_probs_test
    })
    submission.to_csv("rf_benchmark.csv", index=False)
Example #8
 def train(self,
           datas,
           labels,
           model="LGBM",
           gridsearch=False,
           parameters=None):
     x_vec = np.array([self.get_vec(i) for i in datas])
     label_uni = labels.unique()
     num_class = label_uni.size
     label_map = {label: ind for ind, label in enumerate(label_uni)}
     print(label_map)
     _labels = labels.map(label_map)
     print("Original dataset shape %s" % Counter(_labels))
     smote_enn = SMOTEENN(random_state=0)
     x_sample, y_sample = smote_enn.fit_sample(x_vec, _labels)
     print(sorted(Counter(y_sample).items()))
     print('re-sampled dataset shape %s' % Counter(y_sample))
     x_train, x_test, y_train, y_test = train_test_split(x_sample,
                                                         y_sample,
                                                         test_size=0.2,
                                                         random_state=123)
     print("classifier model is:%s" % model)
     y_one_hot = label_binarize(y_test, np.arange(num_class))
     clf_model = self.clf_models[model]
     if gridsearch:
         gsearch = GridSearchCV(clf_model,
                                param_grid=parameters['params'],
                                scoring='accuracy',
                                cv=parameters['cv'],
                                n_jobs=-1)
         gsearch.fit(x_train, y_train)
         print("Best score: %0.3f" % gsearch.best_score_)
         print("Best parameters set:")
         best_parameters = gsearch.best_estimator_.get_params()
         for param_name in sorted(parameters['params'].keys()):
             print("\t%s: %r" % (param_name, best_parameters[param_name]))
         evaluation = self.model_metrics(gsearch.best_estimator_, x_test,
                                         y_test)
         print(evaluation)
         y_score = gsearch.predict_proba(x_test)
         fpr, tpr, threshold = roc_curve(y_one_hot.ravel(), y_score.ravel())
         roc_auc = auc(fpr, tpr)
         plt.figure(figsize=(10, 10))
         plt.plot(fpr,
                  tpr,
                  color='darkorange',
                  lw=2.0,
                  label='ROC curve (area = %0.2f)' % roc_auc)
         plt.plot([0, 1], [0, 1], color='navy', lw=2.0, linestyle='--')
         plt.xlim([0.0, 1.0])
         plt.ylim([0.0, 1.05])
         plt.xlabel('False Positive Rate')
         plt.ylabel('True Positive Rate')
         plt.title('%s ROC curve' % model)
         plt.legend(loc="lower right")
         plt.show()
         joblib.dump(gsearch.best_estimator_,
                     os.path.join(os.getcwd(), model + "gsearch.m"))
     else:
         clf_model.fit(x_train, y_train)
         joblib.dump(clf_model, os.path.join(os.getcwd(), model + ".m"))
         evaluation = self.model_metrics(clf_model, x_test, y_test)
         print(evaluation)
         y_score = clf_model.predict_proba(x_test)
         print(y_score - y_one_hot)
         fpr, tpr, threshold = roc_curve(y_one_hot.ravel(), y_score.ravel())
         roc_auc = auc(fpr, tpr)
         plt.figure(figsize=(10, 10))
         plt.plot(fpr,
                  tpr,
                  color='darkorange',
                  lw=2.0,
                  label='ROC curve (area = %0.2f)' % roc_auc)
         plt.plot([0, 1], [0, 1], color='navy', lw=2.0, linestyle='--')
         plt.xlim([0.0, 1.0])
         plt.ylim([0.0, 1.05])
         plt.xlabel('False Positive Rate')
         plt.ylabel('True Positive Rate')
         plt.title('%s ROC curve' % model)
         plt.legend(loc="lower right")
         plt.show()
         with open("training_evaluation.txt", 'w') as f:
             s = model + '\n' + str(evaluation) + str(label_map)
             f.write(s)
Example #9
for i in range(n_onehot):
    col.append(onehot_attributes[i])
df_imputed_scaled = pd.DataFrame(data_array, columns=col)
print(df_imputed_scaled.shape)

x = df_imputed_scaled.drop(['good_bad'], axis=1)
print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

y = df['good_bad']

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn import tree

parameters = {'max_depth': range(4, 20)}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4, cv=10)
clf.fit(X=x, y=y)
predictions = clf.predict(x)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

print("Accuracy")
print(accuracy_score(y, predictions))
mat = confusion_matrix(y, predictions)
print(classification_report(y, predictions))

features = list(x.columns.values)
print(features)
from IPython.core.display import display, Image

print("To display the tree")
Example #10
# We then initialize a DecisionTreeClassifier object with two arguments.
decTree = DecisionTreeClassifier(criterion='gini', random_state=50)

# Finally, we fit the model on the training data
decTree.fit(X_train, y_train)

# and evaluate its accuracy on the test data.
decTree.score(X_test, y_test)
y_pred = decTree.predict(X_test)

# Evaluation with a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Build the grid of hyperparameters to try
grid_params = {
    'max_depth': [1, 2, 3, 4, 5, 6],
    'min_samples_leaf': [0.02, 0.04, 0.06, 0.08]
}

# We create a GridSearchCV object with the decision tree classifier as the estimator
grid_object = GridSearchCV(estimator=decTree,
                           param_grid=grid_params,
                           scoring='accuracy',
                           cv=10)

# We then fit this grid object to the training data
grid_object.fit(X_train, y_train)

# Extract the best parameters
grid_object.best_params_
Example #11
1. the number of trees
2. the maximum depth of each tree
3. the minimum samples per internal node and per leaf node
4. the number of features

In addition, the loss used while tuning is mean squared error, with 5-fold cross validation
'''
X, y = trainData[numFeatures2], trainData['rec_rate']
'''
Grid search parameters
'''
param_test1 = {'n_estimators': range(10, 80, 5)}  # one candidate every 5 from 10 to 80
gsearch1 = GridSearchCV(estimator=RandomForestRegressor(min_samples_split=50,
                                                        min_samples_leaf=10,
                                                        max_depth=8,
                                                        max_features='sqrt',
                                                        random_state=10),
                        param_grid=param_test1,
                        scoring='neg_mean_squared_error',
                        cv=5)
gsearch1.fit(X, y)
print(gsearch1.best_params_, gsearch1.best_score_)
best_n_estimators = gsearch1.best_params_['n_estimators']  # estimated best number of trees

param_test2 = {
    'max_depth': range(3, 21),
    'min_samples_split': range(10, 100, 10)
}
gsearch2 = GridSearchCV(estimator=RandomForestRegressor(
    n_estimators=best_n_estimators,
    min_samples_leaf=10,
    max_features='sqrt',
Example #12
#Check model performance on test data
y.value_counts()

y_pred = dtClassifier.predict(X_test)
from sklearn import metrics
metrics.roc_auc_score(y_test, y_pred)

#GridSearchCV to find optimal max_depth with Gini index as splitting criteria
from sklearn.model_selection import GridSearchCV

params_grid = {'criterion': ['gini'], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]}

classifier = DecisionTreeClassifier()
clf = GridSearchCV(estimator=classifier,
                   param_grid=params_grid,
                   cv=10,
                   scoring='roc_auc')
clf.fit(X_train, y_train)
clf.best_params_
clf.best_score_
#Optimal max_depth = 4

dtClassifierOpt = DecisionTreeClassifier(max_depth=4, criterion='gini')
dtClassifierOpt.fit(X_train, y_train)

#Displaying the decision tree (needs GraphViz installed on the machine)
from sklearn.tree import export_graphviz
import pydotplus as pdot
from IPython.display import Image

#Export the tree into an odt file
Example #13
def dcv_clf(X, y, model, param_grid, niter):
    """
    Double cross validation (classification)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data

    y : array-like, shape = [n_samples]
        y training+test data

    model: estimator object.
        This is assumed to implement the scikit-learn estimator interface.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored.

    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop
    scores = np.zeros((niter, 5))
    for iiter in range(niter):
        ypreds = np.array([])  # list of predicted y in outer loop
        ytests = np.array([])  # list of y_test in outer loop
        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for test of the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]  # inner loop CV
            y_train, y_test = y[train_index], y[test_index]  # outer loop

            # [start] inner loop CV for hyper parameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyper parameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)

        # [end] outer loop for test of the generalization error
        tn, fp, fn, tp = confusion_matrix(ytests, ypreds).ravel()
        acc = accuracy_score(ytests, ypreds)
        scores[iiter, :] = np.array([tp, fp, fn, tn, acc])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    print('TP   DCV: {:.3f} (+/-{:.3f})'.format(means[0], stds[0]))
    print('FP   DCV: {:.3f} (+/-{:.3f})'.format(means[1], stds[1]))
    print('FN   DCV: {:.3f} (+/-{:.3f})'.format(means[2], stds[2]))
    print('TN   DCV: {:.3f} (+/-{:.3f})'.format(means[3], stds[3]))
    print('Acc. DCV: {:.3f} (+/-{:.3f})'.format(means[4], stds[4]))
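
# Hedged usage sketch (added for illustration): run the double cross validation
# above on a toy dataset with a small SVC grid. The data and grid are made up,
# and the imports the function body relies on (numpy, KFold, GridSearchCV,
# the metrics) are assumed to be in scope as in the original module.
from sklearn.datasets import make_classification
from sklearn.svm import SVC
X_demo, y_demo = make_classification(n_samples=200, random_state=0)
dcv_clf(X_demo, y_demo, SVC(), {'C': [0.1, 1.0, 10.0]}, niter=3)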
Example #14
def dcv_rgr(X, y, model, param_grid, niter):
    """
    Double cross validation (regression)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        X training+test data

    y : array-like, shape = [n_samples]
        y training+test data

    model: 
        machine learning model (scikit-learn)

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored.

    niter : int
        number of DCV iteration

    Returns
    -------
    None
    """
    # parameters
    ns_in = 3  # n_splits for inner loop
    ns_ou = 3  # n_splits for outer loop
    scores = np.zeros((niter, 3))
    for iiter in range(niter):
        ypreds = np.array([])  # list of predicted y in outer loop
        ytests = np.array([])  # list of y_test in outer loop
        kf_ou = KFold(n_splits=ns_ou, shuffle=True)

        # [start] outer loop for test of the generalization error
        for train_index, test_index in kf_ou.split(X):
            X_train, X_test = X[train_index], X[test_index]  # inner loop CV
            y_train, y_test = y[train_index], y[test_index]  # outer loop

            # [start] inner loop CV for hyper parameter optimization
            kf_in = KFold(n_splits=ns_in, shuffle=True)
            gscv = GridSearchCV(model, param_grid, cv=kf_in)
            gscv.fit(X_train, y_train)
            # [end] inner loop CV for hyper parameter optimization

            # test of the generalization error
            ypred = gscv.predict(X_test)
            ypreds = np.append(ypreds, ypred)
            ytests = np.append(ytests, y_test)

        # [end] outer loop for test of the generalization error
        rmse = np.sqrt(mean_squared_error(ytests, ypreds))
        mae = mean_absolute_error(ytests, ypreds)
        r2 = r2_score(ytests, ypreds)
        #        print('DCV:RMSE, MAE, R^2 = {:.3f}, {:.3f}, {:.3f}'\
        #        .format(rmse, mae, r2))
        scores[iiter, :] = np.array([rmse, mae, r2])

    means, stds = np.mean(scores, axis=0), np.std(scores, axis=0)
    print()
    print('Double Cross Validation')
    print('In {:} iterations, average +/- standard deviation'.format(niter))
    #    print('RMSE: {:6.3f} (+/-{:6.3f})'.format(means[0], stds[0]))
    #    print('MAE : {:6.3f} (+/-{:6.3f})'.format(means[1], stds[1]))
    #    print('R^2 : {:6.3f} (+/-{:6.3f})'.format(means[2], stds[2]))
    print('DCV:RMSE, MAE, R^2 = {:6.3f}, {:6.3f}, {:6.3f} (ave)'\
          .format(means[0], means[1], means[2]))
    print('DCV:RMSE, MAE, R^2 = {:6.3f}, {:6.3f}, {:6.3f} (std)'\
          .format(stds[0], stds[1], stds[2]))
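
# Hedged usage sketch (added for illustration): the regression variant on a toy
# dataset, tuning the regularization strength of a Ridge model; names below are
# made up, and the regression metrics are assumed imported as in the original.
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
Xr_demo, yr_demo = make_regression(n_samples=200, noise=10.0, random_state=0)
dcv_rgr(Xr_demo, yr_demo, Ridge(), {'alpha': [0.1, 1.0, 10.0]}, niter=3)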
Example #15
# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create logistic regression
logistic = linear_model.LogisticRegression()

# Create range of 20 candidate values for C
C = np.logspace(0, 4, 20)

# Create hyperparameter options
hyperparameters = dict(C=C)
# Create grid search

gridsearch = GridSearchCV(logistic,
                          hyperparameters,
                          cv=5,
                          n_jobs=-1,
                          verbose=0)

# Conduct nested cross-validation and output the average score
cross_val_score(gridsearch, features, target).mean()

gridsearch = GridSearchCV(logistic, hyperparameters, cv=5, verbose=1)

best_model = gridsearch.fit(features, target)

scores = cross_val_score(gridsearch, features, target)
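
# The outer cross_val_score treats the entire grid search as the estimator, so
# each outer fold re-runs the inner 5-fold search. A hedged sketch of reporting
# the nested estimate:
print("nested CV accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))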
Example #16
def main_clf(metric_,
             clf_,
             grid_,
             range_=(2, 7),
             cv_=5,
             verb_=False,
             graphs=False):
    pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', clf_)])
    max_scoring = 0
    for k in range(*range_):
        denue_wide = pd.read_csv(f"summary/Count/denue_wide_{k}.csv")  ###
        rezago = pd.read_csv("rezago_social/rezago_social.csv")
        rezago_social = rezago[[
            "lgc00_15cl3_2", "Key", "POB_TOTAL", "LAT", "LON"
        ]]
        df = pd.merge(rezago_social, denue_wide, on=['Key'])
        y = rezago_social['lgc00_15cl3_2']
        df.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"], axis=1, inplace=True)
        X = df.div(df.POB_TOTAL, axis=0) * 1000
        X.drop(["POB_TOTAL"], axis=1, inplace=True)
        X["LAT"] = rezago_social["LAT"]
        X["LON"] = rezago_social["LON"]
        print(f'# CLF {k} {X.shape}')
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            stratify=y,
                                                            test_size=0.20,
                                                            random_state=0)
        clf_cv = GridSearchCV(pipe,
                              grid_,
                              cv=cv_,
                              scoring=metric_,
                              verbose=verb_)  # cv_
        clf_cv.fit(X_train, y_train)
        if np.mean(clf_cv.best_score_) > max_scoring:
            max_scoring = clf_cv.best_score_
            print(f"\t # {k} CLF {clf_cv.best_score_} {clf_cv.best_params_}")
            best_params = clf_cv.best_params_
            best_k = k
            Xtrain, ytrain = X_train, y_train
            Xtest, ytest = X_test, y_test
            X_, y_ = X, y
    best_params_ = {k[5:]: v for k, v in best_params.items()}
    best_clf = clf_.set_params(**best_params_)
    best_pipe = Pipeline(steps=[('sc', StandardScaler()), ('clf', best_clf)])
    print('#BEST', best_pipe, max_scoring)
    best_pipe.fit(Xtrain, ytrain)
    print(f"# {best_k}: Train:{best_pipe.score(Xtrain, ytrain) * 100}")
    print(f"# {best_k}: Test:{best_pipe.score(Xtest, ytest) * 100}")
    scores = cross_val_score(best_pipe,
                             X_,
                             y_,
                             cv=cv_,
                             n_jobs=-1,
                             scoring='accuracy')
    print(f"# {best_k}: Accuracy CV5:{np.mean(scores)} +/- {np.std(scores)}")
    scores_ = cross_val_score(best_pipe,
                              X_,
                              y_,
                              cv=cv_,
                              n_jobs=-1,
                              scoring=metric_)
    print(
        f"# {best_k}: {metric_} CV5:{np.mean(scores_)} +/- {np.std(scores_)}")
    y_pred = cross_val_predict(best_pipe, X_, y_, cv=cv_)
    print(classification_report(y_, y_pred, digits=3))

    print(np.unique(np.array(y_pred), return_counts=True))

    if graphs:
        # plot_multiclass_roc(best_pipe, X_, y_, n_classes=3, figsize=(16, 10))
        probas = cross_val_predict(best_pipe,
                                   X_,
                                   y_,
                                   cv=cv_,
                                   method='predict_proba')
        fig, (ax1, ax2) = plt.subplots(1, 2)
        skplt.metrics.plot_roc(y_, probas, ax=ax1, title='')
        handles, labels = ax1.get_legend_handles_labels()
        # print(labels)
        labels = [
            lb.replace(' 1 ', ' A ').replace(' 2 ',
                                             ' M ').replace(' 3 ', ' B ')
            for lb in labels
        ]
        # print(labels)
        ax1.legend(handles, labels)
        ax1.get_figure()
        ax1.set_xlabel('TFP\n(A)')
        skplt.metrics.plot_precision_recall(y_, probas, ax=ax2, title='')
        handles, labels = ax2.get_legend_handles_labels()
        # print(labels)
        labels = [
            lb.replace(' 1 ', ' A ').replace(' 2 ',
                                             ' M ').replace(' 3 ', ' B ')
            for lb in labels
        ]
        # print(labels)
        ax2.legend(handles, labels)
        ax2.get_figure()
        ax2.set_xlabel('S\n(B)')
        plt.show()

        ### 2016
        denue_2016 = pd.read_csv(
            f"summary/201610/denue_wide_{best_k}.csv")  ###
        df_2016 = pd.merge(rezago_social, denue_2016, on=['Key'])
        df_2016.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"],
                     axis=1,
                     inplace=True)
        X_2016 = df_2016.div(df.POB_TOTAL, axis=0) * 1000
        X_2016.drop(["POB_TOTAL"], axis=1, inplace=True)
        X_2016["LAT"] = rezago_social["LAT"]
        X_2016["LON"] = rezago_social["LON"]
        print(X_2016.columns)
        y_pred_2016 = best_pipe.predict(X_2016)
        ### 2017
        denue_2017 = pd.read_csv(
            f"summary/201711/denue_wide_{best_k}.csv")  ###
        df_2017 = pd.merge(rezago_social, denue_2017, on=['Key'])
        df_2017.drop(["lgc00_15cl3_2", "Key", "LAT", "LON"],
                     axis=1,
                     inplace=True)
        X_2017 = df_2017.div(df.POB_TOTAL, axis=0) * 1000
        X_2017.drop(["POB_TOTAL"], axis=1, inplace=True)
        X_2017["LAT"] = rezago_social["LAT"]
        X_2017["LON"] = rezago_social["LON"]
        y_pred_2017 = best_pipe.predict(X_2017)
        # ### 2018
        # denue_2018 = pd.read_csv(f"summary/201811/denue_wide_{best_k}.csv")  ###
        # df_2018 = pd.merge(rezago_social, denue_2018, on=['Key'])
        # df_2018.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2018 = df_2018.div(df.POB_TOTAL, axis=0) * 1000
        # X_2018.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2018["LAT"] = rezago_social["LAT"]
        # X_2018["LON"] = rezago_social["LON"]
        # y_pred_2018 = best_pipe.predict(X_2018)
        # ### 2019
        # denue_2019 = pd.read_csv(f"summary/201911/denue_wide_{best_k}.csv")  ###
        # df_2019 = pd.merge(rezago_social, denue_2019, on=['Key'])
        # df_2019.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2019 = df_2019.div(df.POB_TOTAL, axis=0) * 1000
        # X_2019.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2019["LAT"] = rezago_social["LAT"]
        # X_2019["LON"] = rezago_social["LON"]
        # y_pred_2019 = best_pipe.predict(X_2019)
        # ### 2020
        # denue_2020 = pd.read_csv(f"summary/202011/denue_wide_{best_k}.csv")  ###
        # df_2020 = pd.merge(rezago_social, denue_2020, on=['Key'])
        # df_2020.drop(["lgc00_15cl3", "Key", "LAT", "LON"], axis=1, inplace=True)
        # X_2020 = df_2020.div(df.POB_TOTAL, axis=0) * 1000
        # X_2020.drop(["POB_TOTAL"], axis=1, inplace=True)
        # X_2020["LAT"] = rezago_social["LAT"]
        # X_2020["LON"] = rezago_social["LON"]
        # y_pred_2020 = best_pipe.predict(X_2020)
        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_,
                                            y_pred,
                                            normalize=True,
                                            title=" ")
        plt.xticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.yticks([0, 1, 2], ['B', 'M', 'A'], rotation='horizontal')
        plt.xlabel('Predicted classes')
        plt.ylabel('True classes')
        plt.show()
        # Map
        rezago_social['Pred'] = y_pred
        rezago_social['Pred_2016'] = y_pred_2016
        rezago_social['Pred_2017'] = y_pred_2017
        # rezago_social['Pred_2018'] = y_pred_2018
        # rezago_social['Pred_2019'] = y_pred_2019
        # rezago_social['Pred_2020'] = y_pred_2020
        rezago_social.to_csv('predictions.csv')  ###
        rezago_social['Key_'] = rezago_social['Key'].astype(str).str.zfill(5)
        gdf = gpd.read_file('municipios/areas_geoestadisticas_municipales.shp')
        gdf['Key_'] = gdf['CVE_ENT'] + gdf['CVE_MUN']
        gdf = gdf.merge(rezago_social, on='Key_')
        legend_elements = [
            Line2D(
                [0],
                [0],
                marker='o',
                color='w',
                label='B',
                markerfacecolor='g',
                markersize=10,
            ),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='M',
                   markerfacecolor='yellow',
                   markersize=10),
            Line2D([0], [0],
                   marker='o',
                   color='w',
                   label='A',
                   markerfacecolor='r',
                   markersize=10)
        ]
        csfont = {'fontname': 'Times New Roman'}
        font = font_manager.FontProperties(family='Times New Roman',
                                           weight='normal',
                                           style='normal',
                                           size=12)
        colors = {3: 'green', 2: 'yellow', 1: 'red'}
        models = {
            'RandomForestClassifier': 'RF',
            'SVC': 'SVM',
            'LogisticRegression': 'LR'
        }
        ###
        # gdf.plot(color=gdf['Pred_2016'].map(colors))
        # plt.xticks([])
        # plt.yticks([])
        # txt = f"Categorías predichas por modelo {models.get(clf.__class__.__name__, 'ABC')}, para el año 201X."
        # plt.text(800000, 0.01, txt, wrap=True, horizontalalignment='left', fontsize=12, **csfont)
        # plt.legend(handles=legend_elements, prop=font)
        # plt.show()
        ### Map
        fig, (ax1, ax2) = plt.subplots(1, 2)
        gdf.plot(ax=ax1, color=gdf['Pred_2016'].map(colors))
        ax1.set_xticks([])
        ax1.set_yticks([])
        # txt = f"(A) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2016"
        ax1.set_xlabel("(A)", **csfont)
        # ax1.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax1.legend(handles=legend_elements, prop=font)
        gdf.plot(ax=ax2, color=gdf['Pred_2017'].map(colors))
        ax2.set_xticks([])
        ax2.set_yticks([])
        # txt = f"(B) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2017"
        ax2.set_xlabel("(B)", **csfont)
        # ax2.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax2.legend(handles=legend_elements, prop=font)
        plt.show()

        ### Map
        fig, (ax1, ax2) = plt.subplots(1, 2)
        gdf.plot(ax=ax1, color=gdf['lgc00_15cl3_2'].map(colors), legend=True)
        ax1.set_xticks([])
        ax1.set_yticks([])
        # txt = "(A) Clases de acuerdo a Valdés-Cruz y Vargas-Chanes (2017)"
        ax1.set_xlabel("(A)", **csfont)
        # ax1.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax1.legend(handles=legend_elements, prop=font)
        gdf.plot(ax=ax2, color=gdf['Pred'].map(colors))
        ax2.set_xticks([])
        ax2.set_yticks([])
        # txt = f"(B) Clases predichas con modelo {models.get(clf.__class__.__name__, 'ABC')} en 2015"
        ax2.set_xlabel("(B)", **csfont)
        # ax2.text(800000, 0.01, txt, wrap=True, horizontalalignment='center', fontsize=12, **csfont)
        ax2.legend(handles=legend_elements, prop=font)
        plt.show()
        # ROC curve
        y_bin = label_binarize(y, classes=[1, 2, 3])
        n_classes = y_bin.shape[1]
        y_score = cross_val_predict(best_pipe,
                                    X_,
                                    y_,
                                    cv=cv_,
                                    method='predict_proba')
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
        plt.figure()
        plt.plot(fpr["macro"],
                 tpr["macro"],
                 label='ROC macro (AUC = {0:0.3f})'
                 ''.format(roc_auc["macro"]),
                 color='navy',
                 linestyle=':',
                 linewidth=4)
        rezago = {1: 'B', 2: 'M', 3: 'A'}
        colors = cycle(['green', 'yellow', 'red'])
        for i, color in zip(range(n_classes), colors):
            plt.plot(fpr[i],
                     tpr[i],
                     color=color,
                     lw=2,
                     label='Rezago class {0} (AUC = {1:0.3f})'
                     ''.format(rezago[i + 1], roc_auc[i]))
        plt.plot([0, 1], [0, 1], 'k--', lw=2)
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('TFP', fontsize=12, **csfont)
        plt.ylabel('TVP', fontsize=12, **csfont)
        plt.legend(loc="lower right", prop=font)
        plt.show()
    return scores_
Example #17
def modeling(conn, sentences, lib, dz):
#def modeling(conn, df, lib, dz):
  
    #pts = pd.read_sql("SELECT DISTINCT SUBJECT_ID from UFM", conn)
    #pts =list(set(pts.SUBJECT_ID))
    #pool = []
    #for d in dz:
    #    pool += d.pos + d.neg
    np.random.seed(7)
    decay_rate = .0002  # a plain "decay" name would shadow the decay() helper called below
    data = []; train = []; test = []
    keys = [k[1] for k in lib]
    
    admits = pd.read_sql("SELECT * from admissions", conn)
    
    for itr in range(0,5):
        print ("Sess: {0}".format(itr))
        for d in dz:
            neg = random.sample(d[1], len(d[0]))
            temp = d[0] + neg
            random.shuffle(temp)
            t1, t2 = cross_validation.train_test_split(temp, test_size=.2)  # legacy sklearn.cross_validation API
            train +=t1; test +=t2
                    
        #X stands for raw indexes of feature input; V stands for raw feature input
        #W stands for word vectors from feature input trained by Word2Vec
        X_train = []; t_train = []; W_train = []; Y_train = []
        X_test = []; t_test = []; W_test = []; Y_test = []
        V_train = []; V_test = []
    
        count=0
        for t in train:
            print (count)
            count+=1

            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
            #order subject by time of entry for each sentence (admission)
            corpus = sorted(corpus, key = lambda x: x[1])
            #transpose into nx2xd from 2xnxd
            #this way, corpus[0] refers to words and corpus[1] refers to times
            corpus = list(map(list, zip(*corpus)))                  
            x_train = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            x = np.array(list(map(lambda x: keys.index(x), x_train)))
     
            #configure each timestamp to reflect time elapsed from first time entry
            #calculate time decay from initial event
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
                
            #append
            X_train.append(x)
            V_train.append(np.array(x_train))
            t_train.append(np.array(t_stamps))
            Y_train.append(t[3])
                
        print ("X_train made.")

        count = 0
        for t in test:
            print (count)
            count+=1
                
            corpus = [[s[2], s[3]] for s in sentences if  (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])]
                
            corpus = sorted(corpus, key = lambda x: x[1])
            corpus = list(map(list, zip(*corpus)))                  
            x_test = list(chain.from_iterable(corpus[0]))
            t_stamps = list(chain.from_iterable(corpus[1]))
            temp = t_stamps[0]
            t_stamps = [ii-temp for ii in t_stamps]
            x = np.array(list(map(lambda x: keys.index(x), x_test)))
            
            X_test.append(x)
            V_test.append(np.array(x_test))
            t_test.append(np.array(t_stamps))
            Y_test.append(t[3])            
                           
        #training normal LSTM and CNN-LSTM          
        top_words = [9444]
        max_review_length = [1000]
        embedding_length = [300]          
        X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0])
        X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0])


        #build model using KerasClassifier and Gridsearch
        cnn = KerasClassifier(build_fn=cnn_train, verbose=1)
        lstm = KerasClassifier(build_fn=lstm_train, verbose=1)
        d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose = 1)
        d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose = 1)
        # define the grid search parameters

        batch_size = [32, 64, 128]
        epochs = [20, 50, 100, 200]
        optimizer = ['SGD', 'RMSprop', 'Adam']
        learn_rate = (10.0**np.arange(-4,-1)).tolist()
        momentum = np.arange(.5,.9,.1).tolist()
        neurons = [50, 100, 200]
        dropout_W = [.1, .2, .5]
        dropout_U = [.1, .2, .5]
        W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None]
        init_mode = ['uniform', 'normal', 'zero']
        #activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
        param_grid = dict(top_words=top_words, max_length=max_review_length,
                          embedding_length=embedding_length, batch_size=batch_size,
                          nb_epoch=epochs, optimizer=optimizer, learn_rate=learn_rate,
                          momentum=momentum, neurons=neurons, dropout_W=dropout_W,
                          dropout_U=dropout_U, W_regularizer=W_regularizer,
                          U_regularizer=U_regularizer, init_mode=init_mode)
        d_param_grid = dict(input_shape=[(max_review_length[0], embedding_length[0])],
                            batch_size=batch_size, nb_epoch=epochs, optimizer=optimizer,
                            learn_rate=learn_rate, momentum=momentum, neurons=neurons,
                            dropout_W=dropout_W, dropout_U=dropout_U,
                            W_regularizer=W_regularizer, U_regularizer=U_regularizer,
                            init_mode=init_mode)
        lr_params = {'C': (10.0**np.arange(-4, 4)).tolist(), 'penalty': ('l1', 'l2')}
        sv_params = {'C': (10.0**np.arange(-4, 4)).tolist(), 'kernel': ('linear', 'poly', 'rbf', 'sigmoid')}
        rf_params = {'criterion': ['gini', 'entropy']}

        #setup GridSearch w/ cross validation
        cnn_grid = GridSearchCV(estimator=cnn, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        lstm_grid = GridSearchCV(estimator=lstm, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_cnn_grid = GridSearchCV(estimator=d_cnn, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        d_lstm_grid = GridSearchCV(estimator=d_lstm, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1)
        # GridSearchCV tunes one estimator at a time, so each classic model gets its own grid
        lr_grid = GridSearchCV(estimator=LR, param_grid=lr_params, scoring='roc_auc', cv=5, n_jobs=-1)
        sv_grid = GridSearchCV(estimator=SVM, param_grid=sv_params, scoring='roc_auc', cv=5, n_jobs=-1)
        rf_grid = GridSearchCV(estimator=RF, param_grid=rf_params, scoring='roc_auc', cv=5, n_jobs=-1)

        # Fit the model
        cnn_result = cnn_grid.fit(X_train, Y_train)
        lstm_result = lstm_grid.fit(X_train, Y_train) 
        d_cnn_result = d_cnn_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train)
        d_lstm_result = d_lstm_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train) 
        decayed = decay(x=V_train, t_stamps=t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1]
        lr_result = lr_grid.fit(decayed, Y_train)
        sv_result = sv_grid.fit(decayed, Y_train)
        rf_result = rf_grid.fit(decayed, Y_train)
        # report whichever of the three classic models scored best
        classics_result = max((lr_result, sv_result, rf_result), key=lambda g: g.best_score_)
        
        #grid_search results:
        print("CNN Best: %f using %s" % (cnn_result.best_score_, cnn_result.best_params_))
        means = cnn_result.cv_results_['mean_test_score']
        stds = cnn_result.cv_results_['std_test_score']
        params = cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("LSTM Best: %f using %s" % (lstm_result.best_score_, lstm_result.best_params_))
        means = lstm_result.cv_results_['mean_test_score']
        stds = lstm_result.cv_results_['std_test_score']
        params = lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))
        
        print("Decay CNN Best: %f using %s" % (d_cnn_result.best_score_, d_cnn_result.best_params_))
        means = d_cnn_result.cv_results_['mean_test_score']
        stds = d_cnn_result.cv_results_['std_test_score']
        params = d_cnn_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Decay LSTM Best: %f using %s" % (d_lstm_result.best_score_, d_lstm_result.best_params_))
        means = d_lstm_result.cv_results_['mean_test_score']
        stds = d_lstm_result.cv_results_['std_test_score']
        params = d_lstm_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
            
        print("Best of Classics: %f using %s, %s" % (classics_result.best_score_, classics_result.best_estimator_, classics_result.best_params_))    
        means = classics_result.cv_results_['mean_test_score']
        stds = classics_result.cv_results_['std_test_score']
        params = classics_result.cv_results_['params']
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, params))        
        
        #KFold = 5
        #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
        #cvscores = []
        #for training, testing in kfold.split(X_train, Y_train):     
            # Fit the model
            #model.fit(X[training], Y[training], nb_epoch=150, batch_size=10, verbose=0)
            # evaluate the model
            #scores = model.evaluate(X[testing], Y[testing], verbose=0)
            #print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
            #cvscores.append(scores[1] * 100)
        #print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

 ######TESTING#######
        cnn = cnn_train(top_words=top_words[0], max_length=max_review_length[0], embedding_length=embedding_length[0])
        lstm = lstm_train(top_words=top_words[0], max_length=max_review_length[0], embedding_length=embedding_length[0])
            
        cnn.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)
        lstm.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1)

        #testing
        predictions_lstm = lstm.predict_classes(X_test)
        predictions_cnn = cnn.predict_classes(X_test)

        acc = accuracy_score(Y_test, predictions_lstm)
        f1 = f1_score (Y_test, predictions_lstm)
        auc = roc_auc_score (Y_test, predictions_lstm)
        scores_lstm = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        acc = accuracy_score(Y_test, predictions_cnn)
        f1 = f1_score (Y_test, predictions_cnn)
        auc = roc_auc_score (Y_test, predictions_cnn)
        scores_cnn = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)]

        print ("LSTM DATA: ")
        for s in scores_lstm:
            print("%s: %.2f" %(s[0], s[1]), end = " ")
        print ("")
        print ("CNN DATA: ")
        for s in scores_cnn:
            print("%s: %.2f" %(s[0], s[1]), end = " ")        
        
        
        data.append((scores_lstm, scores_cnn))

    return data
Example #18
x = PTrain_ad.iloc[:, 1:]
y = PTrain_ad.iloc[:, 0]  # label column, kept disjoint from the feature columns
x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=0.25,random_state=0)

#%% Benchmark model

para_lo=[{'penalty':['l1','l2'],
                'C':np.logspace(-1,1,10),
                'solver':['liblinear'],
                'multi_class':['ovr']},
                {'penalty':['l2'],
                 'C':np.logspace(-1,1,20),
                'solver':['lbfgs'],
                'multi_class':['ovr','multinomial']}]

logcv=GridSearchCV(LogisticRegression(),para_lo,cv=10,scoring='roc_auc')
log=logcv.fit(x_train,y_train)
yyy=log.predict(x_val)
log.best_estimator_.coef_
print("Number of defaults in validation set: {0}".format(sum(y_val)))
print("Number of defaults predicted: {0}".format(sum(yyy)))
print(accuracy_score(y_val, yyy))
print(confusion_matrix(y_val, yyy))
print(classification_report(y_val, yyy, digits=3))
print(log.best_estimator_)
#%% RandomForest
para = [{'n_estimators':[110,120],
'criterion':['entropy','gini'],
#'max_depth':[12,18,24],
'min_samples_split':[40], 
#'min_weight_fraction_leaf':[0.1,0.3,0.5],
Example #19
plt.title('Average score: {} and Std score : {}'.format(
    np.mean(cv_scores), np.std(cv_scores)))

# In[4]:
#Tune the parameters to best fit to the training data
N_E = 200
N_LR = 5
ADB = AdaBoostClassifier(base_estimator=classifier)
parameter_grid = {
    'n_estimators': np.arange(1, N_E + 20, 20),
    'learning_rate': np.linspace(0.1, 2, N_LR)
}
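# The grid spans 11 n_estimators values x 5 learning rates, i.e. 55 candidate
# models per cross-validation fold.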

cross_validation = StratifiedKFold(n_splits=3)  # the labels go to split(), not the constructor

grid_search = GridSearchCV(ADB, param_grid=parameter_grid, cv=cross_validation)

grid_search.fit(arr_in, arr_out)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

# In[5]:
#Visualisation of the grid over the tuning parameters
learning_rate = np.linspace(0.1, 2, N_LR)
n_estimators = np.arange(1, N_E + 20, 20)
plt.figure()
grid_visualization = []
grid_visualization.append(grid_search.cv_results_['mean_test_score'])
grid_visualization = np.array(grid_visualization)
grid_visualization.shape = (len(learning_rate), len(n_estimators))
Example #20
from sklearn.model_selection import GridSearchCV

params = {
    'gamma': [0.1, 1],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'min_child_weight': [1, 100],
    'n_estimators': [100, 200],
    'subsample': [1.0],
    'colsample_bytree': [1.0],
}
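# The grid above spans 2**5 = 32 parameter combinations, each refit once per
# cross-validation fold.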

gsearch1 = GridSearchCV(estimator=XGBClassifier(objective='binary:logistic',
                                                nthread=4,
                                                random_state=seed,
                                                seed=seed),
                        param_grid=params,
                        scoring='roc_auc',
                        n_jobs=-1)

gsearch1.fit(X_train, y_train)
#gsearch1.best_score_, gsearch1.best_params_, gsearch1.best_score_

print('tuned XGBClassifier')
print(gsearch1)
print('=================================================')
print('=================================================')
fpr, tpr, thresholds = metrics.roc_curve(y_train,
                                         gsearch1.predict_proba(X_train)[:, 1])
print('gini_train', 2 * metrics.auc(fpr, tpr) - 1)
Example #21
    # Training
    X = train_df_new
    y = y_train.total_count.values.reshape(-1, 1)
    dtr = DecisionTreeRegressor(max_depth=4, min_samples_split=5, max_leaf_nodes=10)
    dtr.fit(X, y)
    dot_data = tree.export_graphviz(dtr, out_file=None)
    graph = pydotplus.graph_from_dot_data(dot_data)
    graph.write_pdf("bike_share.pdf")

    # Grid Search with Cross validation
    param_grid = {"criterion": ["mse", "mae"],
                  "min_samples_split": [10, 20, 40],
                  "max_depth": [2, 6, 8],
                  "min_samples_leaf": [20, 40, 100],
                  "max_leaf_nodes": [5, 20, 100, 500, 800]}
    grid_cv_dtr = GridSearchCV(dtr, param_grid, cv=5)
    grid_cv_dtr.fit(X, y)

    # Cross Validation: Best Model Details
    df = pd.DataFrame(data=grid_cv_dtr.cv_results_)
    fig, ax = plt.subplots()
    sn.pointplot(data=df[['mean_test_score',
                          'param_max_leaf_nodes',
                          'param_max_depth']],
                 y='mean_test_score', x='param_max_depth',
                 hue='param_max_leaf_nodes', ax=ax)
    ax.set(title="Effect of Depth and Leaf Nodes on Model Performance")
    fig.savefig("cross_validation_best_model.png")

    # Residual Plot
    predicted = grid_cv_dtr.best_estimator_.predict(X)
Example #22
# Standardization
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SVM model configuration
svm_model = svm.SVC(kernel='rbf')

# C ∈ {0.02, 0.2, 2, 200} and γ ∈ {0.02, 0.2, 2, 200}
Cs = 2 * np.logspace(-2, 0, num=3, base=10)
Cs = np.append(Cs, 200)
Gs = Cs

# Nested K-fold cross-validation
optimo = GridSearchCV(estimator=svm_model,
                      param_grid=dict(C=Cs, gamma=Gs),
                      n_jobs=-1,
                      cv=5)

# Train the optimal model
optimo.fit(X_train, y_train)

# Optimal model configuration
print(optimo.best_params_)

# Optimal test CCR (correct classification rate)
print(optimo.score(X_test, y_test) * 100)

# Plot the points
plt.figure(1)
plt.clf()
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=10, cmap=plt.cm.Paired)
Example #23
corpusts = testdf['lyrics_string']
vectorizerts = TfidfVectorizer(stop_words='english')  # unused below
tfidfts = vectorizertr.transform(corpusts)  # transform with the vectorizer fitted on the training corpus

predictors_tr = tfidftr

targets_tr = traindf['genre']

predictors_ts = tfidfts


#classifier = LinearSVC(C=0.80, penalty="l2", dual=False)
parameters = {'C':[1, 10]}
#clf = LinearSVC()
clf = LogisticRegression()
#parameters = {'n_neighbors':[1,10]}
#clf = KNeighborsClassifier()
#parameters = {'min_samples_split': [2,10]}
#clf = DecisionTreeClassifier()
#clf = RandomForestClassifier()
### Neural network took too long; a loop comparing the alternatives above is sketched below

classifier = GridSearchCV(clf, parameters)

classifier = classifier.fit(predictors_tr, targets_tr)

predictions = classifier.predict(predictors_ts)
testdf['genre'] = predictions
# testdf = testdf.sort_values('id' , ascending=True)

testdf[['id' , 'lyrics_clean_string' , 'genre' ]].to_csv("submission.csv")
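# Sketch (not in the original): the commented-out alternatives above can be
# compared in one loop; the candidates and grids below are illustrative
# assumptions, and their imports are assumed to be in scope.
candidates = [
    (LinearSVC(dual=False), {'C': [1, 10]}),
    (LogisticRegression(), {'C': [1, 10]}),
    (KNeighborsClassifier(), {'n_neighbors': [1, 10]}),
    (DecisionTreeClassifier(), {'min_samples_split': [2, 10]}),
]
for est, grid in candidates:
    search = GridSearchCV(est, grid, cv=5)
    search.fit(predictors_tr, targets_tr)
    print(type(est).__name__, search.best_score_, search.best_params_)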
Example #24
# PCA modeling
pca = PCA(n_components=n_components, whiten=True).fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))
print("Projecting the data onto the principal components...")
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("Dimensionality reduction done")
###############################################################################
# Train the SVM
print("Training the SVM classification model...")
t0 = time()
# Parameter grid: 5 x 6 = 30 combinations
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
# RBF kernel for images; class weights chosen automatically
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("SVM training done in %0.3fs; best estimator found:" % (time() - t0))
print(clf.best_estimator_)

# ###############################################################################
# Evaluate on the test set
print("Evaluating the SVM classification model on the test set...")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("Test-set prediction took %0.3fs" % (time() - t0))

print("Error metrics")
# With a = number of true 1s, b = number of predicted 1s,
# and c = number of correctly predicted 1s:
# precision = c / b
# recall = c / a
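# Sketch (not in the original): the precision/recall defined above, computed
# directly with sklearn on the predictions from the previous step.
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))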
Example #25
class DelegatorData:
    def __init__(self,
                 name,
                 construct,
                 skip_methods=(),
                 fit_args=make_classification()):
        self.name = name
        self.construct = construct
        self.fit_args = fit_args
        self.skip_methods = skip_methods


DELEGATING_METAESTIMATORS = [
    DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])),
    DelegatorData(
        'GridSearchCV',
        lambda est: GridSearchCV(est, param_grid={'param': [5]}, cv=2),
        skip_methods=['score']),
    DelegatorData('RandomizedSearchCV',
                  lambda est: RandomizedSearchCV(
                      est, param_distributions={'param': [5]}, cv=2, n_iter=1),
                  skip_methods=['score']),
    DelegatorData('RFE',
                  RFE,
                  skip_methods=['transform', 'inverse_transform', 'score']),
    DelegatorData('RFECV',
                  RFECV,
                  skip_methods=['transform', 'inverse_transform', 'score']),
    DelegatorData('BaggingClassifier',
                  BaggingClassifier,
                  skip_methods=[
                      'transform', 'inverse_transform', 'score',
Example #26
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


def knncls():
    """
    Predict user check-in locations with K-nearest neighbors
    :return: None
    """
    # Read the data
    data = pd.read_csv("./data/FBlocation/train.csv")

    print(data.head(10))

    # Preprocess the data
    # 1. Narrow the data down with a bounding-box query
    data = data.query("x > 1.0 & x < 1.25 & y > 2.5 & y < 2.75")

    # Parse the timestamp column
    time_value = pd.to_datetime(data['time'], unit='s')

    print(time_value)

    # Convert the datetimes to a DatetimeIndex (for easy attribute access)
    time_value = pd.DatetimeIndex(time_value)

    # Engineer some date/time features
    data['day'] = time_value.day
    data['hour'] = time_value.hour
    data['weekday'] = time_value.weekday

    # Drop the raw timestamp feature
    data = data.drop(['time'], axis=1)

    print(data)

    # Drop target locations with 3 or fewer check-ins
    place_count = data.groupby('place_id').count()

    tf = place_count[place_count.row_id > 3].reset_index()

    data = data[data['place_id'].isin(tf.place_id)]

    # Split the data into features and target
    y = data['place_id']

    x = data.drop(['place_id'], axis=1)

    # Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

    # Feature engineering (standardization)
    std = StandardScaler()

    # Standardize the training and test features
    x_train = std.fit_transform(x_train)

    x_test = std.transform(x_test)

    # Algorithm workflow; n_neighbors is the hyperparameter to tune
    knn = KNeighborsClassifier()

    # fit, predict, score
    knn.fit(x_train, y_train)

    # Get predictions
    y_predict = knn.predict(x_test)
    # print("Predicted check-in locations:", y_predict)

    # Accuracy before tuning
    # print("Prediction accuracy:", knn.score(x_test, y_test))

    # Candidate hyperparameter values for the search
    param = {"n_neighbors": [3, 5, 10]}

    # Grid search
    gc = GridSearchCV(knn, param_grid=param, cv=2)

    gc.fit(x_train, y_train)

    # Accuracy of the tuned model
    print("Accuracy on the test set:", gc.score(x_test, y_test))

    print("Best cross-validation score:", gc.best_score_)

    print("Best estimator:", gc.best_estimator_)

    # Per-candidate CV results (a readable summary helper is sketched below)
    print("Cross-validation results for every candidate:", gc.cv_results_)

    return None
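# Sketch (not in the original): gc.cv_results_ prints as an unwieldy dict; a
# small helper renders any fitted GridSearchCV as a sorted table instead.
def summarize_search(search):
    """Per-candidate CV scores of a fitted search, best first."""
    cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
    return pd.DataFrame(search.cv_results_)[cols].sort_values('rank_test_score')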
     print('=> calculating mean and covariance')
     mean, cov = fit_norm_distribution_param(args,
                                             model,
                                             train_dataset,
                                             channel_idx=channel_idx)
 ''' 2. Train anomaly score predictor using support vector regression (SVR). (Optional) '''
 # An anomaly score predictor is trained on the hidden-layer outputs and the
 # corresponding anomaly scores from the train dataset. Its predictions on the
 # test dataset can serve as the baseline for the adaptive threshold.
 if args.compensate:
     print('=> training an SVR as anomaly score predictor')
     train_score, _, _, hiddens, _ = anomalyScore(
         args, model, train_dataset, mean, cov, channel_idx=channel_idx)
     score_predictor = GridSearchCV(SVR(),
                                    cv=5,
                                    param_grid={
                                        "C": [1e0, 1e1, 1e2],
                                        "gamma": np.logspace(-1, 1, 3)
                                    })
     score_predictor.fit(
         torch.cat(hiddens, dim=0).numpy(),
         train_score.cpu().numpy())
 else:
     score_predictor = None
 ''' 3. Calculate anomaly scores'''
 # Anomaly scores are calculated on the test dataset
 # given the mean and the covariance calculated on the train dataset
 print('=> calculating anomaly scores')
 score, sorted_prediction, sorted_error, _, predicted_score = anomalyScore(
     args,
     model,
     test_dataset,
n_folds = 6
# choosing different parameter combinations to try
param_grid = {'C': [0.01, 0.1, 1, 10],
              'gamma': [0.004, 0.001, 0.01, 0.1],
              'kernel': ['rbf', 'linear', 'poly'],
             }

# type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)


# run grid search
start_time = dt.datetime.now()
print('Start grid search at {}'.format(str(start_time)))

grid_search = GridSearchCV(classifier, param_grid, cv=n_folds, scoring=acc_scorer, n_jobs=4)
grid_obj = grid_search.fit(X_val, y_val)
# get grid search results
print(grid_obj.cv_results_)

# keep the best estimator found by the search (kernel, C and gamma tuned)
clf = grid_obj.best_estimator_
print(clf)
end_time = dt.datetime.now()
print('Stop grid search {}'.format(str(end_time)))
elapsed_time = end_time - start_time
print('Elapsed grid search time {}'.format(str(elapsed_time)))


# Fit the best estimator to the training data (completed in the sketch below)
start_time = dt.datetime.now()
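# Completion sketch (assumption, not in the original): the training-data fit
# announced above, assuming X_train / y_train hold the training split.
clf.fit(X_train, y_train)
end_time = dt.datetime.now()
print('Elapsed training time {}'.format(str(end_time - start_time)))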
    model.add(Dropout(0.2))
    model.add(
        Dense(units=16, activation='relu',
              kernel_initializer='random_uniform'))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer=optimizer, loss=loss, metrics=['binary_accuracy'])

    return model


# Moved to module level: in the original this line sat unreachable after `return`
classifier = KerasClassifier(build_fn=criarRede)


params = {
    'batch_size': [10, 30],
    'epochs': [50, 100],
    'optimizer': ['adam', 'sgd'],
    'loss': ['binary_crossentropy', 'hinge'],
    'kernel_initializer': ['random_uniform', 'normal'],
    'activation': ['relu', 'tanh'],
    'neurons': [16, 8]
}

grid_search = GridSearchCV(estimator=classifier,
                           param_grid=params,
                           scoring='accuracy',
                           cv=5)

grid_search = grid_search.fit(data_x, data_y)
best_params = grid_search.best_params_
best_accuracy = grid_search.best_score_  # scoring='accuracy'
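# Note (assumption, not from the original): for grid keys such as 'optimizer',
# 'loss', 'kernel_initializer', 'activation' and 'neurons' to be tunable, the
# unshown top of criarRede must accept them as arguments, roughly like this
# (the input dimension is a placeholder):
def criarRede(optimizer='adam', loss='binary_crossentropy',
              kernel_initializer='random_uniform', activation='relu',
              neurons=16):
    model = Sequential()
    model.add(Dense(units=neurons, activation=activation,
                    kernel_initializer=kernel_initializer,
                    input_dim=30))  # placeholder: number of input features
    model.add(Dropout(0.2))
    model.add(Dense(units=neurons, activation=activation,
                    kernel_initializer=kernel_initializer))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss=loss, metrics=['binary_accuracy'])
    return model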
Example #30
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=101)

# Use grid search to find the best kernel and regularization parameters for the SVC
clf = svm.SVC()
parameters = [{
    'kernel': ['poly'],
    'C': [0.5, 1, 5, 7, 8, 9, 10],
    'gamma': [0.5, 1, 10, 'auto', 'scale'],
    'degree': [1, 2, 3]
}]

grid_search = GridSearchCV(estimator=clf,
                           param_grid=parameters,
                           cv=5,
                           n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_score = grid_search.best_score_
best_parameters = grid_search.best_params_
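# Sketch (not in the original): report what the search found, and score the
# refit best estimator on the held-out split.
print('Best CV score: %.4f' % best_score)
print('Best parameters:', best_parameters)
print('Test accuracy: %.4f' % grid_search.score(X_test, y_test))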
"""
clf = svm.SVC(kernel = 'linear', C = 0.8)
clf.fit(X_train, y_train)

#computing the decision boundary
x1, x2, xx, yy = computeMesh(X_train[:,0], X_train[:,1], 0.02)
xy_mesh = np.c_[x1, x2] # Turn to Nx2 matrix
clzmesh = clf.predict(xy_mesh)
clzmesh = clzmesh.reshape(xx.shape)

fig, ax = plt.subplots()