Example #1
        regression(linear_model.LassoLarsIC()),
        regression(linear_model.OrthogonalMatchingPursuit()),
        regression(linear_model.OrthogonalMatchingPursuitCV()),
        regression(linear_model.Ridge(random_state=RANDOM_SEED)),
        regression(linear_model.RidgeCV()),
        regression(linear_model.BayesianRidge()),
        regression(linear_model.ARDRegression()),
        regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
        regression(
            linear_model.PassiveAggressiveRegressor(random_state=RANDOM_SEED)),

        # Logistic Regression
        classification(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifierCV()),
        classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification_binary(linear_model.RidgeClassifierCV()),
        classification_binary(
            linear_model.SGDClassifier(random_state=RANDOM_SEED)),

        # Decision trees
        regression(tree.DecisionTreeRegressor(**TREE_PARAMS)),
Example #2
def classify(X, y):
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X, y)
    return clf
test_X = imp.transform(test_X)

# Get method
print("Choose model. Options:")
print("1. Logistic regression")
print("2. Support vector classification (linear kernel)")
print("3. Support vector classification (RBF kernel)")
print("4. Decision trees")
print("5. Random forests")
print("6. Extra trees")
choice = int(input("Choose model (1-6): "))

# Make prediction
if choice == 1:
    from sklearn import linear_model as lm
    clf = lm.LogisticRegressionCV()
    clf.fit(X, y)
    print("We chose from these C values for CV:", clf.Cs_)
    print("Best C value is:", clf.C_)
    print("Slope is:", clf.coef_)
    print("Intercept is:", clf.intercept_)
    howgood = clf.score(X, y)
    print("In-sample score is: %.5f" % howgood)
    test_y = clf.predict(test_X)
    # print(test_y)
elif choice == 2:
    from sklearn import svm
    from sklearn.model_selection import GridSearchCV  # sklearn.grid_search has been removed
Example #4
    print_image(vector2image(X_train[1,:]), 'Train Cat Example')
    print_image(vector2image(X_train[0,:]), 'Train Dog Example')
    print_image(vector2image(X_test[0,:]), 'Test Example')
    print_image(vector2image(X_test2[0,:]), 'Test2 Example')


#--------------------------------
# II - Train Logistic Regression
#--------------------------------

# we don't train a statsmodels GLM because the data are too heavy for its implementation.

if not os.path.isfile('obj/adv.pkl'):
    # sklearn LR with L2 regularization
    # search for the best C hyperparameter
    lr_l2_CV = linear_model.LogisticRegressionCV(penalty = 'l2', solver='sag', Cs=100,
        random_state = 42, n_jobs=-1)
    # Cs=100 : grid of 100 values
    lr_l2_CV.fit(X_train, y_train)
    bestC = lr_l2_CV.C_[0]
    print('Best C found: {0}'.format(bestC))
    del lr_l2_CV

    # retrain LR with the best C
    # this is the same as above, but currently adversarialLogistic
    # doesn't support linear_model.LogisticRegressionCV
    lr_l2 = linear_model.LogisticRegression(penalty = 'l2', solver='sag', random_state = 42, C=bestC, n_jobs=-1)
    lr_l2.fit(X_train, y_train)

    lr_l2_acc_is = lr_l2.score(X = X_train, y = y_train)
    lr_l2_acc_oos = lr_l2.score(X = X_test, y = y_test)
    print('Accuracy in-sample: {0}'.format(lr_l2_acc_is))
Example #5
clf = GridSearchCV(cv=None,
                   estimator=LogisticRegression(C=1.0,
                                                intercept_scaling=1,
                                                dual=False,
                                                fit_intercept=True,
                                                penalty='l2',
                                                tol=0.0001),
                   param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
clf = clf.fit(X, y)
y_predicted2 = clf.predict(X_test)  #predicted class
cm2 = ConfusionMatrix(y_test, y_predicted2)
cm2.print_stats()
acc2 = accuracy_score(y_test, y_predicted2)
#print(classification_report(y_test, y_predicted))
cmatrix2 = confusion_matrix(y_test, y_predicted2)
ROI2 = cmatrix2[1, 1] * 100 + cmatrix2[0, 0] * 15 + cmatrix2[0, 1] * (
    -15) + cmatrix2[1, 0] * (-30)

##Next I tried cross validation
logisticCV = linear_model.LogisticRegressionCV(
    class_weight='balanced', scoring='roc_auc')  # alternative: scoring='accuracy'
logisticCV = logisticCV.fit(X, y)
y_predicted = logisticCV.predict(X_test)  #predicted class
cm = ConfusionMatrix(y_test, y_predicted)
cm.print_stats()
acc = accuracy_score(y_test, y_predicted)
#print(classification_report(y_test, y_predicted))
cmatrix = confusion_matrix(y_test, y_predicted)
ROI = cmatrix[1, 1] * 100 + cmatrix[0, 0] * 15 + cmatrix[0, 1] * (
    -15) + cmatrix[1, 0] * (-30)
Example #6
y, x = dmatrices(formula, rawdf, return_type="dataframe")
y = y.values.flatten()

logreg = linear_model.LogisticRegression(C=0.1, penalty='l1', solver='liblinear', tol=0.01)  # liblinear supports the L1 penalty

logreg.fit(x, y)
scores = cross_val_score(logreg, x, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(logreg.coef_).flatten()})
nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist()
print(len(nflist))

# feature selection using best model from cross validation and get the best features
fslogreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear')
fslogreg.fit(x, y)

fsmodel = SelectFromModel(fslogreg, prefit=True)
x_new = fsmodel.transform(x)
x_new.shape

coeffdf = pd.DataFrame({'feature': x.columns, 'coeff': np.transpose(fslogreg.coef_).flatten()})
nflist = coeffdf[coeffdf.coeff != 0].feature.values.tolist()
print(len(nflist))
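
# The selected column names can also be read straight off the selector's support mask
# (a small sketch; assumes `x` is the patsy design-matrix DataFrame built above):
selected_features = x.columns[fsmodel.get_support()].tolist()
print(selected_features[:10])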

'''
L2 regularisation on new features
'''
formula = ModelDesc([Term([LookupFactor('rating')])], [Term([LookupFactor(c)]) for c in nflist])
y, x = dmatrices(formula, rawdf, return_type="dataframe")
Example #7
    X = data[:, :-1]
    y = data[:, -1]
    return X, y


# Load the training data
X_train, y_train = read_csv(
    '../datasets-clasificacion/train_countryriskmoodys.csv')

# Load the held-out (generalization) data
X_test, y_test = read_csv(
    '../datasets-clasificacion/test_countryriskmoodys.csv')

# Scale the data
scaler = preprocessing.StandardScaler(with_mean=False).fit(X_train)
X_train_escalado = scaler.transform(X_train)
X_test_escalado = scaler.transform(X_test)

# =================================================================
# Let's try optimizing the cost parameter of the logistic regression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
print(
    'Performance with standardized data and an optimized cost parameter')
regr_cv = linear_model.LogisticRegressionCV(solver='liblinear')
regr_cv.fit(X_train_escalado, y_train)
y_test_predict = regr_cv.predict(X_test_escalado)
accuracy = metrics.accuracy_score(y_test, y_test_predict)
conf_matrix = metrics.confusion_matrix(y_test, y_test_predict)
print('Overall accuracy:\t%0.2f' % (accuracy))
accuracy_per_class(conf_matrix)
Example #8
def get_classifier(x, y):
    reg = linear_model.LogisticRegressionCV(fit_intercept=False,
                                            multi_class='multinomial')
    reg.fit(x, y)
    return reg
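
# A minimal usage sketch for the helper above (hedged; X_demo/y_demo are synthetic,
# illustrative names only):
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=10, n_informative=5,
                                     n_classes=3, random_state=0)
reg_demo = get_classifier(X_demo, y_demo)
print(reg_demo.C_)                      # C chosen per class by the internal cross-validation
print(reg_demo.score(X_demo, y_demo))   # in-sample accuracy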
Example #9
    X_all, names, adjpvals = apply_ttest(
        X_all, names, outfile_root + "adj_pvals_features.txt")

    X_scaled = preprocessing.scale(X_all)

    make_scatter_plots(X_scaled, names, outfile_root + 'scatter_plots.pdf')

    data = np.zeros((3, 4))

    dfper = pd.DataFrame(
        data,
        columns=['accuracy', 'precision', 'recall', 'roc_auc'],
        index=['DT', 'LR', 'RF'])  # performance table

    clf = linear_model.LogisticRegressionCV(refit=True, random_state=PRNG)

    LR_imp, dfper = build_model(clf, X_scaled, Y, names, "LR", dfper,
                                outfile_root + "LR_features.txt")

    clf = tree.DecisionTreeClassifier(class_weight=None,
                                      criterion='gini',
                                      max_depth=None,
                                      max_features=None,
                                      max_leaf_nodes=None,
                                      min_impurity_split=1e-07,
                                      min_samples_leaf=5,
                                      min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      presort=False,
                                      random_state=PRNG,
Example #10
def analyze_logistic(X,
                     y,
                     model,
                     scale_columns,
                     analyze_params=False,
                     balance_outcomes=False):
    """
    Function for doing analysis of logistic regression. Plots cumulative gain, confusion matrix
    and grid search of optimal learning rate/epochs in SGD with k-fold CV (optional).
    Performs scaling of all continuous features in the data set.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - model: classifier exposing fit/predict (and set_eta/set_n_epochs for the optional grid search)
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate and n_epochs in SGD
    - balance_outcomes: boolean, option to balance training data in case of skewed classes
    """

    #split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.1)

    #balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]

        remove_size = len(non_default_inds) - len(default_inds)
        remove_inds = np.random.choice(non_default_inds,
                                       size=remove_size,
                                       replace=False)

        X_train_val = np.delete(X_train_val, remove_inds, axis=0)
        y_train_val = np.delete(y_train_val, remove_inds, axis=0)
    #end if

    #scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(remainder='passthrough',
                               transformers=[('minmaxscaler', minmaxscaler,
                                              scale_columns)])

    #scale only test data at this point (CV scales training/validation)
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:

        #initialize vectors for saving results
        error_scores = pd.DataFrame(
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])
        n_etas = 4
        eta_vals = np.linspace(-1, -4, n_etas)
        n_epoch_vals = np.array([10, 100, 500, 1000])
        n_epochs = len(n_epoch_vals)
        accuracy_scores = np.zeros((n_etas, n_epochs))

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        #perform grid search of best learning rate
        #and number of epochs with k-fold cross-validation
        i = 0
        for eta in eta_vals:
            model.set_eta(10**eta)

            j = 0
            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                #perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)
                accuracy_scores[i, j] = accuracy

                error_scores = pd.concat(
                    [error_scores,
                     pd.DataFrame([{
                         'log eta': eta,
                         'n_epochs': epoch,
                         'mse': mse,
                         'r2': r2,
                         'accuracy': accuracy
                     }])],
                    ignore_index=True)

                #check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch

                j += 1
                #end for epoch
            i += 1
            #end for eta

        #set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        #plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores,
                                   values='accuracy',
                                   index=['log eta'],
                                   columns='n_epochs')
        idx_i = np.where(acc_table == max_accuracy)[0]
        idx_j = np.where(acc_table == max_accuracy)[1]

        fig = plt.figure()
        ax = sns.heatmap(acc_table,
                         annot=True,
                         fmt='.2g',
                         cbar=True,
                         linewidths=1,
                         linecolor='white',
                         cbar_kws={'label': 'Accuracy'})

        ax.add_patch(
            Rectangle((idx_j, idx_i), 1, 1, fill=False, edgecolor='red', lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()
    #end if

    #scale training data
    X_train_val = scaler.transform(X_train_val)

    #pylearn model
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    #sklearn model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    #get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    #predict
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    #get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val,
                                                  pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test,
                                                 pred_test_prob,
                                                 title=None)
    plt.show()

    #plot confusion matrix
    ax1 = plot_confusion_matrix(y_test,
                                pred_test,
                                normalize=True,
                                cmap='Blues',
                                title=' ')
    ax2 = plot_confusion_matrix(y_train_val,
                                pred_train,
                                normalize=True,
                                cmap='Blues',
                                title='Training data')

    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()

    #print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
Example #11
    R = np.array([[np.cos(theta), -np.sin(theta)],
                  [np.sin(theta), np.cos(theta)]])

    xx, yy = np.dot(R, [xx, yy])
    ## scaling
    xx /= max(np.absolute(xx))
    yy /= max(np.absolute(yy))
    ## assign back into X
    X[row, ::2] = xx
    X[row, 1::2] = yy
X = decomposition.PCA().fit_transform(X)

## CREATE THE CLASSIFIER OBJECT WITH CROSS-VALIDATION
Cs = np.linspace(10, 12, 60)
clf = linear_model.LogisticRegressionCV(Cs=Cs,
                                        fit_intercept=True,
                                        max_iter=10000,
                                        n_jobs=-1).fit(X, y)
print(clf.score(X, y))
print(clf.scores_)
print(clf.C_)
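# clf.scores_ maps each class label to an array of per-fold validation scores with
# shape (n_folds, len(Cs)); clf.C_ holds the C value selected for each class.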

## BUILD THE CONFUSION MATRICES
fig, (ax1, ax2) = plt.subplots(2)
fig.suptitle('Confusion matrices (not)normalized')

ax1.set_title('Not normalized')
conf_mat_disp = metrics.plot_confusion_matrix(clf,
                                              X,
                                              y,
                                              display_labels=y_labels,
                                              cmap=plt.cm.Blues,
Example #12
def fit_lasso(X, knockoffs, y, y_dist=None, use_lars=False, **kwargs):

    # Parse some kwargs/defaults
    max_iter = kwargs.pop("max_iter", 500)
    tol = kwargs.pop("tol", 1e-3)
    cv = kwargs.pop("cv", 5)
    if y_dist is None:
        y_dist = parse_y_dist(y)

    # Bind data
    p = X.shape[1]
    features = np.concatenate([X, knockoffs], axis=1)

    # Randomize coordinates to make sure everything is symmetric
    inds, rev_inds = random_permutation_inds(2 * p)
    features = features[:, inds]

    # Fit lasso
    warnings.filterwarnings("ignore")
    if y_dist == "gaussian":
        if not use_lars:
            gl = linear_model.LassoCV(
                alphas=DEFAULT_REG_VALS,
                cv=cv,
                verbose=False,
                max_iter=max_iter,
                tol=tol,
                **kwargs,
            ).fit(features, y)
        elif use_lars:
            gl = linear_model.LassoLarsCV(
                cv=cv,
                verbose=False,
                max_iter=max_iter,
                **kwargs,
            ).fit(features, y)
    elif y_dist == "binomial":
        gl = linear_model.LogisticRegressionCV(
            Cs=1 / DEFAULT_REG_VALS,
            penalty="l1",
            max_iter=max_iter,
            tol=tol,
            cv=cv,
            verbose=False,
            solver="liblinear",
            **kwargs,
        ).fit(features, y)
    else:
        raise ValueError(
            f"y_dist must be one of gaussian, binomial, not {y_dist}")
    warnings.resetwarnings()

    return gl, inds, rev_inds
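
# Hedged usage sketch with synthetic data (illustrative names; assumes the module-level
# helpers referenced above, e.g. DEFAULT_REG_VALS, parse_y_dist and random_permutation_inds,
# are importable alongside fit_lasso):
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 10))
knockoffs_demo = rng.normal(size=(200, 10))
beta_demo = np.zeros(10)
beta_demo[:3] = 1.0
y_demo = X_demo @ beta_demo + rng.normal(size=200)
gl, inds, rev_inds = fit_lasso(X_demo, knockoffs_demo, y_demo, y_dist="gaussian")
print(gl.alpha_)  # regularization strength selected by LassoCV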
Example #13
def fill_sex_and_age():
    clf = linear_model.LogisticRegressionCV()
    fill(file_tf_users, 'sex', 0, file_tf_fs_users, clf)
    fill(file_tf_dd_users, 'sex', 0, file_tf_dd_fs_users, clf)
    fill(file_tf_fs_users, 'age', 0, file_tf_fsa_users, clf)
    fill(file_tf_dd_fs_users, 'age', 0, file_tf_dd_fsa_users, clf)
Example #14
def instanciate_estimators(clf_type, classifiers, clf_seed, y=None, **kw):

    score_metric, _ = get_score_metric(clf_type)
    param_grid_LGBM = {
        'learning_rate': [0.1, .05, .5],
        'num_leaves': [7, 15, 31]
    }
    param_grid_XGB = {'learning_rate': [0.1, .05, .3], 'max_depth': [3, 6, 9]}
    param_grid_MLP = {
        'learning_rate_init': [.001, .0005, .005],
        'hidden_layer_sizes': [(30, ), (50, ), (100, ), (30, 30), (50, 50),
                               (100, 100)]
    }
    param_grid_EigenProGaussian = {'bandwidth': [1, 5, 25]}
    n_components_eigenpro = 160
    param_grid_nystroem_ridgecv = {
        'kernel_approx__n_components': [1000, 3000],
        'kernel_approx__degree': [2, 3],
    }
    if clf_type == 'binary':
        print(('Fraction by class: True: %0.2f; False: %0.2f' %
               (list(y).count(True) / len(y), list(y).count(False) / len(y))))
        cw = 'balanced'
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.LogisticRegressionCV(class_weight=cw,
                                              max_iter=100,
                                              solver='sag',
                                              penalty='l2',
                                              n_jobs=1,
                                              cv=3,
                                              multi_class='multinomial'),
            'GradientBoosting':
            ensemble.GradientBoostingClassifier(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMClassifier(n_estimators=100,
                                                  n_jobs=1,
                                                  is_unbalance=True),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBClassifier(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPClassifier(hidden_layer_sizes=(30, 30),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          power_t=0.5,
                          max_iter=200,
                          shuffle=True,
                          random_state=None,
                          tol=0.0001,
                          verbose=False,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPClassifier(hidden_layer_sizes=(30, 30),
                                                 activation='relu',
                                                 solver='adam',
                                                 alpha=0.0001,
                                                 batch_size='auto',
                                                 learning_rate='adaptive',
                                                 learning_rate_init=0.001,
                                                 power_t=0.5,
                                                 max_iter=200,
                                                 shuffle=True,
                                                 random_state=None,
                                                 tol=0.0001,
                                                 verbose=False,
                                                 warm_start=False,
                                                 momentum=0.9,
                                                 nesterovs_momentum=True,
                                                 early_stopping=False,
                                                 validation_fraction=0.1,
                                                 beta_1=0.9,
                                                 beta_2=0.999,
                                                 epsilon=1e-08,
                                                 n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            FKC_EigenPro(batch_size="auto",
                         n_epoch=10,
                         n_components=n_components_eigenpro,
                         subsample_size="auto",
                         kernel="polynomial",
                         bandwidth=5,
                         gamma=None,
                         degree=2,
                         coef0=1,
                         kernel_params=None,
                         random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=FKC_EigenPro(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=FKC_EigenPro(batch_size="auto",
                                                n_epoch=10,
                                                n_components=1000,
                                                subsample_size="auto",
                                                kernel="gaussian",
                                                gamma=None,
                                                degree=2,
                                                coef0=1,
                                                kernel_params=None,
                                                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([
                ('kernel_approx',
                 Nystroem(kernel="polynomial",
                          n_components=None,
                          random_state=clf_seed,
                          degree=2)),
                ('classifier',
                 linear_model.LogisticRegressionCV(class_weight=cw,
                                                   max_iter=100,
                                                   solver='sag',
                                                   penalty='l2',
                                                   n_jobs=1,
                                                   cv=3,
                                                   multi_class='multinomial'))
            ]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }

    elif clf_type == 'multiclass':
        print('fraction of the most frequent class:',
              max([list(y).count(x) for x in set(list(y))]) / len(list(y)))
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.LogisticRegressionCV(penalty='l2',
                                              n_jobs=1,
                                              cv=3,
                                              multi_class='multinomial',
                                              solver='sag',
                                              max_iter=100),
            'GradientBoosting':
            ensemble.GradientBoostingClassifier(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMClassifier(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBClassifier(n_estimators=100,
                                                 n_jobs=1,
                                                 objective='multi:softmax',
                                                 num_class=len(np.unique(y))),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPClassifier(hidden_layer_sizes=(30, 30),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          power_t=0.5,
                          max_iter=200,
                          shuffle=True,
                          random_state=None,
                          tol=0.0001,
                          verbose=False,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPClassifier(hidden_layer_sizes=(30, 30),
                                                 activation='relu',
                                                 solver='adam',
                                                 alpha=0.0001,
                                                 batch_size='auto',
                                                 learning_rate='adaptive',
                                                 learning_rate_init=0.001,
                                                 power_t=0.5,
                                                 max_iter=200,
                                                 shuffle=True,
                                                 random_state=None,
                                                 tol=0.0001,
                                                 verbose=False,
                                                 warm_start=False,
                                                 momentum=0.9,
                                                 nesterovs_momentum=True,
                                                 early_stopping=False,
                                                 validation_fraction=0.1,
                                                 beta_1=0.9,
                                                 beta_2=0.999,
                                                 epsilon=1e-08,
                                                 n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            FKC_EigenPro(batch_size="auto",
                         n_epoch=10,
                         n_components=n_components_eigenpro,
                         subsample_size="auto",
                         kernel="polynomial",
                         gamma=None,
                         degree=2,
                         coef0=1,
                         kernel_params=None,
                         random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=FKC_EigenPro(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=FKC_EigenPro(batch_size="auto",
                                                n_epoch=10,
                                                n_components=1000,
                                                subsample_size="auto",
                                                kernel="gaussian",
                                                gamma=None,
                                                degree=2,
                                                coef0=1,
                                                kernel_params=None,
                                                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([
                ('kernel_approx',
                 Nystroem(kernel="polynomial",
                          n_components=None,
                          random_state=clf_seed,
                          degree=2)),
                ('classifier',
                 linear_model.LogisticRegressionCV(penalty='l2',
                                                   n_jobs=1,
                                                   cv=3,
                                                   multi_class='multinomial',
                                                   solver='sag',
                                                   max_iter=100))
            ]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }
    elif clf_type == 'regression':
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.RidgeCV(cv=3),
            'GradientBoosting':
            ensemble.GradientBoostingRegressor(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMRegressor(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBRegressor(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPRegressor(hidden_layer_sizes=(30, 30),
                         activation='relu',
                         solver='adam',
                         alpha=0.0001,
                         batch_size='auto',
                         learning_rate='constant',
                         learning_rate_init=0.001,
                         power_t=0.5,
                         max_iter=200,
                         shuffle=True,
                         random_state=None,
                         tol=0.0001,
                         verbose=False,
                         warm_start=False,
                         momentum=0.9,
                         nesterovs_momentum=True,
                         early_stopping=False,
                         validation_fraction=0.1,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPRegressor(hidden_layer_sizes=(30, 30),
                                                activation='relu',
                                                solver='adam',
                                                alpha=0.0001,
                                                batch_size='auto',
                                                learning_rate='adaptive',
                                                learning_rate_init=0.001,
                                                power_t=0.5,
                                                max_iter=200,
                                                shuffle=True,
                                                random_state=None,
                                                tol=0.0001,
                                                verbose=False,
                                                warm_start=False,
                                                momentum=0.9,
                                                nesterovs_momentum=True,
                                                early_stopping=False,
                                                validation_fraction=0.1,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-08,
                                                n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            FKR_EigenPro(batch_size="auto",
                         n_epoch=10,
                         n_components=n_components_eigenpro,
                         subsample_size="auto",
                         kernel="polynomial",
                         bandwidth=5,
                         gamma=None,
                         degree=2,
                         coef0=1,
                         kernel_params=None,
                         random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=FKR_EigenPro(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=FKR_EigenPro(batch_size="auto",
                                                n_epoch=10,
                                                n_components=1000,
                                                subsample_size="auto",
                                                kernel="gaussian",
                                                gamma=None,
                                                degree=2,
                                                coef0=1,
                                                kernel_params=None,
                                                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([('kernel_approx',
                                              Nystroem(kernel="polynomial",
                                                       n_components=None,
                                                       random_state=clf_seed,
                                                       degree=2)),
                                             ('classifier',
                                              linear_model.RidgeCV(cv=3))]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }
    else:
        raise ValueError("{} not recognized".format(clf_type))

    clfs = [clfs[clf] for clf in classifiers]
    for clf in clfs:
        try:
            if 'random_state' in clf.estimator.get_params():
                clf.estimator.set_params(random_state=clf_seed)
        except AttributeError:
            if 'random_state' in clf.get_params():
                clf.set_params(random_state=clf_seed)
    return clfs
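
# Hedged call sketch (illustrative names; running it requires this module's helpers such as
# get_score_metric plus the optional dependencies lightgbm, xgboost and the EigenPro
# estimators, since every entry of the dictionary is instantiated up front):
import numpy as np

y_demo = np.random.RandomState(0).rand(500) > 0.7   # boolean targets for the 'binary' branch
estimators = instanciate_estimators('binary',
                                    ['L2RegularizedLinearModel', 'GradientBoosting'],
                                    clf_seed=42, y=y_demo)
for est in estimators:
    print(type(est).__name__)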
Example #15
def main():
    parser = argparse.ArgumentParser(
        description='Extract features and run models')
    parser.add_argument('--classifier',
                        dest='classifier_type',
                        help='lr svm ffn',
                        default='')
    parser.add_argument(
        '--name',
        dest='model_name',
        help=
        'model name base, automatically appends experiment features and classifier, None just puts classifier and features',
        default=None)
    parser.add_argument('--feature',
                        dest='feature',
                        help='feature type',
                        default='user')
    parser.add_argument(
        '--output-dirpath',
        dest='output_dirpath',
        help='output dirpath; default /projects/websci2020_tumblr_identity',
        default='/projects/websci2020_tumblr_identity')
    args = parser.parse_args()

    feature_type = args.feature
    output_dirpath = args.output_dirpath

    # Classifier definitions
    classifiers = {
        'lr':
        linear_model.LogisticRegressionCV(cv=10,
                                          n_jobs=10,
                                          max_iter=10000,
                                          verbose=0),
        'svm':
        model_selection.GridSearchCV(svm.LinearSVC(dual=False,
                                                   max_iter=10000,
                                                   verbose=0),
                                     {
                                         'C': [.01, .1, 1, 10, 100],
                                         'penalty': ['l2']
                                     },
                                     n_jobs=10,
                                     cv=10,
                                     verbose=2),
        'ffn':
        neural_network.MLPClassifier(hidden_layer_sizes=(32, 50),
                                     activation='relu',
                                     early_stopping=True,
                                     verbose=2)
    }

    # ### Post baseline
    print("Extracting post baseline features...")
    X_train, y_train, X_test, y_test = extract_features(feature_type)
    clf = classifiers[args.classifier_type]

    print("Running post baseline...")
    if args.model_name is None:
        model_name = f'baseline_{args.classifier_type}'
    else:
        model_name = f'{args.model_name}_{args.classifier_type}'

    model, score, baseline_preds = run_model(model_name, clf, X_train, y_train,
                                             X_test, y_test, feature_type,
                                             output_dirpath)
    print(f'\tBaseline score: {score: .4f}')
Example #16
def plot_ROC(X, y, outfile_name, PRNG):

    "ROC curve summary for the classifiers - with cross-validation"

    cv = StratifiedKFold(n_splits=10)

    clf1 = tree.DecisionTreeClassifier(
        class_weight=None,
        criterion='gini',
        max_depth=None,
        max_features=None,
        max_leaf_nodes=None,
        min_impurity_split=1e-07,
        min_samples_leaf=5,  ## min 5 in each leaf
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
        presort=False,
        random_state=PRNG,
        splitter='best')  # min_samples_split is common in literature

    clf2 = linear_model.LogisticRegressionCV(refit=True, random_state=PRNG)

    clf3 = RandomForestClassifier(n_estimators=1000, random_state=PRNG)

    clfnames = ['DT', 'LR', 'RF']

    colors = cycle(['blue', 'green', 'darkorange'])

    plt.figure(figsize=(4, 3), dpi=300)

    for clf, cnames, color in zip([clf1, clf2, clf3], clfnames, colors):

        mean_tpr = 0.0
        mean_fpr = np.linspace(0, 1, 100)

        lw = 1.5

        i = 0
        for (train, test), color in zip(cv.split(X, y), colors):

            probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])

            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = metrics.roc_curve(y[test], probas_[:, 1])
            mean_tpr += interp(mean_fpr, fpr, tpr)
            mean_tpr[0] = 0.0
            roc_auc = metrics.auc(fpr, tpr)

            i += 1

        mean_tpr /= cv.get_n_splits(X, y)
        mean_tpr[-1] = 1.0
        mean_auc = metrics.auc(mean_fpr, mean_tpr)
        plt.plot(mean_fpr,
                 mean_tpr,
                 color=color,
                 label='%s (%0.2f)' % (cnames, mean_auc),
                 lw=lw)

    font = {'size': 12, 'weight': 'normal', 'family': 'sans-serif'}

    plt.rc('font', **font)

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('FPR')
    plt.ylabel('TPR')

    plt.title("")
    plt.legend(loc="lower right", prop={'size': 12})
    plt.savefig(outfile_name)
Example #17

#df[df.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
#df.describe()


# In[345]:


#Creation of the model
X = np.array(df.drop(['double'], axis=1))
y = np.array(df['double'])

X_val = np.array(df_val.drop(['double'], axis=1))
y_val = np.array(df_val['double'])
model = linear_model.LogisticRegressionCV(Cs=100, class_weight={0:2.3, 1:3.7}, cv=10, random_state=7)
model.fit(X,y)


# In[346]:


#Store the model and print the confusion matrix
joblib.dump(model, 'model.pkl')
y_pred = model.predict(X_val)
cm = metrics.confusion_matrix(y_val, y_pred)
print(cm)


# In[347]:
Example #18
def logistic_deconvolution(estimation_train,
                           estimation_test,
                           stimuli_train,
                           stimuli_test,
                           logistic_window,
                           delay=0):
    """
    Learn a deconvolution filter for classification given a time window
    using logistic regression.

    Parameters
    ----------

    estimation_train: numpy array of shape [n_scans_train, n_categories]
        estimation of the categories time series for the train data

    estimation_test: numpy array of shape [n_scans_test, n_categories]
        estimation of the categories time series for the test data

    stimuli_train: numpy array of shape [n_scans_train, n_categories]
        time series of the train stimuli with one-hot encoding

    stimuli_test: numpy array of shape [n_scans_test, n_categories]
        time series of the test stimuli with one-hot encoding

    logistic_window: int
        size of the time window to be used for creating train and test data

    delay: int, optional
        delay between time series and stimuli to be applied to the data.
        Defaults to 0.

    Returns
    -------

    accuracy: float
        mean classification accuracy of the fitted model on the test time windows
    """

    log = linear_model.LogisticRegressionCV()

    # Add a delay between time series and stimuli if needed
    if delay != 0:
        estimation_train, estimation_test = (estimation_train[delay:],
                                             estimation_test[delay:])
        stimuli_train, stimuli_test = (stimuli_train[:-delay],
                                       stimuli_test[:-delay])

    # Create train and test masks for the stimuli (i.e. no 'rest' category)
    train_mask = np.sum(stimuli_train[:, 1:], axis=1).astype(bool)
    test_mask = np.sum(stimuli_test[:, 1:], axis=1).astype(bool)

    # Create train and test time windows
    time_windows_train = [
        estimation_train[scan:scan + logistic_window].ravel()
        for scan in range(len(estimation_train) - logistic_window + 1)
        if train_mask[scan]
    ]
    time_windows_test = [
        estimation_test[scan:scan + logistic_window].ravel()
        for scan in range(len(estimation_test) - logistic_window + 1)
        if test_mask[scan]
    ]

    # Create train and test stimuli labels
    stimuli_train = np.argmax(stimuli_train[train_mask], axis=1)
    stimuli_test = np.argmax(stimuli_test[test_mask], axis=1)

    # Fit logistic regression
    log.fit(time_windows_train, stimuli_train)
    accuracy = log.score(time_windows_test, stimuli_test)

    return accuracy
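
# Hedged synthetic-data sketch for the function above. Note that the labels are taken from
# the full stimulus mask while the time windows stop logistic_window - 1 scans early, so this
# sketch forces the trailing scans to the 'rest' category (column 0) to keep both arrays aligned.
import numpy as np

rng = np.random.default_rng(0)
n_scans, n_categories, window = 120, 4, 5
labels_train = rng.integers(0, n_categories, n_scans)
labels_test = rng.integers(0, n_categories, n_scans)
labels_train[-(window - 1):] = 0
labels_test[-(window - 1):] = 0
stim_train = np.eye(n_categories)[labels_train]
stim_test = np.eye(n_categories)[labels_test]
est_train = stim_train + 0.5 * rng.normal(size=stim_train.shape)
est_test = stim_test + 0.5 * rng.normal(size=stim_test.shape)
print(logistic_deconvolution(est_train, est_test, stim_train, stim_test,
                             logistic_window=window))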
Example #19
        regression(linear_model.LassoLars()),
        regression(linear_model.LassoLarsIC()),
        regression(linear_model.OrthogonalMatchingPursuit()),
        regression(linear_model.OrthogonalMatchingPursuitCV()),
        regression(linear_model.Ridge(random_state=RANDOM_SEED)),
        regression(linear_model.RidgeCV()),
        regression(linear_model.BayesianRidge()),
        regression(linear_model.ARDRegression()),
        regression(linear_model.SGDRegressor(random_state=RANDOM_SEED)),
        regression(linear_model.PassiveAggressiveRegressor(
            random_state=RANDOM_SEED)),

        # Logistic Regression
        classification(linear_model.LogisticRegression(
            random_state=RANDOM_SEED)),
        classification(linear_model.LogisticRegressionCV(
            random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifierCV()),
        classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),

        classification_binary(linear_model.LogisticRegression(
            random_state=RANDOM_SEED)),
        classification_binary(linear_model.LogisticRegressionCV(
            random_state=RANDOM_SEED)),
        classification_binary(linear_model.RidgeClassifier(
            random_state=RANDOM_SEED)),
        classification_binary(linear_model.RidgeClassifierCV()),
        classification_binary(linear_model.SGDClassifier(
            random_state=RANDOM_SEED)),

        # Decision trees
Example #20
    if_else('max_dtsl', '>', 10).if_else('max_dxcs', '>', 3).if_else('max_dxmj', '>', 1000).\
    if_else('max_jzgd', '>', 100).if_else('max_jznl', '<', 0).if_else('max_jznl', '>', 10000).\
    if_else('max_rzsl', '>', 50).if_else('max_zdmj', '>', 10000).if_else('ssdts', '>', 10).if_else('xfcds', '>', 10).\
    if_else('xfdts', '>', 10).if_else('yhsl', '>', 300).if_else('zgrs', '>', 1000).if_else('zgsl', '>', 300).\
    fill_na('zjhzsj', 20010101000000).fill_na('zjjcsj', 20010101000000).if_else('zjyhsl', '>', 10).\
    if_else('zjzgsl', '>', 10).date_diff('zjhzsj', deadline).date_diff('zjjcsj', deadline).\
    col_diff_if_else('yhsl', 'zgsl').col_diff_if_else('zjyhsl', 'zjzgsl')
# Extract the required data columns
fire_data = fire_data.fire_data.ix[:, [
    'dwid', 'Y', 'aqsks', 'dwdj_1', 'dwdj_2', 'dwxz_1', 'dwxz_2', 'dwxz_3',
    'dwxz_4', 'hzsl', 'jcsl', 'jzmj', 'jzsl', 'sfgpdw', 'sfzdyhdw', 'ssdts',
    'xfcds', 'xfdts', 'yhsl', 'zdxfss', 'zgrs', 'zjyhsl', 'zjzgsl',
    'hzts_to_deadline', 'jcts_to_deadline', 'yhsl_minus_zgsl',
    'zjyhsl_minus_zjzgsl', 'zddw', 'ybdw', 'jxdw', 'wxp', 'max_jzzt',
    'max_jznl', 'max_jzgd', 'max_zdmj', 'max_dscs', 'max_dsmj', 'max_dxcs',
    'max_dxmj', 'max_nhdj', 'max_rnrs', 'max_dtsl', 'max_xfkzs', 'max_rzsl',
    'max_xfsssl'
]].fillna(0)
# Run model: elastic net
# enet = lm.ElasticNetCV(l1_ratio=1, cv=10, n_jobs=1)  # l1_ratio=1 corresponds to Lasso regression
# enet.fit(X=fire_data.ix[:, 2:], y=fire_data.ix[:, 1])
# joblib.dump(enet, model_path+'/fire_risk_model_enet.pkl')

# Run model: logistic regression
lgr = lm.LogisticRegressionCV(cv=10,
                              penalty='l1',
                              solver='liblinear',
                              n_jobs=1)
lgr.fit(X=fire_data.ix[:, 2:], y=fire_data.ix[:, 1])
joblib.dump(lgr, model_path + '/fire_risk_model_lgr.pkl')
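
# Later runs can reload the persisted model for scoring (sketch; `joblib` here is whatever
# joblib import this script already uses):
lgr_loaded = joblib.load(model_path + '/fire_risk_model_lgr.pkl')
print(lgr_loaded.C_)  # C selected by the 10-fold CV above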
Example #21
def compare_algorithm(data, target):
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]
    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(
            alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(
            alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precision'] = precision_score(
            y_cross, predicted)
        MLA_compare.loc[row_index,
                        'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1

    MLA_compare.sort_values(by=['MLA Test Accuracy'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
Example #22
"""

import sklearn as sk
import sklearn.linear_model as skl
import sklearn.preprocessing as skp
import sklearn.datasets as skd
import sklearn.decomposition as skD
import numpy as np
import kk_utils as kk

dataset = skd.load_breast_cancer()

linreg = skl.LinearRegression()
rreg = skl.RidgeCV()
lasreg = skl.LassoCV()
logreg = skl.LogisticRegressionCV()

scores = []
'''
# Experimental Part
for n_components in range(60,65):
    print('Number of components:',n_components)
    pca = pca.set_params(n_components=n_components)    
    X = dataset.data
    X = kk.MeanNormalizer(X)
    X = pca.fit_transform(X)
    score,best_cv = kk.fitModel(linreg, X, Y, cv=True, ncv = 10)
    cvs.append(best_cv)
    scores.append(score)
   '''   
# BEST SCORE CLASSIFICATION
    loss_clf_test=round(hamming_loss(y_test,y_clf_test),4)
    loss_train.append(loss_clf_train); loss_test.append(loss_clf_test)    
    return [y_clf_train,y_clf_test,acc_clf_train,
            acc_clf_test,loss_clf_train,loss_clf_test]
def get_classifier_results():
    return pandas.DataFrame({'classifier':classifier_list,
                             'classifier_name':classifier_names,
                             'clf_dataset':clf_datasets,
                             'acc_train':acc_train,'acc_test':acc_test,
                             'loss_train':loss_train,'loss_test':loss_test})

classifier_list,classifier_names,clf_datasets=[],[],[]
acc_train,acc_test,loss_train,loss_test=[],[],[],[]
df_list=['classifier_name','acc_train','acc_test','loss_train','loss_test']
clf=[linear_model.LogisticRegression(solver='liblinear',multi_class='ovr'),
     linear_model.LogisticRegressionCV(solver='liblinear',multi_class='ovr'),
     linear_model.SGDClassifier(max_iter=1000,tol=0.00001),
     linear_model.RidgeClassifier(),linear_model.RidgeClassifierCV(),
     LinearDiscriminantAnalysis(),QuadraticDiscriminantAnalysis(),
     svm.LinearSVC(),svm.SVC(gamma='scale',C=10.0,kernel='poly'),
     svm.NuSVC(gamma='scale',kernel='poly'),
     KNeighborsClassifier(),RadiusNeighborsClassifier(radius=30),
     NearestCentroid(),
     DecisionTreeClassifier(),ExtraTreeClassifier(),GaussianNB(),
     BernoulliNB(),MultinomialNB(),
     BaggingClassifier(),RandomForestClassifier(n_estimators=64),
     AdaBoostClassifier(),GradientBoostingClassifier(),
     linear_model.Perceptron(max_iter=1000,tol=0.00001),
     linear_model.PassiveAggressiveClassifier(max_iter=1000,tol=0.00001),
     GaussianProcessClassifier(),LabelPropagation(),LabelSpreading()]
Example #24
def trial_dataparams(data, target):
    random_state = 42

    X, y = data, target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state, stratify=y)

    kfolds = StratifiedKFold(n_splits=5,
                             shuffle=True,
                             random_state=random_state)
    scoring = {
        'Precision': make_scorer(precision_score),
        'Recall': make_scorer(recall_score),
        'F1_score': make_scorer(f1_score),
        'Accuracy': make_scorer(accuracy_score)
    }

    cls_balanced = [
        ('dtc',
         tree.DecisionTreeClassifier(class_weight='balanced',
                                     random_state=random_state)),
        ('rfc',
         ensemble.RandomForestClassifier(n_estimators=100,
                                         class_weight='balanced',
                                         random_state=random_state)),
        ('lr',
         linear_model.LogisticRegressionCV(class_weight='balanced',
                                           random_state=random_state)),
        ('svc',
         svm.SVC(probability=True,
                 class_weight='balanced',
                 random_state=random_state)),
        ('xgb',
         XGBClassifier(n_estimators=100,
                       objective='binary:logistic',
                       scale_pos_weight=13,
                       random_state=random_state))
    ]

    cls = [('dtc', tree.DecisionTreeClassifier(random_state=random_state)),
           ('bc',
            ensemble.BaggingClassifier(n_estimators=100,
                                       random_state=random_state)),
           ('gbc',
            ensemble.GradientBoostingClassifier(n_estimators=100,
                                                random_state=random_state)),
           ('rfc',
            ensemble.RandomForestClassifier(n_estimators=100,
                                            random_state=random_state)),
           ('lr',
            linear_model.LogisticRegressionCV(random_state=random_state)),
           ('knn', neighbors.KNeighborsClassifier()),
           ('svc', svm.SVC(probability=True, random_state=random_state)),
           ('xgb',
            XGBClassifier(n_estimators=100,
                          objective='binary:logistic',
                          random_state=random_state))]

    dfout = pd.DataFrame()
    for scaler in [StandardScaler(), RobustScaler(quantile_range=(2.5, 97.5))]:
        for pipelinesteps in [get_pipe_pca(scaler), get_pipe_nopca(scaler)]:
            for cls_train in [cls, cls_balanced]:
                dfout_i = train_cls(cls_train,
                                    pipelinesteps,
                                    X_train=X_train,
                                    y_train=y_train,
                                    kfolds=kfolds,
                                    scoring=scoring)
                dfout = pd.concat([dfout, dfout_i], ignore_index=True)
    dfout = dfout.sort_values(by='CV F1 Mean', ascending=False)
    dfout.to_csv(PROCESSED + os.sep + 'trial_dataparams_.csv',
                 index=False,
                 sep=';')
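# get_pipe_pca, get_pipe_nopca and train_cls are defined elsewhere and not shown in this
# excerpt; a plausible minimal version of train_cls, for illustration only, assuming
# pipelinesteps is a list of (name, transformer) tuples and that the 'CV F1 Mean' column
# used for sorting above is the mean of the F1 scorer:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate


def train_cls(cls_list, pipelinesteps, X_train, y_train, kfolds, scoring):
    rows = []
    for name, estimator in cls_list:
        pipe = Pipeline(steps=list(pipelinesteps) + [(name, estimator)])
        cvres = cross_validate(pipe, X_train, y_train, cv=kfolds, scoring=scoring)
        rows.append({
            'Classifier': name,
            'CV F1 Mean': cvres['test_F1_score'].mean(),
            'CV Accuracy Mean': cvres['test_Accuracy'].mean(),
            'CV Precision Mean': cvres['test_Precision'].mean(),
            'CV Recall Mean': cvres['test_Recall'].mean(),
        })
    return pd.DataFrame(rows)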
#grid_knn=grid(neighbors.KNeighborsClassifier()).grid_get(x_train_c,y_train,knn_grid)
#grid_forest=grid(RandomForestClassifier()).grid_get(x_train_c,y_train,forest_grid)
#grid_dtree=grid(tree.DecisionTreeClassifier()).grid_get(x_train_c,y_train,dtree_grid)
#grid_lrc=grid(linear_model.LogisticRegressionCV()).grid_get(x_train_c,y_train,lrc_grid)
#grid_rc=grid(linear_model.RidgeClassifierCV()).grid_get(x_train_c,y_train,rc_grid)

# In[66]:

svc = svm.SVC(C=5, gamma=1e-05, kernel='linear')
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree',
                                     n_neighbors=6,
                                     weights='distance')
dtree = tree.DecisionTreeClassifier(criterion='gini',
                                    min_samples_split=0.05,
                                    random_state=0)
lrc = linear_model.LogisticRegressionCV(Cs=1000)
#rc=linear_model.RidgeClassifierCV(grid_rc)
forest = RandomForestClassifier(criterion='gini', max_depth=8, n_estimators=80)
bayes = naive_bayes.GaussianNB()
models = [svc, forest]
meta_model = knn

# In[67]:

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


class stacking(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, mod, meta_model):
        self.mod = mod
        self.meta_model = meta_model
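    # The excerpt stops after __init__; a minimal sketch of the usual fit/predict pair
    # for such a stacker (out-of-fold base-model predictions become the meta-model's
    # features). This is an assumed completion, not the original code, and it presumes
    # `numpy as np` and `from sklearn.model_selection import cross_val_predict` are
    # imported in the full script.
    def fit(self, X, y):
        self.saved_models_ = [clone(m).fit(X, y) for m in self.mod]
        oof = np.column_stack([
            cross_val_predict(clone(m), X, y, cv=5) for m in self.mod
        ])
        self.meta_model_ = clone(self.meta_model).fit(oof, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([m.predict(X) for m in self.saved_models_])
        return self.meta_model_.predict(meta_features)


# hypothetical usage with the estimators defined above:
#   stack = stacking(mod=models, meta_model=meta_model)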
Exemplo n.º 26
0
print("Test AUC:%.2f; bACC:%.2f, Time: %.2fs" %
      (scores['test_roc_auc'].mean(), scores['test_balanced_accuracy'].mean(),
       scores['fit_time'].sum()))

# %%
# Models with built-in cross-validation
# --------------------------------------
#
# Let sklearn select the best parameters over a default grid.
#
# **Classification**

print("== Logistic Ridge (L2 penalty) ==")
mod_cv = lm.LogisticRegressionCV(class_weight='balanced',
                                 scoring='balanced_accuracy',
                                 n_jobs=-1,
                                 cv=5)
scores = cross_val_score(estimator=mod_cv, X=X, y=y, cv=5)
print("Test  ACC:%.2f" % scores.mean())

# %%
# **Regression**

X, y, coef = datasets.make_regression(n_samples=50,
                                      n_features=100,
                                      noise=10,
                                      n_informative=2,
                                      random_state=42,
                                      coef=True)

print("== Ridge (L2 penalty) ==")
Exemplo n.º 27
0
# In[ ]:

MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),

    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
def auto_model(X, y, X_pred, sub):

    models = {
        'ridge  ':
        linear_model.Ridge(alpha=.5, max_iter=1e8),
        'ridgeCV':
        linear_model.RidgeCV(cv=3),
        'lasso  ':
        linear_model.Lasso(alpha=1e-6, max_iter=1e8),
        'lr     ':
        linear_model.LogisticRegression(solver='lbfgs', max_iter=1e4),
        'lrCV   ':
        linear_model.LogisticRegressionCV(solver='lbfgs', max_iter=1e4, cv=5),
        'mlp_clf':
        neural_network.MLPClassifier(solver='lbfgs',
                                     alpha=1e-5,
                                     hidden_layer_sizes=(256, 64, 32, 32, 32),
                                     random_state=1),
        'mlp_reg':
        neural_network.MLPRegressor(solver='lbfgs',
                                    alpha=1e-5,
                                    hidden_layer_sizes=(256, 64, 32, 32, 32),
                                    random_state=1),
        'svc    ':
        svm.SVC(),
        'rfreg  ':
        ensemble.RandomForestRegressor(max_depth=4),
        'rfclf  ':
        ensemble.RandomForestClassifier(max_depth=4),
        'lgbclf ':
        lgb.LGBMClassifier(num_leaves=4,
                           learning_rate=0.001,
                           n_estimators=2000),
        'lgbreg ':
        lgb.LGBMRegressor(num_leaves=31,
                          learning_rate=0.001,
                          n_estimators=20000),
        'knn    ':
        neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=15),
        'nb     ':
        naive_bayes.GaussianNB(),
        'dt     ':
        tree.DecisionTreeClassifier(),
        #        'catreg ': CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=3, verbose = True),
        #        'catclf ': CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=2, verbose = True),
    }

    print('\nall models: ', list(models.keys()))

    # In[all scalers]
    from sklearn import preprocessing
    from sklearn import feature_selection
    from sklearn.decomposition import PCA

    preprocessings = {
        'standards': preprocessing.StandardScaler(),
        'minmaxs': preprocessing.MinMaxScaler(),
        'robusts': preprocessing.RobustScaler(),
        'PCA': PCA(),
        'normalizer': preprocessing.Normalizer(),
        'variance_threshold':
        feature_selection.VarianceThreshold(threshold=0.5),
    }

    print('all preprocessings: ', list(preprocessings.keys()), '\n')

    # In[Preprocessing Pipline]
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import KFold
    nfolds = 5

    kf = KFold(n_splits=nfolds, random_state=2019, shuffle=True)

    pipe_preprocessing = make_pipeline(
        #                        preprocessings['variance_threshold'],
        preprocessings['standards'],
        #                         preprocessings['minmaxs'],
        #                         preprocessings['robusts'],
        #                         preprocessings['PCA'],
        #                         preprocessings['normalizer'],
    )

    full = np.concatenate((X, X_pred), axis=0)
    pipe_preprocessing.fit(full)
    X = pipe_preprocessing.transform(X)
    X_pred = pipe_preprocessing.transform(X_pred)
    #
    mse_score = []
    auc_score = []
    valid_score = {}
    oof = y * 0
    idx2 = 0
    X_s = pd.DataFrame()
    X_s_preds = pd.DataFrame()

    for idx, model in enumerate(models.items()):

        model = model[1]

        try:
            print('training in: ', list(models.keys())[idx])

            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                if (list(models.keys())[idx] == 'lgbclf ') | (list(
                        models.keys())[idx] == 'lgbreg '):
                    model.fit(X_train,
                              y_train,
                              eval_set=[(X_test, y_test)],
                              eval_metric='l1',
                              early_stopping_rounds=5,
                              verbose=0)
                elif (list(models.keys())[idx] == 'catreg ') | (list(
                        models.keys())[idx] == 'catclf '):
                    model.fit(X_train, y_train, eval_set=(X_test, y_test))
                else:
                    model.fit(X_train, y_train)

                oof[test_index] = model.predict(X_test)

            mse_score.append(mean_squared_error(y, oof))
            try:
                auc_score.append(roc_auc_score(y, oof))
            except:
                auc_score.append(0)

            X_s[list(models.keys())[idx]] = oof
            X_s_preds[list(models.keys())[idx]] = model.predict(X_pred)

            valid_score.update({
                list(models.keys())[idx]: [
                    'mse: ', "{0:.4f}".format(mse_score[idx2]), 'auc: ',
                    "{0:.4f}".format(auc_score[idx2])
                ]
            })

            idx2 += 1

        except:
            print('error     in: ', list(models.keys())[idx])
            print('-------------------------------------------')

    # In[Keras NN]
    X_s0 = np.asarray(X)
    X_s0_preds = np.asarray(X_pred)

    oof = np.zeros(len(X_s0))
    predictions = np.asarray(sub.target * 0.)

    pipe_preprocessing = make_pipeline(
        preprocessings['minmaxs'],
        preprocessings['normalizer'],
    )

    full = np.concatenate((X_s0, X_s0_preds), axis=0)
    pipe_preprocessing.fit(full)
    X = pipe_preprocessing.transform(X_s0)
    X_pred = pipe_preprocessing.transform(X_s0_preds)
    #
    print('\nstart kerasNN ... ')

    for fold_, (train_index, test_index) in enumerate(kf.split(X)):

        model = Sequential([
            Dense(256, input_shape=(X_s0.shape[1], )),
            Activation('relu'),
            Dense(128),
            Activation('relu'),
            Dense(64),
            Activation('relu'),
            Dense(32),
            Activation('relu'),
            Dense(32),
            Activation('relu'),
            Dense(1),
            Activation('sigmoid'),
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy')

        file_path = "NN_ml_" + "_model_" + "loop_" + str(fold_) + ".hdf5"

        X_tr, X_val = X[train_index], X[test_index]  # use the preprocessed features rather than the raw X_s0
        y_tr, y_val = y[train_index], y[test_index]

        callbacks = [
            EarlyStopping(monitor='val_loss', mode='min', patience=20),
            ModelCheckpoint(filepath=file_path,
                            monitor='val_loss',
                            mode='min',
                            save_best_only=True), lr_reduced
        ]

        model.fit(X_tr,
                  y_tr,
                  epochs=750,
                  batch_size=512,
                  callbacks=callbacks,
                  shuffle=True,
                  validation_data=(X_val, y_val),
                  verbose=1)

        model.load_weights(file_path)

        oof[test_index] = np.ndarray.flatten(model.predict(X_val))
        predictions += np.ndarray.flatten(model.predict(X_pred))  # preprocessed prediction features

    predictions /= nfolds

    X_s['keras_nn'] = oof
    X_s_preds['keras_nn'] = predictions
    oof_k = np.asarray(oof)

    # In[Stacking]
    X_s = np.asarray(X_s)
    X_s_preds = np.asarray(X_s_preds)

    oof = np.zeros(len(X_s))
    predictions = np.asarray(sub.target * 0.)

    pipe_preprocessing = make_pipeline(preprocessings['minmaxs'], )

    full = np.concatenate((X_s, X_s_preds), axis=0)
    pipe_preprocessing.fit(full)
    X = pipe_preprocessing.transform(X_s)
    X_pred = pipe_preprocessing.transform(X_s_preds)
    #
    print('\nstart stacking... ')

    for fold_, (train_index, test_index) in enumerate(kf.split(X)):

        model = Sequential([
            Dense(512, input_shape=(X_s.shape[1], )),
            Activation('linear'),
            Dense(1),
            Activation('sigmoid'),
        ])

        model.compile(optimizer='adam', loss='binary_crossentropy')

        file_path = "NN_ml_" + "_model_" + "loop_" + str(fold_) + ".hdf5"

        X_tr, X_val = X[train_index], X[test_index]  # use the scaled stacking features rather than raw X_s
        y_tr, y_val = y[train_index], y[test_index]

        callbacks = [
            EarlyStopping(monitor='val_loss', mode='min', patience=20),
            ModelCheckpoint(filepath=file_path,
                            monitor='val_loss',
                            mode='min',
                            save_best_only=True), lr_reduced
        ]

        model.fit(X_tr,
                  y_tr,
                  epochs=750,
                  batch_size=512,
                  callbacks=callbacks,
                  shuffle=True,
                  validation_data=(X_val, y_val),
                  verbose=1)

        model.load_weights(file_path)

        oof[test_index] = np.ndarray.flatten(model.predict(X_val))
        predictions += np.ndarray.flatten(model.predict(X_pred))  # scaled stacking features for the prediction set

    predictions /= nfolds

    # In[showing result]
    print('\nvalid score: \n', pd.DataFrame(valid_score).transpose())

    inds = np.argmax(mse_score)
    print('\nmse score - worst model found: ',
          list(models.keys())[inds], np.max(mse_score))
    inds = np.argmin(auc_score)
    print('auc score - worst model found: ',
          list(models.keys())[inds], np.min(auc_score))

    inds = np.argmin(mse_score)
    print('mse score - best model found: ',
          list(models.keys())[inds], np.min(mse_score))
    inds = np.argmax(auc_score)
    print('auc score - best model found: ',
          list(models.keys())[inds], np.max(auc_score))

    try:
        mse_score_s = mean_squared_error(y, oof_k)
        print('\nmse score - keras_nn: ', mse_score_s)
        mse_score_s = mean_squared_error(y, oof)
        print('\nmse score - stacking: ', mse_score_s)
    except:
        pass

    try:
        auc_score_s = roc_auc_score(y, oof_k)
        print('auc score - keras_nn: ', auc_score_s)
        auc_score_s = roc_auc_score(y, oof)
        print('auc score - stacking: ', auc_score_s)
    except:
        pass

    sub['target'] = predictions
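# A hypothetical call, for illustration only (names are not from the original script):
# X and X_pred would be numeric feature matrices, y a binary target aligned with X, and
# sub a DataFrame holding a 'target' column that auto_model overwrites with the stacked
# predictions.
#
#   auto_model(train_features, train_target, test_features, submission)
#   submission.to_csv('submission.csv', index=False)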
Exemplo n.º 29
0
#Machine Learning Algorithm (MLA) Selection and initialization
CLF = [
    #Ensemble Methods
    ('ada', ensemble.AdaBoostClassifier(tree.DecisionTreeClassifier())),
    ('bc', ensemble.BaggingClassifier()),
    ('etc', ensemble.ExtraTreesClassifier()),
    ('gbc', ensemble.GradientBoostingClassifier()),
    ('xgbc', xgb.XGBClassifier(max_depth=3)),  # xgb.XGBClassifier()),    #
    ('rfc', ensemble.RandomForestClassifier(n_estimators=50)),

    #Gaussian Processes
    ('gpc', gaussian_process.GaussianProcessClassifier()),

    #GLM - keep only the linear classifiers (pure regression models are omitted, since this is a classification task)
    ('lr', linear_model.LogisticRegressionCV()),
    ('pac', linear_model.PassiveAggressiveClassifier()),
    ('rc', linear_model.RidgeClassifierCV()),
    ('sgd', linear_model.SGDClassifier()),
    ('pct', linear_model.Perceptron()),

    #Naive Bayes
    ('gnb', naive_bayes.GaussianNB()),

    #Nearest Neighbor
    ('knn', neighbors.KNeighborsClassifier(n_neighbors=3)),

    #SVM
    ('svc', svm.SVC(probability=True)),
    ('lsvc', svm.LinearSVC()),
Exemplo n.º 30
0
            coefficient_results.append([col_name])
        coefficient_results[counter_coe].append(scipy.stats.pearsonr(df_temp[col_name].values.tolist(), Y)[0])    
        counter_coe+=1
        
    X = df_temp.values.tolist()
    X = np.array(X)
    X = X/X.max(axis=0)
    #five fold cross-validation
    numData.append(len(X))
    score1= []
    score2= []
    kf = KFold(n_splits=5, shuffle=True)
    for train, test in kf.split(X):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
    #X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2)
        logreg = linear_model.LogisticRegressionCV(penalty='l1', solver='liblinear',
                                                   Cs=[1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8],
                                                   refit=True)
        logreg.fit(X_train, y_train)
        print("step" + str(step[i]))
        print("Percentage of positive data: ", float(sum(Y)) / len(Y))
        y_pred = logreg.predict(X_test)

        #print "Accuracy score w/o feature selection: ",logreg.score(X_test,y_test)
        score1.append(logreg.score(X_test,y_test))
        """
        # perform recursive feature selection (backward selection)
        rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4),scoring='accuracy')
        #rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4),scoring='roc_auc')
        rfecv.fit(X_train, y_train)
        print("Optimal number of features : %d" % rfecv.n_features_)
        print "rfecv support ",rfecv.support_
        print "rfecv ranking ", rfecv.ranking_