Example #1
def test_array_like(self):
    # plot_roc should accept any array-like labels, numeric or string
    plot_roc([0, 'a'], [[0.8, 0.2], [0.2, 0.8]])
    plot_roc([0, 1], [[0.8, 0.2], [0.2, 0.8]])
    plot_roc(['b', 'a'], [[0.8, 0.2], [0.2, 0.8]])
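
# For orientation, a minimal standalone call (assumes scikit-plot and
# matplotlib are installed): plot_roc takes true labels plus an
# (n_samples, n_classes) probability array and returns a matplotlib Axes.
import matplotlib.pyplot as plt
from scikitplot.metrics import plot_roc

ax = plot_roc(['b', 'a'], [[0.8, 0.2], [0.2, 0.8]])
plt.show()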
Example #2
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_test_pred_class, average='micro')

neptune.log_metric('test_f1', f1)

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix, plot_roc

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_test, y_test_pred_class, ax=ax)
neptune.log_image('diagnostic_charts', fig)

fig, ax = plt.subplots(figsize=(16, 12))
plot_roc(y_test, y_test_pred, ax=ax)  # y_test_pred: (n_samples, n_classes) probabilities
neptune.log_image('diagnostic_charts', fig)

model.save('my_model.h5')
neptune.log_artifact('my_model.h5')

# tests
current_exp = neptune.get_experiment()

correct_logs = [
    'batch_loss', 'batch_accuracy', 'epoch_loss', 'epoch_accuracy',
    'epoch_val_loss', 'epoch_val_accuracy', 'test_f1', 'diagnostic_charts'
]

logged_names = set(current_exp.get_logs().keys())
if logged_names != set(correct_logs):
    raise ValueError('Unexpected experiment logs: {}'.format(logged_names))
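
# The snippet above assumes an experiment is already active via the legacy
# neptune-client API; a minimal setup sketch (the project name is hypothetical):
import os

import neptune

neptune.init(project_qualified_name='my_workspace/my_project',
             api_token=os.getenv('NEPTUNE_API_TOKEN'))
neptune.create_experiment(name='training-run')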
Example #3
from neptunecontrib.api import log_table  # table-logging helper from neptune-contrib

df = pd.DataFrame(
    data={
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_probability': y_pred_proba.max(axis=1)
    })
log_table('predictions', df)

# Log model performance visualizations

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_roc, plot_precision_recall

fig, ax = plt.subplots()
plot_roc(y_test, y_pred_proba, ax=ax)
neptune.log_image('model-performance-visualizations', fig, image_name='ROC')

fig, ax = plt.subplots()
plot_precision_recall(y_test, y_pred_proba, ax=ax)
neptune.log_image('model-performance-visualizations',
                  fig,
                  image_name='precision recall')
plt.close('all')  # release the figures once they are logged

# Log train data sample (images per class)

for j, class_name in enumerate(class_names):
    plt.figure(figsize=(10, 10))
    label_ = np.where(y_train == j)
    for i in range(9):
Example #4
def test_string_classes(self):
    np.random.seed(0)
    clf = LogisticRegression()
    clf.fit(self.X, convert_labels_into_string(self.y))
    probas = clf.predict_proba(self.X)
    plot_roc(convert_labels_into_string(self.y), probas)
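
# Hypothetical shape of the convert_labels_into_string helper used above
# (it is not shown in the snippet): it simply maps each label to a string so
# plot_roc is exercised with non-numeric class names.
def convert_labels_into_string(y):
    return [str(label) for label in y]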
Example #5
def plot_roc_curve(labels, output):
    # plot_micro=False suppresses the micro-average ROC curve
    skplt.plot_roc(labels, output, plot_micro=False)
    plt.show()
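
# Example call (hypothetical names): output should be the
# (n_samples, n_classes) array returned by predict_proba, e.g.
# plot_roc_curve(y_test, clf.predict_proba(X_test))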
Example #6
def LogisticRegression_self_test(X_train, X_test, y_train, y_test,
                                 learning_rates, epochs, iteration):
    """
	Logistic regression with stochastic gradient descent and gradient descent.
	"""

    # numbers of training samples and features
    n_inputs = X_train.shape[0]
    n_features = X_train.shape[1]

    eta_ = 1e-12
    beta_opt = np.random.randn(X_train.shape[1], 2)
    calc_beta_GD, norm = GradientDescent(X_train, beta_opt, y_train, iteration,
                                         eta_)
    prob_GD, predict_GD = Probability_GD(
        X_test, calc_beta_GD)  # map model outputs to probabilities in [0, 1]
    #yPred_GD = (predict_GD >= 0.5).astype(int) # converting to just 0 or 1

    # Reference model: scikit-learn logistic regression
    clf = LogisticRegression(solver='lbfgs', max_iter=100000)  # max_iter must be an int
    clf = clf.fit(X_train, np.ravel(y_train))
    pred_sklearn = clf.predict(X_test)
    prob_sklearn = clf.predict_proba(X_test)
    #print(prob_sklearn)

    #for eta in np.logspace(np.log10(1e-6), np.log10(1e0), 7):
    accuracy = np.zeros(len(learning_rates))
    auc_score = np.zeros(len(learning_rates))

    for i, eta in enumerate(learning_rates):
        beta_SGD = stochastic_gradient_descent(X_train, beta_opt, y_train, eta,
                                               epochs, iteration)
        prob_SGD, predict_SGD = Probability(
            X_test, beta_SGD)  # map model outputs to probabilities in [0, 1]

        accuracy[i] = metrics.accuracy_score(y_test, predict_SGD)
        auc_score[i] = metrics.roc_auc_score(y_test, predict_SGD)
        difference = y_test - predict_SGD

        # keep the predictions with the best AUC so far (i == 0 initializes,
        # so best_prob_SGD is always defined when it is used below)
        if i == 0 or auc_score[i] > np.max(auc_score[:i]):
            best_pred_SGD = predict_SGD
            best_prob_SGD = prob_SGD

        print('Accuracy = {}, learning rate = {}, iterations = {}'.format(
            accuracy[i], eta, iteration))

        print('Auc score: {}'.format(auc_score[i]))
        """
		plt.plot(yPred, label='predict')
		plt.plot(optimal_beta, label ='optimal beta')
		plt.plot(y_test, label='test')
		plt.show()
		"""

    sns.set()
    sns.heatmap(pd.DataFrame(accuracy), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression: accuracy')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('accuracy_logreg.png')
    plt.show()

    sns.heatmap(pd.DataFrame(auc_score), annot=True, fmt='.4g')
    plt.title('Grid-search for logistic regression: AUC score')
    plt.ylabel('Learning rate: $\\eta$')
    plt.xlabel('Regularization Term: $\\lambda$')
    #plt.xticks(ticks=np.arange(len(learning_rates)) + 0.5, labels=learning_rates)
    #plt.yticks(ticks=np.arange(len(lambda_values)) + 0.5, labels=lambda_values)
    b, t = plt.ylim()  # discover the values for bottom and top
    b += 0.5  # Add 0.5 to the bottom
    t -= 0.5  # Subtract 0.5 from the top
    plt.ylim(b, t)  # update the ylim(bottom, top) values
    #plt.savefig('auc_score_logreg.png')
    plt.show()

    #plot confusion matrix
    Confusion_Matrix(y_test, predict_GD)
    #Confusion_Matrix(y_test, best_pred_SGD)
    #Confusion_Matrix(y_test, pred_sklearn)

    #diff = np.concatenate((1- predict, predict), axis=1)

    diff_sklearn = np.concatenate((1 - prob_sklearn, prob_sklearn), axis=1)
    diff_GD = np.concatenate((1 - prob_GD, prob_GD), axis=1)
    diff_SGD = np.concatenate((1 - best_prob_SGD, best_prob_SGD), axis=1)

    #plot roc curves
    plot_roc(y_test, prob_sklearn)
    plot_roc(y_test, diff_SGD)
    plot_roc(y_test, prob_GD)
    plt.show()

    #plot cumulative gain curves
    plot_cumulative_gain(y_test, prob_sklearn)
    ax = plot_cumulative_gain(y_test, diff_SGD)
    plot_cumulative_gain(y_test, prob_GD)
    #plt.show()
    """
	#plot roc curves
	plot_roc(y_test, diff_sklearn, plot_micro=False, plot_macro= False)
	plot_roc(y_test, diff_GD, plot_micro=False, plot_macro= False)
	plot_roc(y_test, diff_SGD, plot_micro=False, plot_macro= False)
	plt.show()

	#plot cumulative gain curves
	plot_cumulative_gain(y_test, diff_sklearn)
	plot_cumulative_gain(y_test, diff_GD)
	plot_cumulative_gain(y_test, diff_SGD)
	plt.show()	

	"""

    # area ratio relative to the 0.5 random baseline, one value per learning rate
    area_baseline = 0.5
    area_ratio = (auc_score - area_baseline) / area_baseline
    print('Area Ratio:', area_ratio)

    return accuracy, learning_rates
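
# Hypothetical shape of the Probability / Probability_GD helpers referenced
# above (they are not shown in the snippet): a logistic sigmoid over X @ beta,
# thresholded at 0.5 for the class prediction.
def Probability(X, beta):
    prob = 1.0 / (1.0 + np.exp(-X @ beta))
    predict = (prob >= 0.5).astype(int)
    return prob, predict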
Example #7
    test_questions_df = X_test[import_quest_lst]
    # pd.concat's join_axes argument was removed in pandas 1.0; align on the
    # question frame's index explicitly instead.
    import_quest_demos_test = pd.concat([test_questions_df, test_demos],
                                        axis=1).reindex(test_questions_df.index)

    #Train best model, test best model

    rf_final = RandomForestClassifier(n_estimators=100,
                                      n_jobs=-1,
                                      class_weight='balanced',
                                      random_state=1)
    rf_final.fit(import_quest_demos, y_train)
    y_predict = rf_final.predict(import_quest_demos_test)
    y_predict_prob = rf_final.predict_proba(import_quest_demos_test)
    acc_final = accuracy_score(y_test, y_predict)

    # ROC plot: classes_to_plot=[] suppresses the per-class curves, so only
    # the macro-average ROC curve is drawn.
    plot_roc(y_test,
             y_predict_prob,
             title='Test Data ROC Curve',
             plot_micro=False,
             plot_macro=True,
             classes_to_plot=[])
    plt.savefig("images/roc.png")
    plt.close()

    #Partial Dependency Plots
    part_dep_plot(import_quest_lst)
    part_dep_plot(['gender_num', 'club_num', 'age_bin'])
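
# part_dep_plot is a project-specific helper; a rough equivalent sketch with
# scikit-learn >= 1.0, assuming rf_final and the training frame are in scope
# and the listed columns exist in it:
from sklearn.inspection import PartialDependenceDisplay

PartialDependenceDisplay.from_estimator(rf_final, import_quest_demos,
                                        features=['gender_num', 'club_num'])
plt.savefig('images/partial_dependence.png')
plt.close()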
Example #8
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {
        'num_boost_round': NUM_BOOST_ROUND,
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
        **LGBM_PARAMS
    }

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={
                                       'features_path':
                                       FEATURES_PATH,
                                       'features_version':
                                       md5_hash(FEATURES_PATH),
                                       'train_split_version':
                                       md5_hash(TRAIN_IDX_PATH),
                                       'valid_split_version':
                                       md5_hash(VALID_IDX_PATH),
                                   }):
        results = train_evaluate(train,
                                 valid,
                                 LGBM_PARAMS,
                                 callbacks=[neptune_monitor()])
        train_score = results['train_score']
        valid_score = results['valid_score']
        train_preds = results['train_preds']
        valid_preds = results['valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=False)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=False)
        neptune.send_artifact(valid_pred_path)

        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5,
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']],
                            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(
            valid_preds['TARGET'],
            valid_preds[['preds_neg', 'preds_pos']],
            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'],
                                     ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)
Example #9
def train_models(X_train, X_test, y_train, y_test):
    '''
    train, store model results: images + scores, and store models
    input:
              X_train: X training data
              X_test: X testing data
              y_train: y training data
              y_test: y testing data
    output:
              None
    '''
    print("Training models")

    # Train models
    rfc = RandomForestClassifier(random_state=42)
    lrc = LogisticRegression(solver='lbfgs', max_iter=400)

    param_grid = {
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt'],  # note: 'auto' was removed in scikit-learn 1.3
        'max_depth': [4, 5, 100],
        'criterion': ['gini', 'entropy']
    }

    cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    cv_rfc.fit(X_train, y_train)

    lrc.fit(X_train, y_train)

    print("Successfully trained models")
    print("Making predictions")

    # Make predictions
    y_train_preds_rf = cv_rfc.best_estimator_.predict(X_train)
    y_test_preds_rf = cv_rfc.best_estimator_.predict(X_test)

    y_train_preds_lr = lrc.predict(X_train)
    y_test_preds_lr = lrc.predict(X_test)

    print("Successfully made predictions")
    print("Saving results as images")

    # Save ROC curve. The estimator-based call signature here matches
    # sklearn's deprecated plot_roc_curve rather than
    # scikitplot.metrics.plot_roc, which expects (y_true, y_probas).
    plt.figure(figsize=(15, 8))
    ax = plt.gca()
    lrc_plot = plot_roc(lrc, X_test, y_test, ax=ax)
    rfc_disp = plot_roc(cv_rfc.best_estimator_, X_test, y_test, ax=ax)
    plt.savefig('./images/results/roc_curve_result.png')
    plt.close()

    # Save results
    classification_report_image(y_train, y_test, y_train_preds_lr,
                                y_train_preds_rf, y_test_preds_lr,
                                y_test_preds_rf)

    # Save feature importance
    feature_importance_plot(cv_rfc.best_estimator_, X_train,
                            './images/results/feature_importances.png')

    print("Successfully saved results as images")
    print("Saving models as pickle files")

    # Save pickle files
    joblib.dump(cv_rfc.best_estimator_, './models/rfc_model.pkl')
    joblib.dump(lrc, './models/logistic_model.pkl')

    print("Successfully saved models as pickle files")