Example #1
# Imports needed by this snippet; proc is a project-local preprocessing module
# imported elsewhere in the repo.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split


def main():
    data = proc.read_data()
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        yfill,
                                                        test_size=0.20,
                                                        random_state=42,
                                                        stratify=yfill)
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    # plot_roc(X_train, y_train, 'LogisticRegression',
    #          LogisticRegression(C=1e5, penalty='l2'))
    # Oversampled variant (kept for reference):
    # model_over = runLR(X_train_over, X_test, y_train_over, y_test)
    # test_results(model_over, X_test, y_test)

    model = runLR(X_train.values, X_test, y_train.values, y_test)
    test_results(model, X_test, y_test)
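
# runLR and test_results are project-local helpers defined elsewhere in the
# repo. The sketch below is an assumption about what runLR does, modeled on
# the commented-out LogisticRegression call in main(); the name runLR_sketch
# is illustrative, not from the repo.
from sklearn.linear_model import LogisticRegression

def runLR_sketch(X_train, X_test, y_train, y_test):
    # Fit a plain L2-regularized logistic regression and return the model.
    model = LogisticRegression(C=1e5, penalty='l2', max_iter=1000)
    model.fit(X_train, y_train)
    return model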


def do_grid_search(data):
    # Get the data from our function above
    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        yfill,
                                                        test_size=0.20,
                                                        random_state=42,
                                                        stratify=yfill)

    # Initialize our model here.
    # Original: est = RandomForestClassifier()
    est = RandomForestClassifier(bootstrap=True,
                                 criterion="gini",
                                 class_weight="balanced_subsample")

    # These are the hyperparameters we are tuning, i.e., the knobs exposed by
    # RandomForestClassifier (see the scikit-learn docs).
    param_grid = {
        "max_depth": [3, 5, 10, 30, 50, 100],
        "max_features": [1, 3, 10, 30],
        "min_samples_split": [2, 3, 10],
        "min_samples_leaf": [2, 3, 10]
    }
    # Best params from a previous grid-search run:
    # {'max_depth': 10,
    #  'max_features': 30,
    #  'min_samples_leaf': 2,
    #  'min_samples_split': 2}

    # Full param grid (slower to search; uncomment to tune everything):
    # param_grid = {"max_depth": [3, 5, 10, 30],
    #               "max_features": [1, 3, 10, 30],
    #               "min_samples_split": [2, 3, 10],
    #               "min_samples_leaf": [2, 3, 10],
    #               "bootstrap": [True, False],
    #               "criterion": ["gini", "entropy"],
    #               "class_weight": [None, "balanced_subsample"]}

    # Plug in our model, params dict, and the number of jobs, then .fit()
    gs_cv = GridSearchCV(est, param_grid, n_jobs=2).fit(X_train, y_train)

    # return the best score and the best params
    return gs_cv.best_score_, gs_cv.best_params_
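
proc.oversample(X_train, y_train, r=0.3) is a project-local helper. A minimal
sketch of the idea, assuming it upsamples the positive (minority) class with
replacement until the minority/majority ratio reaches r; the names below are
illustrative, not from the repo.

import numpy as np
from sklearn.utils import resample

def oversample_sketch(X, y, r=0.3):
    X, y = np.asarray(X), np.asarray(y)
    minority = (y == 1)                     # assumes class 1 is the minority
    n_target = int(r * (~minority).sum())   # minority count needed for ratio r
    X_up, y_up = resample(X[minority], y[minority], replace=True,
                          n_samples=n_target, random_state=42)
    return (np.vstack([X[~minority], X_up]),
            np.concatenate([y[~minority], y_up]))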
Example #3
# Imports needed by this snippet; proc, randomforest, set_threshold,
# print_threshold, and feature_importance are project-local helpers.
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


def plot_recall_vs_numtrees(num_trees, recall, graphid):
    # Reconstructed wrapper: the original snippet began mid-function.
    plt.plot(num_trees, recall)
    # plt.ylim((0.8, 1))
    plt.savefig('recall_vs_numtrees_{}.png'.format(graphid))
    plt.close()


if __name__ == '__main__':
    data = proc.read_data()
    # bits, yfill = bits_yfill(data)
    # X_train, X_test, y_train, y_test = train_test_split(bits, yfill, test_size=0.20, random_state=42, stratify =yfill)
    # for num in range(10):
    #     rffit = RandomForestClass(X_train, X_test, y_train, y_test)
    #     feature_importance(bits, rffit)
    #     plot_features(bits, rffit, 20, 'bits', num)

    features, yfill = proc.features_yfill(data)
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        yfill,
                                                        test_size=0.20,
                                                        random_state=1,
                                                        stratify=yfill)
    X_train_over, y_train_over = proc.oversample(X_train, y_train, r=0.3)
    rffit, y_predict = randomforest(X_train_over, X_test, y_train_over, y_test,
                                    num_est=50, cls_w='balanced_subsample')

    precision, recall, median_recall_index, medianrecall_threshold = set_threshold(rffit, X_train, X_test, y_train, y_test)
    print_threshold(rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
    feature_importance(features, rffit)


    # Manual thresholding experiment (kept for reference):
    # pprob = rffit.predict_proba(X_test)
    # pdf = pd.DataFrame(pprob)
    # print(pdf)
    # pdf['myH'] = pdf[1].map(lambda x: 1 if x > 0.35 else 0)
    # my_pred = pdf['myH'].values
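
The commented-out block above applies a manual probability cutoff. A
self-contained version of that idea, assuming a fitted binary classifier
whose predict_proba column 1 is P(class == 1); the names are illustrative:

import numpy as np

def predict_with_threshold(model, X, threshold=0.35):
    # Predict 1 whenever the positive-class probability clears the cutoff.
    proba = model.predict_proba(X)[:, 1]
    return (proba >= threshold).astype(int)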

# Flask view for file upload and model report. Imports needed by this snippet;
# proc, rf, and roc are project-local modules, and app / allowed_file are
# defined elsewhere in the application.
import os

import pandas as pd
from flask import flash, redirect, render_template, request, url_for
from sklearn.ensemble import RandomForestClassifier
from werkzeug.utils import secure_filename


def uploaded_file(filename):
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # If the user does not select a file, the browser may
        # submit an empty part without a filename.
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            print(filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            return redirect(url_for('uploaded_HTS', filename=filename))
    elif request.method == 'GET':
        # make a pd.dataframe of training data
        df = pd.read_csv(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        # use all features and yfill (no NaNs, filled with 0)
        features, yfill = proc.features_yfill(df)
        #train test split at 20%
        X_train, X_test, y_train, y_test = rf.train_test_split(features,
                                                               yfill,
                                                               test_size=0.20,
                                                               random_state=1,
                                                               stratify=yfill)

        #Optional: oversampling of minority class for training purposes
        #X_train_over, y_train_over = proc.oversample(X_train,y_train, r = 0.3)
        #rffit, y_predict = rf.randomforest(X_train_over, X_test, y_train_over, y_test, num_est=50, cls_w = 'balanced_subsample')

        #fit the Random Forest classifier: would like to add in a grid search
        rffit, y_predict = rf.randomforest(X_train.values, X_test,
                                           y_train.values, y_test)

        # Use below to run a grid search; it takes too long to run right now.
        #rffit, y_predict = rf.randomforest(X_train.values, X_test, y_train.values, y_test, grid_search = 'small')

        # Pickle the fitted model for use with test data.
        proc._pickle(rffit, 'RFC_fit.pkl')

        # set_threshold_recall determines the threshold that optimizes recall:
        # the median of the candidate thresholds that return the second-best
        # recall (i.e., the best recall short of the trivial 1.0).
        precision_list, recall_list, median_recall_index, medianrecall_threshold = rf.set_threshold_recall(
            rffit, X_train, X_test, y_train, y_test)

        # print_threshold uses the trained model and the selected threshold
        # (here recall-optimized) to compute the statistics listed below.
        precision, recall, fpr, fpr_test, tpr_test, cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, medianrecall_threshold)
        r_cm = pd.DataFrame(cm)
        proc._pickle(medianrecall_threshold, 'medianrecall_threshold.pkl')

        #make a pd.dataframe of the stats for display
        recall_opt_stats = pd.DataFrame([[
            format(medianrecall_threshold, '.2f'),
            format(recall, '.2f'),
            format(fpr, '.2f'),
            format(precision, '.2f'),
        ]],
                                        columns=[
                                            'Suggested Threshold',
                                            'True Positive Rate (Recall)',
                                            'False Positive Rate (Fall-out)',
                                            'Precision'
                                        ])

        # repeat the threshold selection process for precision optimization
        p_precision, p_recall, p_median_precision, threshold_precision = rf.set_threshold_precision(
            rffit, X_train, X_test, y_train, y_test)
        p_precision, p_recall, p_fpr, p_fpr_test, p_tpr_test, p_cm = rf.print_threshold(
            rffit, X_train, X_test, y_train, y_test, threshold_precision)
        p_cm = pd.DataFrame(p_cm)
        precision_opt_stats = pd.DataFrame(
            [[
                format(threshold_precision, '.2f'),
                format(p_recall, '.2f'),
                format(p_fpr, '.2f'),
                format(p_precision, '.2f'),
            ]],
            columns=[
                'Suggested Threshold', 'True Positive Rate (Recall)',
                'False Positive Rate (Fall-out)', 'Precision'
            ])

        #produce a ROC plot
        test_prob = rffit.predict_proba(X_test)
        roc.plot_roc(X_train.values,
                     y_train.values,
                     y_test,
                     test_prob,
                     'Test',
                     RandomForestClassifier,
                     max_depth=10,
                     max_features=30,
                     min_samples_leaf=2,
                     min_samples_split=2)
        feature_description = rf.plot_features(features,
                                               rffit,
                                               'Identifier',
                                               n=10)
        # Option for oversampled training data:
        #roc.plot_roc(X_train_over, y_train_over, y_test, test_prob, 'Test', RandomForestClassifier,  max_depth = 10, max_features= 30, min_samples_leaf= 2, min_samples_split = 2)
        #roc.simple_roc(y_test, test_prob, 'ROC_RFC')
        pd.set_option('display.max_colwidth', None)  # -1 is deprecated

        return render_template("rock.html",
                               data_recall_opt=recall_opt_stats.to_html(
                                   index=False, classes="data_recall_opt"),
                               data_precision_opt=precision_opt_stats.to_html(
                                   index=False, classes="data_precision_opt"),
                               rocname='Test',
                               f_descrip=feature_description.to_html(
                                   index=False, classes="f_descrip"),
                               recall_cm=r_cm.to_html(classes="cm"),
                               precision_cm=p_cm.to_html(classes="cm"))
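
rf.set_threshold_recall is project-local. A hedged sketch of the selection
rule described in the comments above (median of the thresholds achieving the
best recall short of 1.0), using sklearn.metrics.precision_recall_curve;
the function name is illustrative:

import numpy as np
from sklearn.metrics import precision_recall_curve

def set_threshold_recall_sketch(model, X_test, y_test):
    proba = model.predict_proba(X_test)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, proba)
    # precision/recall have one more entry than thresholds; drop the final
    # point, which has no associated threshold.
    recall = recall[:-1]
    best = recall[recall < 1.0].max()       # best recall short of 1.0
    return float(np.median(thresholds[recall == best]))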