Пример #1
0
    def test_xgboost_regressor_unwrapped(self):
        """
        Validate xgboost regressor without wrapper
        """
        X, y = make_regression(n_samples=500,
                               n_features=22,
                               n_informative=8,
                               random_state=8311982)
        X_train, X_test, y_train, y_test = tts(X, y)

        model = xgb.XGBRFRegressor()
        oz = residuals_plot(model,
                            X_train,
                            y_train,
                            X_test,
                            y_test,
                            show=False)
        assert is_fitted(oz)
Пример #2
0
    )


for model in models:
    show_score(x_train, y_train, model)

figure, axes = plt.subplots(nrows=len(models),
                            ncols=2,
                            figsize=(9, 9),
                            sharex=True)

for ind, model in enumerate(models):
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    for index, ax in enumerate(axes):
        residuals_plot(model, x_test, preds, hist=False, ax=ax[index])
        prediction_error(model, x_test, preds, ax=ax)

# Do some scoring on XGB estimators
# Validation curve
viz = ValidationCurve(XGBRegressor(objective="reg:squarederror"),
                      param_name="max_depth",
                      param_range=np.arange(1, 11),
                      cv=5,
                      scoring="r2")
viz.fit(x_train, y_train)
viz.show()

# Learning curve
model = XGBRegressor(objective="reg:squarederror")
viz_2 = LearningCurve(model, scoring="r2")
Пример #3
0
# penalizes large errors
metrics.mean_squared_error(auto_y_test, lr.predict(auto_X_test))

metrics.mean_absolute_error(auto_y_test, lr.predict(auto_X_test))

dt = tree.DecisionTreeRegressor(max_depth=4)
dt.fit(auto_X_train, auto_y_train)
dt.score(auto_X_test, auto_y_test)

metrics.mean_absolute_error(auto_y_test, dt.predict(auto_X_test))

metrics.mean_squared_error(auto_y_test, dt.predict(auto_X_test))

# Residuals plot
# Good for looking at homoskedasticity - variance of errors
regressor.residuals_plot(lr, auto_X_train, auto_y_train, auto_X_test,
                         auto_y_test)

# Residuals plot
# Good for looking at homoskedasticity - variance of errors
regressor.residuals_plot(dt, auto_X_train, auto_y_train, auto_X_test,
                         auto_y_test)

# Prediction Error
# Good for looking at variance and performance at different ends
regressor.prediction_error(lr, auto_X_train, auto_y_train, auto_X_test,
                           auto_y_test)

# Prediction Error
# Good for looking at variance and performance at different ends
regressor.prediction_error(dt, auto_X_train, auto_y_train, auto_X_test,
                           auto_y_test)
Пример #4
0
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # make features into label encoder here
    features, feature_labels, class_labels = get_features(
        classes, problem_type, default_features, balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # process features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)

    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    # 	csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    # print('loading training files...')
    # X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    # y_train=X_train['class_']
    # X_train.drop(['class_'], axis=1)
    # X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    # y_test=X_test['class_']
    # X_test.drop(['class_'], axis=1)
    # y_train=le.inverse_transform(y_train)
    # y_test=le.inverse_transform(y_test)
    # except:
    # print('error loading in training files, making new test data')

    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    # ##################################
    # # CLUSTERING!!!
    # ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
		"lle"
		Locally Linear Embedding (LLE) uses many local linear decompositions to preserve globally non-linear structures.
		"ltsa"
		LTSA LLE: local tangent space alignment is similar to LLE in that it uses locality to preserve neighborhood distances.
		"hessian"
		Hessian LLE an LLE regularization method that applies a hessian-based quadratic form at each neighborhood
		"modified"
		Modified LLE applies a regularization parameter to LLE.
		"isomap"
		Isomap seeks a lower dimensional embedding that maintains geometric distances between each instance.
		"mds"
		MDS: multi-dimensional scaling uses similarity to plot points that are near to each other close in the embedding.
		"spectral"
		Spectral Embedding a discrete approximation of the low dimensional manifold using a graph representation.
		"tsne" (default)
		t-SNE: converts the similarity of points into probabilities then uses those probabilities to create an embedding.
	'''
    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine',
                          classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig=umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # 	  FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature importance property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_,
                                 index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees)', size=16)
    plt.title('Feature importances with %s features' % (str(len(features[0]))))
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson',
                        classes=set(classes),
                        features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e / https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb- plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()

    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features),
                                     tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################

    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features),
                                        tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features),
                         tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(),
                                np.array(features),
                                tclass_labels,
                                train_color="maroon",
                                test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features),
                                tclass_labels,
                                draw_threshold=True,
                                linefmt="C0-",
                                markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot percentile of features plot with SVM to see which percentile for features is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()

    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans,
                                   X_train,
                                   cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas,
        lr_probas,
        nb_probas,  # svm_scores,
        dt_scores,
        ab_scores,
        gb_scores,
        knn_scores
    ]

    clf_names = [
        'Random Forest',
        'Logistic Regression',
        'Gaussian NB',  # 'SVM',
        'Decision Tree',
        'Adaboost',
        'Gradient Boost',
        'KNN'
    ]

    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1],
        lr_probas[:, 1],
        nb_probas[:, 1],  # svm_scores[:, 1],
        dt_scores[:, 1],
        ab_scores[:, 1],
        gb_scores[:, 1],
        knn_scores[:, 1]
    ]

    plot_roc_curve(y_test, probs, clf_names)
    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
Пример #5
0
                                                    test_size=0.3,
                                                    random_state=202)

# In[127]:

from sklearn.linear_model import LinearRegression

# In[151]:

ln = LinearRegression()

# In[152]:

from yellowbrick.regressor import residuals_plot

residuals_plot(ln, X_train, y_train)

# In[153]:

pred = ln.predict(X_test)

# In[154]:

from sklearn.metrics import mean_absolute_error, mean_squared_error

# In[155]:

ln.score(X_train, y_train)

# In[132]:
Пример #6
0
# We can now see that the data has a mean close to 0
# and a std of 1
X.describe()

lr_model2 = linear_model.LinearRegression()
lr_model2.fit(X_train, y_train)
lr_model2.score(X_test, y_test)

metrics.r2_score(y_test, lr_model2.predict(X_test))

metrics.mean_squared_error(y_test, lr_model2.predict(X_test))

metrics.mean_absolute_error(y_test, lr_model2.predict(X_test))

regressor.residuals_plot(lr_model2, X_train, y_train,X_test, y_test )



# ## Regression Evaluation Exercise
#
# Using the forest fire data set
#
# * Evaluate the performance of your model with the R2 and MSE score (remember to split your data)
# * Plot a residuals plot

# +
Xff_train, Xff_test, yff_train, yff_test = model_selection.\
    train_test_split(Xff, yff, test_size=.3, random_state=42)

ff_lr = linear_model.LinearRegression()