Пример #1
0
def fitter(df_t, clade, protein):
    '''
    Fit an OLS model of ``death_growth_rate`` on the numeric columns of a frame.

    The ``score for <protein>`` column is copied from the module-level ``df``
    into the numeric subset of ``df_t`` before fitting.  The model summary,
    R2 and F-test p-value are printed, and a residual plot against the fitted
    values is saved to ``linearity_test_<clade>_<protein>.png`` and shown.

    :param df_t: clade or country DataFrame containing a numeric
        ``death_growth_rate`` column
    :param clade: label used in the name of the saved figure
    :param protein: protein name selecting ``df['score for <protein>']``
    :return: tuple ``(model, clade, protein)`` where ``model`` is the fitted
        results object
    '''
    global df
    df_t = df_t.select_dtypes(include=['float64', 'int64'])
    df_t[f'score for {protein}'] = df[f'score for {protein}']
    X = df_t.drop('death_growth_rate', axis=1)
    y = df_t[['death_growth_rate']]
    lm = sm.OLS(y, X)
    model = lm.fit()
    print(model.summary())
    print('R2: ', model.rsquared)
    print('P-value: ', model.f_pvalue)

    # Linearity check: residual plot against the fitted values, saved to disk.
    fig, ax = plt.subplots(1, 1)
    # x and y are passed by keyword because recent seaborn releases no longer
    # accept them positionally; the target is passed as a 1-D column.
    sns.residplot(x=model.predict(),
                  y=df_t['death_growth_rate'],
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={'color': 'red'},
                  ax=ax)
    ax.title.set_text('Residuals vs Fitted')
    ax.set(xlabel='Fitted', ylabel='Residuals')
    fig.savefig(f'linearity_test_{clade}_{protein}.png')
    plt.show()
    plt.close(fig)
    return model, clade, protein
def model_diagnosis(model, x, x_name, y):
    """
    Draw a three-panel diagnostic figure for a fitted regression model.

    Panels, left to right: a normal QQ plot of the model residuals, a
    histogram of the residuals, and a seaborn residual plot of ``y`` against
    ``x``.  The model's r-squared and F-test p-value are written above them.

    model is the fitted results object (must expose ``resid``, ``rsquared``
        and ``f_pvalue``)
    x is the object of the feature
    x_name is the feature's name, used in the third panel's caption
    y is the object of the target variable
    """
    residuals = model.resid

    # One figure only: the original also opened an extra empty plt.figure()
    # that was never drawn on or closed.
    fig, axes = plt.subplots(nrows=1, ncols=3, sharex=False, sharey=False,
                             figsize=(65, 20), squeeze=False)

    fig.text(s="Summary of Model Diagnosis", x=.40, y=1.15, fontsize=40)
    fig.text(s="r-Squared: {r}".format(r=round(model.rsquared, 2)), x=.40, y=1.05, fontsize=30)
    fig.text(s="p-value: {p}".format(p= model.f_pvalue), x=.53, y=1.05, fontsize=30)

    fig.text(s='Residuals Histogram', x=.45, y=.93, fontsize=55)

    sm.graphics.qqplot(residuals, dist=stats.norm, line='45', fit=True, ax=axes[0][0])

    fig.text(s='QQ Plot', x=.20, y=.93, fontsize=45)

    axes[0][1].hist(residuals)

    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=x, y=y, ax=axes[0][2])

    fig.text(s='Residuals vs {x}'.format(x=x_name), x=.73, y=.93, fontsize=40)

    # The original wrote `plt.tight_layout;` / `plt.show;` without calling
    # them, so neither ever ran.
    fig.tight_layout()
    plt.show()
def sk_rsme(outcome, features, train_set, test_set):
    """
    Fit an sklearn linear regression and report RMSE and training R^2.

    A residual plot of the training outcome against the training predictions
    is drawn on the current axes as a side effect.

    outcome: String of title of outcome column
    features: list of strings of feature columns
    train_set: pandas dataframe containing columns that include outcome and features as titles to train model on
    test_set: pandas dataframe containing columns that include outcome and features as titles to test model on

    Returns a tuple ``(rmse_train, rmse_test, r2_train)``.
    """
    lr = LinearRegression()

    X_train, y_train = train_set[features], train_set[outcome]
    X_test, y_test = test_set[features], test_set[outcome]

    lr.fit(X_train, y_train)

    y_train_pred = lr.predict(X_train)
    y_test_pred = lr.predict(X_test)

    # RMSE is taken as sqrt(MSE) explicitly; the `squared=False` flag has been
    # removed from newer scikit-learn releases.
    rmse_train = mean_squared_error(y_train, y_train_pred) ** 0.5
    rmse_test = mean_squared_error(y_test, y_test_pred) ** 0.5

    # Plot against the requested outcome column (the original hard-coded
    # 'price'), with keyword x/y as required by recent seaborn releases.
    sns.residplot(x=y_train_pred, y=y_train, lowess=True, color="g")

    return (rmse_train, rmse_test, float(lr.score(X_train, y_train)))
def linear_model(era):
    """
    Fit and report an OLS model of ``wPCT`` for one era of ``scaled_df``.

    Rows of the module-level ``scaled_df`` whose ``Era`` equals ``era`` are
    selected; the identifier/target columns are dropped, a constant is added,
    and the statsmodels summary is printed.  A residuals-vs-fitted plot is
    then shown.

    era: value matched against the ``Era`` column
    """
    data = scaled_df[scaled_df['Era'] == era]
    x = data.drop(['Season', 'Team', 'wPCT', 'Era', 'wPCT > 0.500'], axis=1)
    x = sm.add_constant(x)
    y = data['wPCT']

    lm = sm.OLS(y, x).fit()

    print('------- Linear Regression Result ({}) -------'.format(era))
    print(lm.summary())

    # residual plot
    fitted_y = lm.fittedvalues

    fig, ax = plt.subplots(figsize=(8, 6))

    # x/y passed by keyword: recent seaborn releases reject positional data.
    sns.residplot(x=fitted_y,
                  y='wPCT',
                  data=data,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={
                      'color': 'red',
                      'lw': 1,
                      'alpha': 0.8
                  },
                  ax=ax)
    # Labels are set after plotting so they replace the ones seaborn derives
    # from the input names.
    ax.set_title('Residuals vs Fitted Values ({})'.format(era))
    ax.set_xlabel('Fitted Values')
    ax.set_ylabel("Residuals")

    plt.show()
Пример #5
0
def scatter(filename, query=None):
    """
    Plot and summarise a simple linear regression read from a CSV file.

    The file must have exactly two columns (predictor, response).  A
    regression plot and a residual plot are shown, then the file name, the
    raw sums, the fitted slope/intercept and the Pearson correlation are
    printed, followed by the predicted value for each entry of ``query``.

    filename: path of the CSV file
    query: optional iterable of predictor values to predict for
    """
    data = pd.read_csv(filename)
    X, Y = data.columns  # unpacking enforces the two-column layout

    x, y = data[X], data[Y]
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    pearson = stats.pearsonr(x, y)
    seaborn.regplot(x=x,
                    y=y,
                    line_kws={
                        'label':
                        "$y = {0:.2f}x + {1:.2f}$".format(slope, intercept)
                    })
    plt.legend()
    plt.show()
    seaborn.residplot(x=x, y=y)
    plt.ylabel(Y + " (Residuals)")
    plt.show()

    print(filename)
    print(len(x), sum(x), sum(y), sum(x * x), sum(y * y), sum(x * y))
    print(slope, intercept)
    print(pearson)
    # `query` defaults to None rather than a shared mutable list.
    for q in (query if query is not None else ()):
        print(q, slope * q + intercept)
    print()
Пример #6
0
    def regression_model(column_x, column_y):
        """
        Fit a simple OLS regression and draw its diagnostic plots.

        A scatter plot with the regression line and a residual plot are drawn
        side by side, followed by a histogram of the residuals on a new
        figure.

        Returns ``(slope, intercept)`` rounded to three decimal places.
        """
        # Ordinary least squares fit with an added intercept column.
        design = statsmodels.add_constant(column_x)
        fit = statsmodels.OLS(column_y, design).fit()

        # Regression parameters rounded to 3 decimal places; R-squared is only
        # needed by the commented-out report at the end.
        Rsquared = round(fit.rsquared, 3)
        slope = round(fit.params[1], 3)
        intercept = round(fit.params[0], 3)

        # Scatter plot (left) and residual plot (right) sharing the x axis.
        fig, (scatter_ax, resid_ax) = plt.subplots(ncols=2,
                                                   sharex=True,
                                                   figsize=(12, 4))
        sns.regplot(x=column_x, y=column_y, data=df, marker="+", ax=scatter_ax)
        sns.residplot(x=column_x, y=column_y, data=df, ax=resid_ax)
        resid_ax.set(ylabel='Residuals')
        lower = min(fit.resid) - 1
        upper = max(fit.resid) + 1
        resid_ax.set_ylim(lower, upper)

        # Histogram of the residuals on its own figure.
        plt.figure()
        sns.distplot(fit.resid, kde=False, axlabel='Residuals', color='red')

        # print("R-squared = ",Rsquared)
        # print("Regression equation: y =",slope, "x + ",intercept)
        return slope, intercept
Пример #7
0
    def residPlots(self, exclude=None, include=None):
        ''' Plot residuals against each selected quantitative variable.

        For every variable in ``self.quant_vars`` that is listed in
        ``include`` (or not listed in ``exclude``; all of them when neither is
        given) a seaborn residual plot of ``self.dep_var`` against that
        variable is shown on its own figure.  You do not want to see patterns
        in these plots.

        exclude: optional collection of variable names to skip
        include: optional collection of variable names to keep
        Raises ValueError when both ``exclude`` and ``include`` are given.
        '''
        # y^ - y vs x

        # Defaults are None instead of shared mutable lists.
        if exclude and include:
            raise ValueError("You can't use both include and exclude.")

        if exclude:
            variables = [var for var in self.quant_vars if var not in exclude]
        elif include:
            variables = [var for var in self.quant_vars if var in include]
        else:
            variables = self.quant_vars

        df = self.data

        for var in variables:
            plt.figure(self.figureindex)
            # Keyword x/y: recent seaborn releases reject positional data.
            sns.residplot(x=var, y=self.dep_var, data=df)
            plt.title('Residual plot ' + var)
            plt.show()
            self.figureindex += 1
Пример #8
0
def make_plot(X_train, y_train, X, y, test_data, model, model_name, features, response):
    """
    Fit ``model`` and save three diagnostic figures named after ``model_name``.

    Figure 1 is a 2x2 grid for the fifth column of ``X``: regression plot,
    box plot, squared-error residual plot and, for the 'linear' and
    'logistic' model names, a two-variable interaction plot.  Figure 2 is a
    correlation plot of ``test_data``; figure 3 is a coefficient plot of
    ``diagnosis`` against ``features``.

    NOTE(review): ``interactplot``, ``corrplot`` and ``coefplot`` look like
    legacy seaborn helpers; the remaining seaborn calls are left in their
    original positional form to stay compatible with whichever release
    provides them -- confirm the pinned version.
    NOTE(review): ``response`` is not used; the coefficient formula
    hard-codes ``diagnosis``.
    """
    feature = X.columns
    f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharey=False)
    sns.regplot(X[feature[4]], y, test_data, ax=ax1)
    sns.boxplot(X[feature[4]], y, color="Blues_r", ax=ax2)
    model.fit(X_train, y_train)
    sns.residplot(X[feature[4]], (model.predict(X) - y) ** 2, color="indianred", lowess=True, ax=ax3)
    # Compare by value: `is` tests object identity and only matched string
    # literals by accident of interning.
    if model_name == 'linear':
        sns.interactplot(X[feature[3]], X[feature[4]], y, ax=ax4, filled=True, scatter_kws={"color": "dimgray"}, contour_kws={"alpha": .5})
    elif model_name == 'logistic':
        pal = sns.blend_palette(["#4169E1", "#DFAAEF", "#E16941"], as_cmap=True)
        levels = np.linspace(0, 1, 11)
        # NOTE(review): no ax= is passed here, unlike the linear branch.
        sns.interactplot(X[feature[3]], X[feature[4]], y, levels=levels, cmap=pal, logistic=True)
    ax1.set_title('Regression')
    ax2.set_title(feature[4]+' Value')
    ax3.set_title(feature[4]+' Residuals')
    ax4.set_title('Two-value Interaction')
    f.tight_layout()
    plt.savefig(model_name+'_'+feature[4], bbox_inches='tight')

    # Multi-variable correlation significance level
    f, ax = plt.subplots(figsize=(10, 10))
    cmap = sns.blend_palette(["#00008B", "#6A5ACD", "#F0F8FF",
                              "#FFE6F8", "#C71585", "#8B0000"], as_cmap=True)
    sns.corrplot(test_data, annot=False, diag_names=False, cmap=cmap)
    ax.grid(False)
    ax.set_title('Multi-variable correlation significance level')
    plt.savefig(model_name+'_multi-variable_correlation', bbox_inches='tight')

    # complete coefficient plot - believe this is only for linear regression
    sns.coefplot("diagnosis ~ "+' + '.join(features), test_data, intercept=True)
    plt.xticks(rotation='vertical')
    plt.savefig(model_name+'_coefficient_effects', bbox_inches='tight')
Пример #9
0
def plot_residplot(do_values_df, order, save=None):
    """
    Draw a residual plot of Dissimilarity against Overlap.

    Useful for judging whether the residuals still show structure.

    :type do_values_df: Pandas Dataframe
    :param do_values_df: Dataframe with "Overlap" and "Dissimilarity" columns.
    :type order: int
    :param order: Order of the polynomial fitted when computing the residuals.
    :type save: str or None
    :param save: Target passed to ``savefig`` (written as SVG); when None the
                 plot is shown instead of saved.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    residplot_options = {
        "x": "Overlap",
        "y": "Dissimilarity",
        "data": do_values_df,
        "lowess": True,
        "order": order,
        "ax": ax,
    }
    sns.residplot(**residplot_options)
    ax.set_title("Residual Plot", weight="bold", fontsize=15)

    if save is None:
        plt.show()
        return

    fig.savefig(save,
                facecolor="white",
                edgecolor="none",
                format="svg",
                bbox_inches="tight",
                pad_inches=0.1)
Пример #10
0
def residuals(df, target, cols=3, figsize=(10, 15), hspace=1, wspace=1):
    '''
    Create residual plots for all numeric features. Useful for
    seeing heteroscedasticity.

    Input:
    df: Pandas DataFrame object
    target: string of the target feature in df
    cols: number of graphs to display per row
    figsize: tuple of floats passed to ``plt.figure(figsize=...)``
    hspace: the amount of height reserved for space between subplots
    wspace: the amount of width reserved for space between subplots

    Output:
    Draws n graphs on one figure, where n is the number of
    numeric features in df (the target excluded)
    '''
    X = df.drop(target, axis=1).select_dtypes(include='number')
    y = df[target]

    # plot settings
    fig = plt.figure(figsize=figsize)
    fig.subplots_adjust(wspace=wspace, hspace=hspace)
    # Size the grid from the features actually plotted; counting every column
    # of df (target and non-numeric ones included) left empty rows.
    rows = ceil(float(X.shape[1]) / cols)

    # plot graphs
    for position, column in enumerate(X.columns, start=1):
        ax = fig.add_subplot(rows, cols, position)
        # Keyword x/y (required by recent seaborn) and an explicit target
        # axes instead of relying on the implicit current axes.
        sns.residplot(x=X[column],
                      y=y,
                      line_kws=dict(color='r'),
                      lowess=True,
                      ax=ax)
        ax.set_title(column)
Пример #11
0
def _plot_residuals(model, y, ax):
    """
    Draw a residuals-vs-fitted plot for ``model`` on ``ax``.

    model: fitted model whose ``predict()`` returns the in-sample fit
    y: observed target values
    ax: matplotlib axes to draw on

    Styling comes from the module-level ``line_kw`` and ``scatter_kw``.
    """
    fitted = model.predict()
    residuals = y - fitted
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=fitted,
                  y=residuals,
                  lowess=True,
                  line_kws=line_kw,
                  scatter_kws=scatter_kw,
                  ax=ax)
    ax.set_xlabel('fitted values')
    ax.set_ylabel('residuals')
    ax.set_title('Residuals vs Fitted')
Пример #12
0
def diagnostics(fittedvalues, residuals, data, predictand):
    """
    Draw residual, QQ and (partial) autocorrelation diagnostics.

    fittedvalues: fitted values of the model
    residuals: model residuals
    data: frame in which ``predictand`` is looked up
    predictand: name of the response column in ``data``

    Returns None; the plots are drawn as a side effect.
    """
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=fittedvalues,
                  y=predictand,
                  data=data,
                  lowess=True,
                  scatter_kws={'alpha': 0.5},
                  line_kws={
                      'color': 'red',
                      'lw': 1,
                      'alpha': 0.8
                  })

    # qq plot of residuals vs. normal fit
    sm.qqplot(residuals, ss.norm, fit=True, line='45')

    # autocorrelation (top) and partial autocorrelation (bottom) of residuals
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    sm.graphics.tsa.plot_acf(residuals, ax=ax)
    ax.set_xlim([0, 10])

    ax = fig.add_subplot(2, 1, 2)
    sm.graphics.tsa.plot_pacf(residuals, ax=ax)
    ax.set_xlim([0, 10])

    return None
def evaluate_model(model, X_test, y_test, plot_residuals=False, title=''):
    """
    Score ``model`` on a test set and optionally plot its residuals.

    model: fitted estimator exposing ``predict``
    X_test: test features
    y_test: test targets
    plot_residuals: when true, draw a residual plot on a new 9x9 figure
    title: suffix for the plot title

    Returns a dict with keys ``'explained_variance'`` and ``'mse'``.
    """
    from sklearn.metrics import mean_squared_error, explained_variance_score

    y_pred = model.predict(X_test)

    if plot_residuals:
        _, ax = plt.subplots(figsize=(9, 9))
        # Keyword x/y: recent seaborn releases reject positional data.
        sns.residplot(x=y_pred.squeeze(),
                      y=y_test.squeeze(),
                      lowess=True,
                      ax=ax,
                      scatter_kws={'alpha': 0.3},
                      line_kws={
                          'color': 'black',
                          'lw': 2,
                          'ls': '--'
                      })
        # Label after plotting: seaborn relabels the axes from the names of
        # its inputs, which would replace labels set beforehand.
        ax.set_title('Residuals Plot - ' + title, fontsize=19)
        ax.set_xlabel('Predicted values', fontsize=15)
        ax.set_ylabel('Residuals', fontsize=15)

    metrics = {
        'explained_variance': explained_variance_score(y_test, y_pred),
        'mse': mean_squared_error(y_test, y_pred)
    }
    return metrics
Пример #14
0
def residual_analysis(df, x, y, model) :
    """
    Show residual diagnostics for ``model`` applied to column ``x`` of ``df``.

    First figure (2x2): raw data with the fit, standardised residuals against
    the response, seaborn residual plot against the explanatory variable, and
    the distribution of the standardised residuals.  Second figure: a
    probability plot of the standardised residuals.

    df: DataFrame holding the data
    x: name of the explanatory column
    y: name of the response column
    model: callable mapping ``df[x]`` to predictions
    """
    preds = model(df[x])
    residuals = df[y] - preds
    stdResiduals = residuals / residual_standard_error(residuals)

    fig, axs = plt.subplots(2, 2, figsize=(8,8))
    axs[0][0].scatter(df[x], df[y])
    axs[0][0].plot(df[x], preds)
    axs[0][0].set_xlabel(x)
    axs[0][0].set_ylabel(y)
    plt.suptitle("                Raw plus fit / Response v Residuals \n Explan v Residuals / Residual hist")

    axs[0][1].scatter(df[y], stdResiduals)
    # Zero reference line across the whole panel; the hand-built np.arange
    # range failed or came out empty when the response maximum was <= 0.
    axs[0][1].axhline(0, linestyle="--", color="gray")
    # Label this panel directly: plt.xlabel/plt.ylabel act on the current
    # axes, which is not this subplot.
    axs[0][1].set_xlabel(y)
    axs[0][1].set_ylabel("residual")

    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=df[x], y=df[y], lowess=True, ax=axs[1][0])
    axs[1][0].set_ylabel("residual")

    sns.distplot(stdResiduals, ax=axs[1][1])
    axs[1][1].set_xlabel("residual")
    plt.show()

    fig, axs = plt.subplots(figsize=(5,5))
    stats.probplot(stdResiduals, plot=plt)
    plt.show()
    def residual_plot(self, ax=None):
        """
        Residual vs Fitted Plot

        Graphical tool to identify non-linearity.
        (Roughly) Horizontal red line is an indicator that the residual has a linear pattern

        ax: axes to draw on; a new figure is created when None.
        Returns the axes drawn on.  The three observations with the largest
        absolute residuals are annotated with their positional index.
        """
        if ax is None:
            _, ax = plt.subplots()

        sns.residplot(x=self.y_predict,
                      y=self.residual,
                      lowess=True,
                      scatter_kws={'alpha': 0.5},
                      line_kws={
                          'color': 'red',
                          'lw': 1,
                          'alpha': 0.8
                      },
                      ax=ax)

        # annotations
        # argsort yields the positions of the largest |residuals|; the
        # original sorted the values themselves and then annotated
        # observations 0, 1 and 2 regardless of their size.
        fitted = np.asarray(self.y_predict)
        resid = np.asarray(self.residual)
        top_3_positions = np.argsort(np.abs(resid))[::-1][:3]
        for i in top_3_positions:
            ax.annotate(int(i),
                        xy=(fitted[i], resid[i]),
                        color='C3')

        ax.set_title('Residuals vs Fitted', fontweight="bold")
        ax.set_xlabel('Fitted values')
        ax.set_ylabel('Residuals')
        return ax
 def resid_plot(self, var_x=None, var_y=None):
     """
     Create a residuals VS fitted plot
     Input:
         - var_x: A list of predictor variable(s) (default=None)
         - var_y: A response variable (default=None)

     When either argument is missing, the stored predictors/response are
     used instead; ValueError is raised if those are not set either.
     """
     dataframe = self.get_dataframe()
     # priority: arguments var_x, var_y; otherwise fall back to stored values
     if var_x is None or var_y is None:
         var_x = self.get_predictors()
         var_y = self.get_response()
         if var_x is None or var_y is None:
             raise ValueError('No predictors or response assigned')
     self.reg(var_x, var_y)
     model = self.get_model()
     fitted = model.fittedvalues
     # Keyword x/y: recent seaborn releases reject positional data arguments.
     sns.residplot(x=fitted,
                   y=var_y,
                   data=dataframe,
                   lowess=True,
                   scatter_kws={'alpha': 0.5},
                   line_kws={
                       'color': 'red',
                       'lw': 1,
                       'alpha': 1
                   })
     plt.title('Residuals vs Fitted')
     plt.xlabel('Fitted values')
     plt.ylabel('Residuals')
Пример #17
0
def scale_transform_validate(df, target, residuals=False, selection=False):
    """
    Standard-scale the features, fit a linear regression and report RMSE.

    The data is split 80/20 (``random_state=9``); the scaler is fitted on the
    training part only and applied to both parts.

    params : df - Data frame with features (numeric)
             target - series with numeric values!
             residuals = False. Plot residuals if True
             selection = False. If True, return the rounded
                         (train_rmse, test_rmse) instead of printing

    Returns ``(round(train_rmse), round(test_rmse))`` when ``selection`` is
    true; otherwise prints the comparison, shows the figure, returns None.
    """

    X_train, X_test, y_train, y_test = train_test_split(df,
                                                        target,
                                                        random_state=9,
                                                        test_size=0.2)
    # Fit the scaler on the training data only, then transform both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train_scaled = pd.DataFrame(data=scaler.transform(X_train),
                                  columns=df.columns,
                                  index=X_train.index)
    X_test_scaled = pd.DataFrame(data=scaler.transform(X_test),
                                 columns=df.columns,
                                 index=X_test.index)

    # Fit the model to the training data.
    lm = LinearRegression()
    lm = lm.fit(X_train_scaled, y_train)

    # Use the model to predict on the training set and the test set.
    y_train_pred = lm.predict(X_train_scaled)
    y_test_pred = lm.predict(X_test_scaled)

    train_rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_test_pred))

    if residuals:
        # Keyword x/y: recent seaborn releases reject positional data.
        sns.residplot(x=y_test, y=y_test_pred, lowess=True, color='g')

    if selection:
        return round(train_rmse), round(test_rmse)

    # Comparing our Model's performance on training data versus test data
    print('Training: RMSE', int(train_rmse), "vs. Testing: RMSE",
          int(test_rmse))
    print('Perfomance : {} %'.format(
        round(abs((test_rmse - train_rmse) / train_rmse) * 100)))

    return plt.show()
def checkReg(t, rMean):
    """
    Draw a residual plot of ``rMean`` regressed on ``t`` (no lowess curve).

    t: time values (x axis)
    rMean: ROI mean values (y axis)
    """
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=t, y=rMean, lowess=False, color="g")

           
Пример #19
0
def plot_residuals(df, actual, predicted):
    '''
    Draw a seaborn residual plot with ``actual`` on x and ``predicted`` on y.

    df: dataframe used to resolve column names
    actual: data, or name of a column in df, used as the x variable
    predicted: data, or name of a column in df, used as the y variable
    '''
    residplot_kwargs = {'x': actual, 'y': predicted, 'data': df}
    sns.residplot(**residplot_kwargs)
Пример #20
0
def compare_error_variance(df1, df2, predictor, target):
    """Draw side-by-side residual plots of target vs predictor: df1 ('Before') and df2 ('After')."""
    fig, (ax1, ax2) = plt.subplots(figsize=(22, 6),
                                   ncols=2,
                                   sharey=False,
                                   sharex=False)
    panels = ((df1, ax1, 'Before'), (df2, ax2, 'After'))
    for frame, axis, label in panels:
        drawn = sns.residplot(x=frame[predictor], y=frame[target], ax=axis)
        drawn.set_title(label)
Пример #21
0
    def sns_residplot_impl(
            self,
            x_data,  # column name from df (str)
            y_data,  # column name from df (str)
            df,
            scatter_color='green'):
        """Show a seaborn residual plot of ``y_data`` against ``x_data`` from ``df``."""
        residplot_kwargs = {
            'x': x_data,
            'y': y_data,
            'data': df,
            'color': scatter_color,
        }
        sns.residplot(**residplot_kwargs)
        plt.show()
Пример #22
0
def residual_predicted_df(predicted_df):
    """
    Show a residual plot of predicted against observed values.

    predicted_df: frame with ``observed`` and ``predicted`` columns
    """
    import seaborn as sns
    sns.set(style="whitegrid")
    # Plot the residuals after fitting a linear model
    # (keyword x/y: recent seaborn releases reject positional data)
    sns.residplot(x=predicted_df.observed.values,
                  y=predicted_df.predicted.values,
                  lowess=True,
                  color="g")
    pr.plt.show()
Пример #23
0
    def plot_residual_fitted_values(self, y):
        """
        Draw a residuals-vs-fitted plot and annotate the largest residuals.

        y: observed response values, plotted against ``self.model.fittedvalues``

        The three entries of ``self.model_abs_resid`` with the largest values
        are annotated with their index labels.
        """
        # Keyword x/y: recent seaborn releases reject positional data.
        sns.residplot(x=self.model.fittedvalues,
                      y=y,
                      lowess=True,
                      scatter_kws={'alpha': 0.5},
                      line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
        plt.title('Resíduos vs Ajustados', fontsize=20)
        plt.xlabel('Valores Ajustados')
        plt.ylabel('Resíduos')

        top_3_abs_resid = self.model_abs_resid.sort_values(ascending=False)[:3]
        for index in top_3_abs_resid.index:
            plt.annotate(index, xy=(self.model.fittedvalues[index], self.model.resid[index]))
Пример #24
0
def get_results(Y_test, predictions, model_name):
    """
    Print the MAE and draw residual and predicted-vs-true plots.

    Y_test: true target values
    predictions: model predictions for the same rows
    model_name: label used in the residual plot title
    """
    mae_RFR = mean_absolute_error(predictions, Y_test)
    print("MAE: {}".format(mae_RFR))
    sns.set(style="whitegrid")
    fig, axs = plt.subplots(ncols=2, sharey=False, figsize=(15, 5))
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=predictions, y=Y_test, color="g",
                  ax=axs[0]).set_title("Residuals plot of " + model_name)
    sns.scatterplot(x=Y_test, y=predictions,
                    ax=axs[1]).set_title("Model Error")
    axs[1].set(xlabel='True Values', ylabel='Predicted Values')
def residPlots(dict_stats,key):
    """
    Redraw the shared figure with the charge residual plot for ``key``.

    dict_stats: mapping whose ``key`` entry holds 'sub_charge' and 'sub_slope'
    key: entry to plot; also used in the title

    Uses the module-level ``fig``, text-size settings and ``app``; a refresh
    of plot "p1" is queued at the end.
    """
    fig.clf()
    axis = fig.add_subplot()
    entry = dict_stats[key]
    sns.residplot(x=entry['sub_charge'], y=entry['sub_slope'], ax=axis)
    axis.set_title("{0} Charge Residual".format(key), size=title_text_size)
    for which in ('y', 'x'):
        axis.tick_params(axis=which, labelsize=tick_text_size)
    axis.set_xlabel('Charge', size=axis_text_size)
    axis.set_ylabel('Slope Residual', size=axis_text_size)
    app.queueFunction(app.refreshPlot, "p1")
Пример #26
0
def wykres_5(x, y, nazwa_wykres, nazwa_x, nazwa_y):
    """
    Build a lowess residual plot from the module-level ``selected_data``.

    x, y: variables passed to seaborn together with ``selected_data``
    nazwa_wykres: plot title
    nazwa_x, nazwa_y: axis labels

    Returns the created figure.
    """
    f, ax = plt.subplots(figsize=(10, 8))

    # Draw on the axes created above rather than the implicit current axes.
    sns.residplot(data=selected_data, x=x, y=y, lowess=True, ax=ax)

    # Titles and labels are applied after plotting: seaborn relabels the axes
    # from the variable names, which replaced the custom labels before.
    ax.set_title(nazwa_wykres, fontsize=16)
    ax.set_ylabel(nazwa_y, fontsize=14)
    ax.set_xlabel(nazwa_x, fontsize=14)

    return f
Пример #27
0
def eda(team_df):
    """
    Run exploratory plots for one team and return its summary statistics.

    Uses the module-level ``num_cols`` list of numeric columns.  Draws a
    correlation heatmap, per-column distributions (mean in red, median in
    green), attendance box plots by year, scatter and residual plots of each
    numeric column against attendance, and swarm/point plots by month and
    weekday.

    team_df: DataFrame with the ``num_cols`` columns plus 'year',
        'attendance', 'month' and 'weekday'

    Returns the transposed ``describe()`` table with an added 'skew' column.
    """
    #basic statistics
    team_stats = team_df[num_cols].describe().T
    team_stats['skew'] = [
        stats.skew(team_df[num_col]) for num_col in team_stats.index
    ]

    #heatmap
    plt.figure(figsize=(20, 10))
    sns.heatmap(team_df[num_cols].corr(), annot=True)

    #distribution plot
    fig = plt.figure(figsize=(30, 40))
    for i, num_col in enumerate(team_stats.index):
        fig.add_subplot(7, 2, 1 + i)
        sns.distplot(team_df[num_col])

        mean_value = team_df[num_col].mean()
        plt.axvline(mean_value, c='red')

        median_value = team_df[num_col].median()
        plt.axvline(median_value, c='green')

    #attendance box plot by year
    plt.figure(figsize=(20, 10))
    # Sorted list: a bare set has no defined iteration order, so the boxes
    # could appear in a different order from run to run.
    order = sorted(set(team_df['year'].values))
    sns.boxplot(x='year', y='attendance', data=team_df, order=order)

    #scatterplot with attendance
    fig = plt.figure(figsize=(20, 20))
    for i, num_col in enumerate(num_cols):
        fig.add_subplot(7, 2, 1 + i)
        plt.scatter(team_df[num_col], team_df['attendance'])

    #residualplot
    fig = plt.figure(figsize=(20, 20))
    for i, num_col in enumerate(num_cols):
        fig.add_subplot(7, 2, 1 + i)
        sns.residplot(x=num_col, y='attendance', data=team_df)

    #swarmplot
    plt.figure(figsize=(15, 5))
    sns.swarmplot(x='month', y="attendance", hue='weekday', data=team_df)
    plt.figure(figsize=(15, 5))
    sns.swarmplot(x='weekday', y="attendance", hue='month', data=team_df)

    #pointplot
    plt.figure(figsize=(10, 5))
    sns.pointplot(x="weekday", y="attendance", hue="month", data=team_df)
    plt.figure(figsize=(10, 5))
    sns.pointplot(x="month", y="attendance", hue="weekday", data=team_df)

    return team_stats
Пример #28
0
def Q_6_12():
    """
    Principal-component regression for exercise 6.12.

    Reads ``Data/Q_6_12.csv``, reduces the predictors to two principal
    components, fits a linear regression on an 80% training split, prints the
    fit and its R^2 on the held-out part, and saves a prediction-vs-actual
    plot and a residual plot under ``result/线性统计/6.12/``.
    """
    data = pd.read_csv('Data/Q_6_12.csv')
    # DataFrame.drop returns a new frame; the original discarded that result,
    # so the response column Y leaked into the PCA inputs.
    X = data.drop(columns='Y')
    Y = data['Y']
    pca = PCA(n_components=2)
    # fit_transform already fits; the extra pca.fit(X) call was redundant.
    newX = pca.fit_transform(X)
    print('贡献率:\n',pca.explained_variance_ratio_)  # explained variance ratio
    print('主成分分析后的自变量:\n',newX)
    X_train,X_test,Y_train,Y_test = train_test_split(newX,Y,train_size=.80)
    model = LinearRegression()
    model.fit(X_train,Y_train)
    a = model.intercept_  # intercept
    b = model.coef_  # regression coefficients
    print("最佳拟合线:截距", a, ",回归系数:",b)

    # R-squared check
    # The coefficient of determination R^2 measures model accuracy:
    #   SSE              = sum((y_actual - y_predicted)^2)
    #   total variation  = sum((y_actual - y_mean)^2)
    #   share of variation NOT described by the fitted line = SSE / total
    #   share of variation described by the fitted line     = 1 - SSE / total = R^2
    # A higher R^2 means a better fit: 1 is an error-free fit, 0 means the
    # line explains nothing.
    score = model.score(X_test,Y_test)
    print('R方检测: ',score)

    # Predict with the fitted linear regression
    Y_pred = model.predict(X_test)
    print(Y_pred)

    # Show predictions against the held-out values
    plt.figure()
    plt.plot(range(len(Y_pred)),Y_pred,'b',label="predict")
    plt.plot(range(len(Y_pred)),Y_test,'r',label="test")
    plt.legend(loc="upper right")  # show the labels in the figure
    plt.xlabel("the number of Y")
    plt.ylabel('Y')
    plt.title('预测与源数据对比图')
    plt.savefig("result/线性统计/6.12/compare_linear.jpg")
    plt.show()

    # Residuals (predicted minus actual), plotted against the sample index
    y_dif = Y_pred - Y_test.values
    df = pd.DataFrame({'x': range(len(y_dif)), 'y': y_dif})
    sns.residplot(x="x", y="y",data=df)
    # Title is set before saving so it is part of the written image.
    plt.title('残差图')
    plt.savefig("result/线性统计/6.12/残差图.jpg")
    plt.show()
Пример #29
0
def model_evaluation():
    """
    Demonstrate out-of-sample evaluation with cross validation.

    Tells us how our model performs in the real world
    Difference with in-sample evaluation:
        - In-sample tells us how well our model fits the data already given to train it
        - Problem: It does not tell us how well the trained model can be used to predict new data
        - Solution: Split data in sets:
            - Training data: Train it with in-sample evaluation
            - Testing data: held back to measure performance on unseen data

    Example:
        - Train 70% of the data
        - Test 30% of the data

    There exists a generalization error that involves the percentages of data used for training
    and testing. To overcome this issue, we use

    Cross Validation
        - Most common out-of-sample (testing) evaluation metric
        - More effective use of data (each observation is used for both training and testing)

    The function loads the data with ``util.create_df()``, prints the mean
    and standard deviation of the 3-fold scores for price ~ highway-mpg, and
    shows two regression plots and one residual plot.
    """
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from sklearn.linear_model import LinearRegression

    df = util.create_df()

    x_data = df[['highway-mpg']]
    y_data = df['price']

    lr = LinearRegression()
    lr.fit(x_data, y_data)
    # cv specifies how many folds to use
    scores = cross_val_score(lr, x_data, y_data, cv=3)
    print(f'Mean: {scores.mean()}. Standard deviation: {scores.std()}')
    # Out-of-fold predictions; computed for demonstration, not used below.
    predicted_scores = cross_val_predict(lr, x_data, y_data, cv=3)

    # Visualize the model
    width, height = 12, 10
    plt.figure(figsize=(width, height))
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    plt.figure(figsize=(width, height))
    sns.regplot(x="peak-rpm", y="price", data=df)
    plt.ylim(0, )
    plt.show()

    plt.clf()

    plt.figure(figsize=(width, height))
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=df['highway-mpg'], y=df['price'])
    plt.show()
Пример #30
0
def get_residual_plot(X_col,
                      y,
                      rows=1,
                      cols=1,
                      position=1,
                      ax=False,
                      title=False):
    """
    Draw a residual plot of ``y`` against ``X_col`` in one subplot cell.

    X_col: predictor values
    y: response values
    rows, cols, position: arguments forwarded to ``plt.subplot``
    ax: accepted for compatibility; not used
    title: when truthy, text written as the x-axis label of the subplot
    """
    plt.subplot(rows, cols, position)
    # Keyword x/y: recent seaborn releases reject positional data arguments.
    sns.residplot(x=X_col, y=y)
    if title:
        plt.xlabel(title)
Пример #31
0
def fitAndPlot(x,y,title, degree = 1 , color='r'):
    """
    Plot a polynomial fit of ``y`` on ``x``, or a residual plot.

    x, y: data to fit
    title: legend label for the data; also prefixes the fitted-line label
    degree: polynomial degree; when not positive, a seaborn residual plot is
        drawn instead of the fit
    color: matplotlib colour code used for the points and the fitted line
    """
    if degree > 0:
        rm = polyfit(x,y,degree)
        rpyp = polyval(rm,x)
        plot(x, y,color+'o',label=title)
        # Label carries the first characters of the coefficient array's text.
        ttlAndEqn = title +":y="+str(rm)[1:6]
        plot(x, rpyp, color+'--',label=ttlAndEqn)
    else:
        # Keyword x/y: recent seaborn releases reject positional data.
        sns.residplot(x=x, y=y)
Пример #32
0
def fitAndPlot(x, y, title, degree=1, color='r'):
    """
    Plot a polynomial fit of ``y`` on ``x``, or a residual plot.

    x, y: data to fit
    title: legend label for the data; also prefixes the fitted-line label
    degree: polynomial degree; when not positive, a seaborn residual plot is
        drawn instead of the fit
    color: matplotlib colour code used for the points and the fitted line
    """
    if degree > 0:
        rm = polyfit(x, y, degree)
        rpyp = polyval(rm, x)
        plot(x, y, color + 'o', label=title)
        # Label carries the first characters of the coefficient array's text.
        ttlAndEqn = title + ":y=" + str(rm)[1:6]
        plot(x, rpyp, color + '--', label=ttlAndEqn)
    else:
        # Keyword x/y: recent seaborn releases reject positional data.
        sns.residplot(x=x, y=y)
Пример #33
0
import seaborn as sns


# In[10]:

# lm = linear model

# x/y are passed by keyword and the facet size via `height`: recent seaborn
# releases reject positional data arguments and the old `size` keyword.
g = sns.lmplot(x="Temp", y="Gas", hue="Insul", data=whiteside, height=8)
g.set_axis_labels("Temp", "Gas")


# In[11]:

# lm residuals
sns.residplot(x='Temp', y='Gas', data=whiteside[whiteside.Insul=='Before'])


# In[12]:

# joint regression plot with marginal histograms of each dimension

sns.jointplot(x="Temp", y="Gas", data=whiteside[whiteside.Insul=='After'], kind='reg',
              height=8, xlim=(-5,15), color='b')
plt.title("After")


# In[13]:

# styled residual plot of 'foreign' against 'yhat' from df2
sns.set_style("ticks")
# NOTE(review): despine runs before the new figure is created, so it acts on
# whatever figure is current at that point -- confirm this is intended.
sns.despine(offset=10, trim=True)
fig = plt.figure(figsize=(13,12))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
plt.ylim(-500000000, 700000000)
plt.xlim(-50000000, 600000000)
# tick labels are shown in units of 1e8
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('%.0f')%(x*1e-8)))
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, pos: ('%.0f')%(x*1e-8)))
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
sns.residplot(x='yhat', y='foreign', data=df2, color = '#226b53', scatter_kws={"s": 80}, lowess = True)

ax.set_xlabel('')
plt.ylabel('Residuals', fontsize=20)
fig.savefig('temp.png', transparent=True)

# Movie genre bar graph

sns.set(style = 'white')
fig = plt.figure(figsize=(26,12))
ax = fig.add_subplot(111)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.set_ylabel('')
Пример #35
0
# In[ ]:




# In[164]:

# Linear-model plot of diagnosis against cholesterol from df.
# NOTE(review): all seaborn calls in these cells pass their data
# positionally; confirm the installed seaborn release still accepts that.
sns.lmplot('cholesterol', 'diagnosis', df)


# In[165]:

# 2x2 grid: regression plot on ax1, box plot on ax2.
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)
sns.regplot('cholesterol', 'diagnosis', df, ax=ax1)
sns.boxplot(df["cholesterol"], df["diagnosis"], color="Blues_r", ax=ax2).set_ylabel("")
# Squared prediction error of linear_reg against cholesterol, with order=1
# and then order=2.  No ax= is passed to these two calls, so seaborn chooses
# the axes itself.
sns.residplot(df["cholesterol"], (linear_reg.predict(features) - response) ** 2, color="indianred", order=1, lowess=True)
sns.residplot(df["cholesterol"], (linear_reg.predict(features) - response) ** 2, color="indianred", order=2, lowess=True)
ax1.set_title('Regression')
ax2.set_title('ax2 title')
f.tight_layout()

# Same kind of plot built from X and y, drawn explicitly on ax3.
sns.residplot(X.cholesterol, (linear_reg.predict(X) - y) ** 2, color="indianred", lowess=True, ax=ax3)
# f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex='col', sharey='row')
# ax1.plot(x, y)
# ax1.set_title('Sharing x per column, y per row')
# ax2.scatter(x, y)
# ax3.scatter(x, 2 * y ** 2 - 1, color='r')
# ax4.plot(x, 2 * y ** 2 - 1, color='r')

# f, axarr = plt.subplots(2, 2)
# axarr[0, 0].plot(x, y)
Пример #36
0
"""
Plotting model residuals
========================

"""
import numpy as np
import seaborn as sns
sns.set(style="whitegrid")

# Make an example dataset with y ~ x
rs = np.random.RandomState(7)
x = rs.normal(2, 1, 75)
y = 2 + 1.5 * x + rs.normal(0, 2, 75)

# Plot the residuals after fitting a linear model
# (keyword x/y: recent seaborn releases reject positional data arguments)
sns.residplot(x=x, y=y, lowess=True, color="navy")
Пример #37
0
# Linear regression on a standardised (centred and scaled) explanatory variable

df = df_densites.dropna()
X = df.iloc[:, 0]
y = df['MOYENNE DEPASSEMENT']
# centre and scale X
mean = X.mean()
std = X.std()
Xcr = (X - mean)/std
# standardised X next to the mean-centred response
dat = pd.concat([Xcr, y-y.mean()], axis=1)
dat.columns = ['DENSITE_SPECIALISTES', 'MOYENNE_DEPASSEMENT']
#print(dat)
#plt.scatter(Xcr, y, c='blue', alpha=0.5)
# regression plot, then residual plot, of the same two variables
ax = sns.regplot(x='DENSITE_SPECIALISTES', y='MOYENNE_DEPASSEMENT', data=dat)
plt.show()
ax = sns.residplot(x=Xcr, y=y-y.mean())
plt.show()

#
# Linear regression with statsmodels
#
result = smf.ols('MOYENNE_DEPASSEMENT ~ DENSITE_SPECIALISTES', data=dat).fit()
print(result.params)
# distribution of the residuals with the `norm` distribution fitted on top
sns.distplot(result.resid, fit=norm)
plt.title('Residuals vs. Normal distribution')
plt.show()
#print(df_densites)

# correlation between the density of babies and of paediatricians
# Linear regression on a standardised (centred and scaled) explanatory variable
'''
Plotting residuals of a regression

Often, you don't just want to see the regression itself but also see the residuals to get a better idea how well the regression captured the data. Seaborn provides sns.residplot() for that purpose, visualizing how far datapoints diverge from the regression line.

In this exercise, you will visualize the residuals of a regression between the 'hp' column (horse power) and the 'mpg' column (miles per gallon) of the auto DataFrame used previously.

INSTRUCTIONS
100XP
Import matplotlib.pyplot and seaborn using the standard names plt and sns respectively.
Generate a green residual plot of the regression between 'hp' (on the x-axis) and 'mpg' (on the y-axis). You will need to specify the additional data and color parameters.
Display the plot as usual using plt.show(). This has been done for you, so hit 'Submit Answer' to view the plot.
'''
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Green residual plot of the regression of 'mpg' (y) on 'hp' (x), with both
# columns looked up in the `auto` DataFrame
sns.residplot(x='hp', y='mpg', data=auto, color='green')

# Display the plot
plt.show()
Пример #39
0
plt.show()

# Calculate the predictions and their differences from the observed values
training_set['predictions'] = np.dot(features_norm, thetas)
training_set['difference'] = training_set['predictions'] - training_set['p.Open']
training_set.head()
# print is called as a function: the statement form is a syntax error on
# Python 3, while this form runs on both Python 2 and 3.
print(training_set)

# Plot the predicted against the observed values
# (keyword x/y and `height`: recent seaborn releases reject positional data
# arguments and the old `size` keyword)
p = sns.lmplot(x="predictions", y="open", data=training_set, height=7)
p.set_axis_labels("Predicted Open", "Actual Open")
plt.show()


# Plot the residuals
p = sns.residplot(x=training_set.predictions, y=training_set.open, lowess=True)
plt.show()


# Calculate the coefficient of determination (r^2)
y = np.array(training_set.open)
p = np.array(training_set.predictions)
xbar = np.mean(y)

r_squared = 1 - np.square(y - p).sum() / np.square(y - xbar).sum()
print(r_squared)

# mean squared error of the predictions
mse = ((y - p) ** 2).mean(axis=None)

print("Alpha:" + str(alpha))
print("Number of Iterations: " + str(iterations))
# Columns 0 and 3 of auto_csv, bound to the names mpg and hp
mpg = auto_csv[:, 0]
hp = auto_csv[:, 3]

auto = df1

# Load the tips data set from tips.csv
tips_csv = pd.read_csv('tips.csv')
tips = tips_csv

import matplotlib.pyplot as plt
import seaborn as sns

# Regression of tip on total_bill, one fit per value of 'sex'
sns.lmplot(x='total_bill', y='tip', data=tips, hue='sex', palette='Set1')
plt.show()

# Residuals of the regression of tip on total_bill
sns.residplot(x='total_bill', y='tip', data=tips, color='indianred')
plt.show()

# Strip plot of tip by day
sns.stripplot(x='day', y='tip', data=tips)
plt.ylabel('tip ($)')
plt.show()

# Same strip plot with jitter enabled and marker size 4
sns.stripplot(x='day', y='tip', data=tips, jitter=True, size=4)
plt.ylabel('tip ($)')
plt.show()

# Swarm plot of tip by day
sns.swarmplot(x='day', y='tip', data=tips)
plt.ylabel('tip ($)')
plt.show()

# Swarm plot of tip by day, coloured by 'sex'
sns.swarmplot(x='day', y='tip', data=tips, hue='sex')