def analysis_of_variance():
    """
    ANOVA (Analysis of Variance)

    Why? To test whether the groups of a categorical variable have
    significantly different means of the target variable.

    What do we get from ANOVA?
    - F-test score: variation between the sample group means divided by the
      variation within the sample groups
    - P-value: how statistically significant the computed F score is

    Notes:
    - A small F score implies a weak relationship between the variable
      categories and the target variable.
    - A large F score implies a strong relationship.
    """
    df = util.create_df()
    df_anova = df[['make', 'price']]
    grouped_anova = df_anova.groupby(['make'])

    anova_results_1 = stats.f_oneway(
        grouped_anova.get_group('honda')['price'],
        grouped_anova.get_group('subaru')['price'])

    anova_results_2 = stats.f_oneway(
        grouped_anova.get_group('honda')['price'],
        grouped_anova.get_group('jaguar')['price'])

    print(anova_results_1)
    print(anova_results_2)
def descriptive_statistics():
    df = util.create_df()

    # Descriptive Statistics
    # Generate various summary statistics, excluding NaN values
    print(df.describe())

    # Summarize categorical data
    print(df['drive-wheels'].value_counts())

    # Box plots help spot outliers in a data set
    sns.boxplot(x='drive-wheels', y='price', data=df)
    plt.show()

    # Clear the current figure so it does not interfere with our new plot
    plt.clf()

    # A scatter plot shows the relationship between two variables:
    # the PIV (predictor) on the x-axis, the TDV (target) on the y-axis
    x = df['engine-size']
    y = df['price']
    plt.scatter(x, y)
    plt.ylim(bottom=0)
    plt.xlim(left=0)
    plt.title('Scatterplot of Engine Size vs Price')
    plt.xlabel('Engine Size')
    plt.ylabel('Price')
    plt.show()
def group_by():
    """
    Group By

    Used on categorical variables (drive-wheels, body-style, etc.).
    Groups data into subsets according to the different categories of the
    variable, so a numeric variable (e.g. price) can be aggregated per group.
    Can be done on single or multiple variables.
    """
    df = util.create_df()
    df_test = df[['drive-wheels', 'body-style', 'price']]
    df_grp = df_test.groupby(['drive-wheels', 'body-style'], as_index=False).mean()

    """
    Pivot table & Heatmaps

    One variable is displayed along the columns and the other variable is
    displayed along the rows.
    """
    df_pivot = df_grp.pivot(index='drive-wheels', columns='body-style')

    plt.pcolor(df_pivot, cmap='RdBu')
    plt.colorbar()
    plt.show()
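# The heatmap drawn in group_by() has no axis labels, so it is hard to tell
# which cell belongs to which group. A minimal sketch of a labelled version,
# built on the same df_pivot (the figure handling shown here is just one way
# to do it, not the only one):
def group_by_labelled_heatmap():
    df = util.create_df()
    df_test = df[['drive-wheels', 'body-style', 'price']]
    df_grp = df_test.groupby(['drive-wheels', 'body-style'], as_index=False).mean()

    # Missing drive-wheels/body-style combinations are filled with 0 so pcolor
    # has a value for every cell
    df_pivot = df_grp.pivot(index='drive-wheels', columns='body-style').fillna(0)

    fig, ax = plt.subplots()
    im = ax.pcolor(df_pivot, cmap='RdBu')

    # Put the body styles on the x axis and the drive-wheel types on the y axis
    ax.set_xticks([i + 0.5 for i in range(df_pivot.shape[1])], minor=False)
    ax.set_yticks([i + 0.5 for i in range(df_pivot.shape[0])], minor=False)
    ax.set_xticklabels(df_pivot.columns.get_level_values(1), minor=False)
    ax.set_yticklabels(df_pivot.index, minor=False)

    fig.colorbar(im)
    plt.show()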
def correlation_simple():
    """
    Correlation: a statistical metric for measuring the interdependency of
    two variables, i.e. to what extent they change together.

    Examples:
        Lung cancer -> Smoking
        Rain -> Umbrella

    Correlation does not imply causation:
    the umbrella didn't cause the rain, and the rain didn't cause the umbrella.
    """
    df = util.create_df()

    # Positive Linear Relationship
    sns.regplot(x='engine-size', y='price', data=df)
    plt.ylim(0, )
    plt.show()
    plt.clf()

    # Negative Linear Relationship
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()
    plt.clf()

    # Weak Linear Relationship
    sns.regplot(x='peak-rpm', y='price', data=df)
    plt.ylim(0, )
    plt.show()
def correlation_statistics():
    """
    Pearson correlation

    - Correlation Coefficient. Interpretation:
        - Close to +1: Large positive relationship
        - Close to -1: Large negative relationship
        - Close to 0: No relationship
    - P-value. Strength of result certainty:
        - < 0.001: Strong certainty
        - < 0.05: Moderate certainty
        - < 0.1: Weak certainty
        - > 0.1: No certainty

    Notes: https://en.wikipedia.org/wiki/Correlation_and_dependence
    - We can say there's a strong correlation when:
        1. The correlation coefficient is close to 1 or -1
        2. The p-value is less than 0.001
    - The correlation coefficient comes back as NaN when either column still
      contains missing values (or has zero variance), so clean the data first.
    """
    df = util.create_df()
    df['horsepower'] = df['horsepower'].astype(float)

    pearson_coef, p_value = stats.pearsonr(df['horsepower'], df['price'])
    print('Coef: {} | P value: {}'.format(pearson_coef, p_value))
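# The NaN note above deserves a concrete illustration. A minimal sketch (this
# helper name is hypothetical, not part of util): drop rows with missing
# values in either column before calling stats.pearsonr, since a single NaN
# makes the coefficient NaN.
def correlation_statistics_nan_safe():
    df = util.create_df()
    df['horsepower'] = df['horsepower'].astype(float)

    # Keep only rows where both columns have values
    clean = df[['horsepower', 'price']].dropna()

    pearson_coef, p_value = stats.pearsonr(clean['horsepower'], clean['price'])
    print('Coef: {} | P value: {}'.format(pearson_coef, p_value))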
def binning():
    """
    Binning: grouping values into bins

    Converts a numeric variable into a categorical one by grouping a set of
    numerical values into a set of bins.
    Sometimes this can improve the accuracy of the models built on the data.
    """
    df = util.create_df()
    util.replace_nan_with_mean(df['horsepower'], 'float')
    df['horsepower'] = df['horsepower'].astype(int)

    # Return evenly spaced numbers over a specified interval - 4 dividers (3 bins)
    bins = np.linspace(min(df['horsepower']), max(df['horsepower']), 4)

    # Set names for our groups
    group_names = ['Low', 'Medium', 'High']

    # Bin values into discrete intervals:
    # it runs through every value and applies the label matching its range
    df['horsepower-binned'] = pd.cut(
        df['horsepower'],
        bins,
        labels=group_names,
        include_lowest=True
    )

    # Visualize it (reindex so the counts line up with the Low/Medium/High labels)
    fig = plt.figure(figsize=(12, 14))
    plt.bar(group_names, df['horsepower-binned'].value_counts().reindex(group_names))
    fig.suptitle('Horsepower Bins', fontsize=18)
    plt.xlabel('Horsepower', fontsize=18)
    plt.ylabel('count', fontsize=16)
    plt.show()
def dummies():
    """
    Turning categorical variables into quantitative variables.

    Solution: add dummy variables for each unique category and assign 0 or 1
    in each category.

    e.g. the 'fuel' column (type: object) with entries 'gas' and 'diesel'
    becomes two indicator columns, one per fuel type, holding 0 or 1.

    1) One-hot encoding: pandas.get_dummies()

    Indicator Variable
    An indicator variable (or dummy variable) is a numerical variable used to
    label categories. They are called 'dummies' because the numbers themselves
    don't have inherent meaning.
    """
    df = util.create_df()
    dummy_var = pd.get_dummies(df['fuel'])

    # merge data frame "df" and "dummy_var"
    df = pd.concat([df, dummy_var], axis=1)

    # drop the original "fuel" column from "df", it is now redundant
    df.drop("fuel", axis=1, inplace=True)
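# A minimal sketch of what the indicator columns look like, assuming the
# 'fuel' column holds the values 'gas' and 'diesel' (the renamed column names
# are illustrative): get_dummies creates one 0/1 column per category, and
# renaming them keeps the origin of the dummies clear.
def dummies_renamed():
    df = util.create_df()
    dummy_var = pd.get_dummies(df['fuel'])

    # e.g. a 'gas' row    -> gas: 1, diesel: 0
    #      a 'diesel' row -> gas: 0, diesel: 1
    dummy_var = dummy_var.rename(columns={'gas': 'fuel-gas', 'diesel': 'fuel-diesel'})
    print(dummy_var.head())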
def multiple_linear_regression():
    """
    Uses 2+ PIVs to make 1 prediction (TDV)
    """
    df = util.create_df()
    x = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    y = df['price']

    lmr = linear_model.LinearRegression()
    lmr.fit(x, y)
    y_hat = lmr.predict(x)

    width, height = 12, 10
    plt.figure(figsize=(width, height))

    # Compare the distribution of the actual prices with the fitted values
    ax1 = sns.distplot(df['price'], hist=False, color="r", label="Actual Value")
    sns.distplot(y_hat, hist=False, color="b", label="Fitted Values", ax=ax1)

    plt.title('Actual vs Fitted Values for Price')
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Proportion of Cars')
    plt.show()
def polynomial_regression():
    """
    A special case of the general linear regression model.

    Useful for describing 'curvilinear' relationships, which is what you get
    by squaring or adding higher-order terms of the predictor variables.

    The model can be:
    - Quadratic (2nd order)
    - Cubic (3rd order)
    - Higher order (4th order +)

    The degree of the regression can make a big difference,
    so it pays to pick the right value.
    """
    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures

    df = util.create_df()
    x = df['horsepower']
    y = df['curb-weight']

    # One-dimensional cubic fit of curb-weight as a function of horsepower
    f = np.polyfit(x, y, 3)
    p = np.poly1d(f)
    print(p)

    # Polynomial features for more than one predictor
    pr = PolynomialFeatures(degree=2, include_bias=False)
    x_poly = pr.fit_transform(df[['horsepower', 'curb-weight']])
    print(x_poly)
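# np.poly1d returns a callable polynomial, so the cubic fit above can be used
# directly to estimate curb-weight for new horsepower values. A minimal sketch
# (the sample horsepower values are arbitrary illustrations):
def polynomial_regression_predict():
    import numpy as np

    df = util.create_df()
    f = np.polyfit(df['horsepower'], df['curb-weight'], 3)
    p = np.poly1d(f)

    # Evaluate the fitted polynomial at a few horsepower values
    for hp in (75, 100, 150):
        print(f'horsepower={hp} -> predicted curb-weight={p(hp):.0f}')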
def model_evaluation():
    """
    Tells us how our model performs in the real world.

    Difference with in-sample evaluation:
    - In-sample evaluation tells us how well our model fits the data already
      given to train it
    - Problem: it does not tell us how well the trained model can predict new data
    - Solution: split the data into sets:
        - Training data: train the model and evaluate it in-sample
        - Testing data: assess the model on data it has never seen (out-of-sample)
    - Example:
        - Train with 70% of the data
        - Test with 30% of the data

    The generalization error depends on which percentages of the data are used
    for training and testing. To overcome this issue, we use Cross Validation:
    - The most common out-of-sample (testing) evaluation approach
    - More effective use of data (each observation is used for both training
      and testing)
    """
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from sklearn.linear_model import LinearRegression

    df = util.create_df()
    x_data = df[['highway-mpg']]
    y_data = df['price']

    lr = LinearRegression()
    lr.fit(x_data, y_data)

    # cv specifies how many folds to use
    scores = cross_val_score(lr, x_data, y_data, cv=3)
    print(f'Mean: {scores.mean()}. Standard deviation: {scores.std()}')

    # Out-of-sample prediction for every observation, one fold at a time
    predicted_scores = cross_val_predict(lr, x_data, y_data, cv=3)
    print(predicted_scores[0:5])

    # Visualize the model
    width, height = 12, 10
    plt.figure(figsize=(width, height))
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()
    plt.clf()

    plt.figure(figsize=(width, height))
    sns.regplot(x="peak-rpm", y="price", data=df)
    plt.ylim(0, )
    plt.show()
    plt.clf()

    plt.figure(figsize=(width, height))
    sns.residplot(x=df['highway-mpg'], y=df['price'])
    plt.show()
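# The 70/30 split described in the docstring above is not shown in the
# function itself, so here is a minimal sketch of it (the function name is
# just for illustration): train on 70% of the observations, then score the
# held-out 30%.
def train_test_split_example():
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression

    df = util.create_df()
    x_data = df[['highway-mpg']]
    y_data = df['price']

    # test_size=0.3 keeps 30% of the rows for out-of-sample evaluation
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.3, random_state=0)

    lr = LinearRegression()
    lr.fit(x_train, y_train)

    # R^2 on data the model has never seen
    print('Out-of-sample R^2:', lr.score(x_test, y_test))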
def ridge_regression():
    """
    Helps prevent over-fitting, which is ALSO a big problem when you have
    multiple independent variables or features.

    If the estimated polynomial coefficients have a very large magnitude, we
    can use Ridge regression to control them with an 'alpha' parameter.

    Alpha is a parameter we select before fitting or training the model.
    As alpha increases, the estimated coefficients shrink towards 0.
    It must be selected carefully:
    - If alpha is too large, the coefficients approach 0 and the model under-fits
    - If alpha is 0, there is no regularization and over-fitting shows up again

    In order to select alpha, use cross validation (see the sketch after this
    function).
    """
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from sklearn.linear_model import Ridge, LinearRegression

    df = util.create_df()
    features = ['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']
    x_data = df.drop('price', axis=1)
    y_data = df['price']
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.15, random_state=1)

    lr = LinearRegression()
    lr.fit(x_train[features], y_train)

    yhat_train = lr.predict(x_train[features])
    print(f'Train: {yhat_train[0:5]}')

    yhat_test = lr.predict(x_test[features])
    print(f'Test: {yhat_test[0:5]}')

    title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
    DistributionPlot(y_train, yhat_train, "Actual Values (Train)",
                     "Predicted Values (Train)", title)

    # Ridge regression on the same numeric features, with a fixed alpha
    rm = Ridge(alpha=0.1)
    rm.fit(x_train[features], y_train)
    y_hat = rm.predict(x_test[features])
    print('predicted:', y_hat[0:4])
    print('test set :', y_test[0:4].values)
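# A minimal sketch of selecting alpha with cross validation, as suggested in
# the docstring above (the alpha grid and the function name are illustrative
# choices, not a prescription): pick the alpha with the best mean CV R^2.
def select_ridge_alpha():
    from sklearn.model_selection import cross_val_score
    from sklearn.linear_model import Ridge

    df = util.create_df()
    x = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    y = df['price']

    best_alpha, best_score = None, float('-inf')
    for alpha in [0.001, 0.01, 0.1, 1, 10, 100]:
        scores = cross_val_score(Ridge(alpha=alpha), x, y, cv=4)
        if scores.mean() > best_score:
            best_alpha, best_score = alpha, scores.mean()

    print(f'Best alpha: {best_alpha} (mean R^2: {best_score:.3f})')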
def simple_linear_regression():
    """
    Uses only 1 PIV to make 1 prediction (TDV)

    FORMULA: y = b0 + (b1 * x)
        b0: intercept
        b1: slope
    """
    from sklearn.metrics import mean_squared_error, r2_score

    df = util.create_df()

    # Define PIV and TDV
    x = df[['highway-mpg']]
    y = df['price']

    # Hold out the last 20 observations for testing
    x_train = x[:-20]
    x_test = x[-20:]
    y_train = y[:-20]
    y_test = y[-20:]

    lmr = linear_model.LinearRegression()

    # Train/Fit the model
    lmr.fit(x_train, y_train)
    y_predict = lmr.predict(x_test)

    # The coefficients
    print('Coefficients: \n', lmr.coef_)
    # The mean squared error
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_predict))
    # The coefficient of determination: 1 is perfect prediction
    print('Coefficient of determination: %.2f' % r2_score(y_test, y_predict))

    width, height = 12, 10
    plt.figure(figsize=(width, height))
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_predict, color='blue', linewidth=3)
    plt.ylim(0,)
    plt.title('SLR model for predicting price')
    plt.xlabel('Miles Per Gallon')
    plt.ylabel('Price')
    plt.show()
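# The b0 / b1 terms from the formula in the docstring above map directly onto
# the fitted estimator's attributes. A minimal sketch (the function and
# variable names are just for illustration): reconstruct the regression line
# y = b0 + b1 * x by hand.
def slr_coefficients():
    df = util.create_df()
    lmr = linear_model.LinearRegression()
    lmr.fit(df[['highway-mpg']], df['price'])

    b0 = lmr.intercept_   # intercept
    b1 = lmr.coef_[0]     # slope
    print(f'price = {b0:.2f} + ({b1:.2f} * highway-mpg)')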
def calculate_mean_squared_error():
    """
    As the MSE increases, the prediction will be less accurate.
    """
    from sklearn.metrics import mean_squared_error

    df = util.create_df()
    x = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    y = df['price']

    lmr = linear_model.LinearRegression()
    lmr.fit(x, y)
    y_hat = lmr.predict(x)

    mse = mean_squared_error(df['price'], y_hat)
    print(mse)
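# mean_squared_error is just the average of the squared residuals, so the
# value printed above can be reproduced by hand. A minimal sketch using numpy
# (the function name is illustrative): MSE = mean((y - y_hat)^2).
def calculate_mean_squared_error_by_hand():
    import numpy as np

    df = util.create_df()
    x = df[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']]
    y = df['price']

    lmr = linear_model.LinearRegression()
    lmr.fit(x, y)
    y_hat = lmr.predict(x)

    mse = np.mean((y - y_hat) ** 2)
    print(mse)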
""" DATA NORMALIZATION - Simple Feature Scaling - Min-Max - Z-score """ from src import util df = util.create_df() def normalization(): """ Overview 1) Simple Feature Scaling xNew = xOld/xMax 2) Min-Max xNew = (xOld-xMin)/(xMax-xMin) 3) Z-score xNew = (xOld-m)/sd m: Average -> mean() sd: Standard Deviation -> std() """ # Simple Feature Scaling df['length'] = df['length'] / df['length'].max() # Min-Max df['length'] = (df['length'] - df['length'].min()) / \ (df['length'].max()-df['length'].min()) # Z-score
def fitting():
    """
    Over/Under-fitting for polynomial regression: how to pick the best
    polynomial order.

    Under-fitting: the model is too simple to fit the data.
    Over-fitting: the model is so flexible that it starts fitting the noise.

    The training error decreases with the order of the polynomial, BUT
    the test error is a better means of estimating the error of a polynomial.
    """
    from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    df = util.create_df()

    # x_data = df[['highway-mpg']]
    x_data = df.drop('price', axis=1)
    y_data = df['price']
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.15, random_state=1)

    print("number of test samples :", x_test.shape[0])
    print("number of training samples:", x_train.shape[0])

    lr = LinearRegression()
    lr.fit(
        x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']],
        y_train)

    yhat_train = lr.predict(
        x_train[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
    print(f'Train: {yhat_train[0:5]}')

    yhat_test = lr.predict(
        x_test[['horsepower', 'curb-weight', 'engine-size', 'highway-mpg']])
    print(f'Test: {yhat_test[0:5]}')

    title = 'Distribution Plot of Predicted Value Using Training Data vs Training Data Distribution'
    DistributionPlot(y_train, yhat_train, "Actual Values (Train)",
                     "Predicted Values (Train)", title)

    rsqu_test = []
    order = [1, 2, 3, 4]
    lr = LinearRegression()

    # Determine which polynomial degree gives the best R^2 value on the test set
    for n in order:
        pr = PolynomialFeatures(degree=n)
        x_train_pr = pr.fit_transform(x_train[['horsepower']])
        x_test_pr = pr.fit_transform(x_test[['horsepower']])
        lr.fit(x_train_pr, y_train)
        rsqu_test.append(lr.score(x_test_pr, y_test))

    plt.plot(order, rsqu_test)
    plt.xlabel('order')
    plt.ylabel('R^2')
    plt.title('R^2 Using Test Data')
    plt.text(3, 0.75, 'Maximum R^2 ')
    plt.show()
def model_evaluation_using_visualization():
    """
    Regression plot
    - Gives us a good estimate of:
        1) The relationship between 2 variables
        2) The strength of the correlation (r2)
        3) The direction of the relationship (positive/negative)
    - Combination of:
        1) Scatter plot: every point represents a different y
        2) Fitted linear regression line

    Residual Plot
    - Represents the error between the actual and the predicted values
        - y axis: residuals
        - x axis: TDV / fitted values
        - The residual is obtained by subtracting the predicted value from the
          actual TDV
    - We expect the residuals to have zero mean and to be distributed evenly
      around the x axis with similar variance.
    - If there is NO curvature, a linear plot (function) might be more appropriate.
    - If there is a curvature, our LINEAR ASSUMPTION is incorrect, and it
      suggests a non-linear function.
    - If the variance of the residuals increases with x, our MODEL is incorrect.

    Distribution Plot
    - Compares the distribution of the predicted vs. the actual values
    - Very useful for visualizing models with more than one PIV
    - Example:
        - Given a data set of y values: 1, 2, 3
        - Count and plot the number of predicted values and TDVs that are
          approximately equal to 1, 2 and 3
    """
    import seaborn as sns

    df = util.create_df()

    # Regression plot
    sns.regplot(x='highway-mpg', y='price', data=df)
    plt.ylim(0, )
    plt.show()
    plt.clf()

    # Residual plot
    sns.residplot(x=df['highway-mpg'], y=df['price'])
    plt.ylim(0, )
    plt.show()
    plt.clf()

    # Distribution plot
    x = df[['highway-mpg']]
    y = df['price']
    lmr = linear_model.LinearRegression()
    lmr.fit(x, y)
    y_hat = lmr.predict(x)

    ax1 = sns.distplot(y, hist=False, color='r', label='Actual value')
    sns.distplot(y_hat, hist=False, color='b', label='Fitted values', ax=ax1)
    plt.show()