# Pearson correlation demo: measures the linear relationship between the
# "horsepower" and "price" columns of the automobile data set.
import pandas as pd
import scipy.stats as scs

import data_wrangling.dataframe_manager as dm

pd.set_option('display.max_columns', 200)

location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# NOTE(review): create_df presumably loads the CSV at `location` using
# `headers` as column names -- confirm against dataframe_manager.
df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Rebind df to the helpers' return values. The normalization script in this
# code base does the same (`df = dm.replace_cols_with_nan(df, ...)`), so the
# helpers are expected to return the cleaned frame. Rebinding keeps this
# script correct whether they mutate in place or return a new frame.
df = dm.replace_cols_with_nan(df, ["price", "horsepower"])
df = dm.replace_cols_with_mean(df, ["price", "horsepower"])

# pearsonr needs numeric input, so cast both columns to float first.
df["price"] = df["price"].astype("float")
df["horsepower"] = df['horsepower'].astype('float')

# PEARSON Correlation
# pearsonr returns the correlation coefficient and the two-sided p-value.
pearson_coeff, p_value = scs.pearsonr(df['horsepower'], df['price'])
print('pearson_coeff: ', pearson_coeff)
print('p_value: ', p_value)
# Normalization demo: cleans the "price" and "peak-rpm" columns of the
# automobile data set and rescales "price" with Simple Feature Scaling.
import pandas as pd

import data_wrangling.dataframe_manager as dm

pd.set_option('display.max_columns', 200)

location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# Columns that are cleaned and converted below. A fresh list is built for
# every call so each helper receives its own list object.
SCALED_COLUMNS = ("price", "peak-rpm")

df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Replace the missing values in price and peak-rpm.
print('\n Replacing rows where values are not available')
df = dm.replace_cols_with_nan(df, list(SCALED_COLUMNS))
df = dm.replace_cols_with_mean(df, list(SCALED_COLUMNS))

print('\nChanging dtype for price and peak-rpm from object to float64: ')
df[list(SCALED_COLUMNS)] = df[list(SCALED_COLUMNS)].astype('float')
# NOTE(review): the original carried a commented-out alternative that used
# dm.change_col_types(df, ["price", "peak-rpm"], 'float') -- confirm whether
# that helper exists before switching to it.

# NORMALIZATION
# Simple Feature Scaling: divide every price by the column maximum.
df['price'] = df['price'].div(df['price'].max())
print(dm.print_4_heads(df, "normalizing the price with Simple Feature Scaling"))

# TODO: normalizing the peak-rpm with the Min-Max Method is announced here
# but no code for it follows in this script.
# Group-by / pivot demo: average "price" per ("drive-wheels", "body-style")
# pair of the automobile data set, shown first as a grouped frame and then
# as a pivot table.
import pandas as pd

import data_wrangling.dataframe_manager as dm

pd.set_option('display.max_columns', 200)

location = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DA0101EN/auto.csv"
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# NOTE(review): create_df presumably loads the CSV at `location` using
# `headers` as column names -- confirm against dataframe_manager.
df = dm.create_df(location, headers)
print(dm.print_4_heads(df, ""))

# Rebind df to the helpers' return values. The normalization script in this
# code base does the same (`df = dm.replace_cols_with_nan(df, ...)`), so this
# stays correct whether the helpers mutate in place or return a new frame.
df = dm.replace_cols_with_nan(df, ["price"])
df = dm.replace_cols_with_mean(df, ["price"])
df["price"] = df["price"].astype("float")

# GROUP BY
print('\n')
print("GROUP BY DEMO")
df_test = df[['drive-wheels', 'body-style', 'price']]
# With the default as_index=True the two grouping keys become the index of
# the result, leaving "price" as the only column.
df_grp = df_test.groupby(['drive-wheels', 'body-style']).mean()
print(df_grp)

# PIVOT
print('\n')
print("PIVOT Table DEMO")
# pivot() looks up `index` and `columns` among the frame's columns, so the
# grouping keys have to be moved back out of the index first. Pivoting
# df_grp directly raises KeyError.
df_pivot = df_grp.reset_index().pivot(index='drive-wheels', columns='body-style')
print(df_pivot)