def run(self):
    """Render the exploratory reports for ``self.df`` and save them as HTML.

    Four files are written, in this order, under fixed file names:
    the outputs of ``plot``, ``plot_correlation`` and ``plot_missing``
    (each via ``.save``), followed by a ``ProfileReport`` (via ``.to_file``).
    """
    frame = self.df

    overview = plot(frame)
    overview.save('dataprep_plot.html')

    correlation = plot_correlation(frame)
    correlation.save('dataprep_correlation.html')

    missing = plot_missing(frame)
    missing.save('dataprep_missing.html')

    profile = ProfileReport(frame, title='Pandas Profiling Report')
    profile.to_file('pandas_profiling_report.html')
# Load the stroke dataset and take a first look at it.
# NOTE(review): the bare expressions below (``data``, ``data.describe()``, the
# ``plot`` calls, ...) only show output in an interactive/notebook session —
# presumably this was written as notebook cells.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data
data.describe()

# drop id
data.drop(columns=['id'], inplace=True)

# checking missing values
data.isna()
# getting the count of null values in a column
data.isna().sum()
# checking if we have missing data
plot_missing(data)

# Impute missing bmi values with the column mean.
# A scalar passed to DataFrame.fillna is written into the NaNs of *every*
# column, so the fill is restricted to 'bmi' to keep the bmi mean out of
# unrelated columns.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.info()

# Overall overview, then per-column views.
plot(data)
plot(data, 'stroke')
plot(data, 'smoking_status')
plot(data, 'bmi')
plot(data, 'heart_disease')

plot_correlation(data)
# Quick structural summary of ``df``: row count, then one line per column.
print('#rows:', df.shape[0])  # number of rows
for r in df.columns:
    print(
        r, ':',  # column name
        df[r].unique().shape[0],  # number of unique elements in the column
        '| example:',
        # Positional access: ``df[r][0]`` looks up the *label* 0 on an
        # integer index and fails when no row carries that label
        # (e.g. after filtering), whereas ``.iloc[0]`` is always the first row.
        df[r].iloc[0],  # example of the first element in the column
    )
vp_summ(df)

import dataprep.eda as eda

eda.plot(df, 'country')
# NOTE(review): 'numeric-column' looks like a placeholder column name — confirm.
eda.plot_correlation(df, 'numeric-column')
eda.plot_missing(df, 'country')

# Summarizing
# Count distinct show_id per country once, on the one column that is needed,
# instead of running nunique() over every column twice.
titles_per_country = df.groupby('country')[['show_id']].nunique()
titles_per_country.sort_values(by='show_id', ascending=False)
titles_per_country.sum()
# 7280 - 923 == 6357, the first entry of ``values`` below.
7280 - 923

# Plotting
import plotly.graph_objects as go

labels = ['All other movies', 'Indian movies']
values = [6357, 923]
# pull is given as a fraction of the pie radius
# NOTE(review): the snippet is cut off in SOURCE right after
# ``fig = go.Figure(`` — the figure arguments are not visible, so the call is
# recorded here rather than guessed at.
def missing_data_analysis(data):
    """Run ``plot_missing`` on ``data`` and return its result unchanged."""
    missing_report = plot_missing(data)
    return missing_report
# cleaning the dataset
# select features we need - CustomerID, InvoiceDate, Quantity and Total Sales (Quantity * UnitPrice)
# .copy() makes df2 an independent frame, so adding TotalSales below is a plain
# column assignment and not a write into a slice of df
# (which triggers pandas' SettingWithCopyWarning).
df2 = df[['Quantity', 'InvoiceNo', 'InvoiceDate', 'UnitPrice', 'CustomerID']].copy()
df2['TotalSales'] = df2.Quantity * df2.UnitPrice
df2.shape

# review descriptive statistics
df2.describe()

# drop non-positive sales (returns); only rows with TotalSales > 0 are kept
# NOTE(review): df3 is built before the null-CustomerID rows are removed from
# df2 below, so df3 still contains them — confirm that is intended.
df3 = df2[df2.TotalSales > 0]
df3.shape

# check how many CustomerID's are missing
dp.plot_missing(df2, 'CustomerID')
# alternate approach: per-column null count and share of rows
null_counts = df2.isnull().sum()
pd.DataFrame({'Count': null_counts, 'Proportion': null_counts / len(df2)})

# drop rows with null CustomerID
df2 = df2[pd.notnull(df2.CustomerID)]

##############################################################
# aggregate model
# assumes a constant average spend and churn rate for all the customers, and
# produces a single value for CLV at an overall level
# downside - unrealistic estimates if some of the customers transacted in high
# value and high volume
#
# CLV = ((Average Sales X Purchase Frequency) / Churn) X Profit Margin
# Where,
#   Average Sales = TotalSales / Total no. of orders
#
# NOTE(review): in SOURCE these notes sit in a ''' block that is cut off at
# this point; the remaining definitions are not visible here.
# Exploratory Data Analysis
from dataprep.eda import plot

# distribution of each column and calculates dataset statistics
plot(df)
# distribution of column x in various ways and calculates column statistics
plot(df, 'tip')
# depicting the relationship between columns x and y
plot(df, 'tip', 'total_bill')

# Plot corr
from dataprep.eda import plot_correlation

# plots correlation matrices (correlations between all pairs of columns)
plot_correlation(df)
# plots the most correlated columns to column x
plot_correlation(df, 'tip')
# plots the joint distribution of column x and column y and computes a regression line
plot_correlation(df, 'tip', 'total_bill')

# Plot missing data
from dataprep.eda import plot_missing

# plots the amount and position of missing values, and their relationship between columns
plot_missing(df)
# plots the impact of the missing values in column x on all other columns
plot_missing(df, 'tip')
# plots the impact of the missing values from column x on column y in various ways
plot_missing(df, 'tip', 'total_bill')

# Report
#
# Report contents, as listed in the original notes:
#   Overview: detect the types of columns in a dataframe
#   Variables: variable type, unique values, distinct count, missing values
#   Quantile statistics like minimum value, Q1, median, Q3, maximum, range,
#     interquartile range
#   Descriptive statistics like mean, mode, standard deviation, sum, median
#     absolute deviation, coefficient of variation, kurtosis, skewness
#   Text analysis for length, sample and letter
#   Correlations: highlighting of highly correlated variables, Spearman,
#     Pearson and Kendall matrices
#   Missing Values: bar chart, heatmap and spectrum of missing values
from dataprep.eda import create_report

create_report(df, title='My Report')