def run(self):
    """Write the EDA reports for ``self.df`` as HTML files.

    Four files are produced, in this order, using relative paths:
    ``dataprep_plot.html``, ``dataprep_correlation.html``,
    ``dataprep_missing.html`` and ``pandas_profiling_report.html``.
    """
    frame = self.df

    # Overview, correlation and missing-value reports.
    overview = plot(frame)
    overview.save('dataprep_plot.html')

    correlations = plot_correlation(frame)
    correlations.save('dataprep_correlation.html')

    missing = plot_missing(frame)
    missing.save('dataprep_missing.html')

    # Combined profiling report.
    profile = ProfileReport(frame, title='Pandas Profiling Report')
    profile.to_file('pandas_profiling_report.html')
# Inspect, impute and encode ``data``.
plot_missing(data)

# Impute missing BMI values with the column mean. The fill is restricted to
# 'bmi' so the BMI mean is never written into gaps in any other column.
data = data.fillna({'bmi': data['bmi'].mean()})
data.info()

plot(data)
plot(data, 'stroke')
plot(data, 'smoking_status')
plot(data, 'bmi')
plot(data, 'heart_disease')
plot_correlation(data)

# converting Marital Status, Residence and Gender into 0s and 1s
# (1 for the named category, 0 for everything else)
data['gender'] = data['gender'].eq('Female').astype(int)
data["Residence_type"] = data["Residence_type"].eq("Urban").astype(int)
data["ever_married"] = data["ever_married"].eq("Yes").astype(int)

# removing the observations that have smoking_status type unknown
data = data[data['smoking_status'] != 'Unknown']
data.head(12)  # bare expression: only displays when run in a notebook cell
data

# using OneHotEncoding for smoking_status, work_type
data_dummies = data[['smoking_status', 'work_type']]
# Quick structural summary of ``df``: size, then one line per column.
print('#columns:', df.shape[1])  # number of columns
print('#rows:', df.shape[0])  # number of rows

for r in df.columns:
    column = df[r]
    # Take the example by position: ``column[0]`` is a label lookup and fails
    # whenever the index has no label 0 (for example after rows were dropped).
    example = column.iloc[0] if len(column) else None
    print(
        r, ':',  # column name
        column.unique().shape[0],  # number of unique elements in the column
        '| example:', example)  # example of the first element in the column
vp_summ(df)

import dataprep.eda as eda
eda.plot(df, 'country')
# NOTE(review): 'numeric-column' looks like a placeholder rather than a real
# column name -- confirm which column was intended.
eda.plot_correlation(df, 'numeric-column')
eda.plot_missing(df, 'country')

# Summarizing
# Select 'show_id' before counting so distinct values are computed for that
# column only instead of for every column of the frame.
df.groupby('country')[['show_id']].nunique().sort_values(by='show_id', ascending=False)
df.groupby('country')[['show_id']].nunique().sum()
# NOTE(review): hard-coded counts; the difference equals the first entry of
# ``values`` below (6357).
7280 - 923

# Plotting
import plotly.graph_objects as go
labels = ['All other movies', 'Indian movies']
values = [6357, 923]
# pull is given as a fraction of the pie radius
import numpy as np import pandas as pd from matplotlib.pyplot import xticks import dataprep.eda as eda broadband = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband.csv') broadband.columns=['ST', 'COUNTY_ID', 'COUNTY_NAME', 'AVAILABILITY', 'USAGE'] eda.create_report(broadband).show_browser() eda.plot(broadband, 'ST', 'BROADBAND AVAILABILITY PER FCC').show_browser() eda.plot(broadband, 'ST', 'BROADBAND USAGE').show_browser() eda.plot_correlation(broadband).show_browser() broadband_zip = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband_zip.csv') broadband.loc[50:55,] # there is a - in data broadband = broadband.drop(broadband[broadband.AVAILABILITY=='-'].index) broadband = broadband.drop(broadband[broadband.USAGE=='-'].index) broadband.info() broadband.AVAILABILITY=pd.to_numeric(broadband.AVAILABILITY) broadband.USAGE=pd.to_numeric(broadband.USAGE) br = broadband.groupby(['ST']).mean()[['AVAILABILITY', 'USAGE']].reset_index() import plotly.express as px fig = px.scatter(br, x='AVAILABILITY', y='USAGE', color='ST', opacity=0,
arrowwidth=1,arrowcolor='orange') fig.add_annotation(dict(xref='paper',yref='paper',x=0.095,y=0.05,xanchor='center',yanchor='top', font=dict(family='Arial', size=12, color='cornflowerblue'),showarrow=False, text='5550, Chhukung Ri'), showarrow=True,align='left',arrowhead=1,arrowsize=1, arrowwidth=1,arrowcolor='cornflowerblue') fig.add_annotation(dict(xref='paper',yref='paper',x=0.94,y=0.05,xanchor='center',yanchor='top', font=dict(family='Arial', size=12, color='cornflowerblue'),showarrow=False, text='8850, Everest'), showarrow=True,align='left',arrowhead=1,arrowsize=1, arrowwidth=1,arrowcolor='cornflowerblue') fig.show() ###################### needs to be changed ########################## # EDA def vp_summ(df): print('#columns:', df.shape[1]) # number of columns print('#rows:', df.shape[0]) # number of rows for r in df.columns: print(r, ':', # column name df[r].unique().shape[0], # number of unique elements in the column '| example:', df[r][0]) # example of the first element in the column vp_summ(df) import dataprep.eda as eda eda.plot(df) eda.plot_correlation(df) eda.plot_missing(df, 'country')
def correlation_plot(data):
    """Return the ``plot_correlation`` result for the whole of ``data``."""
    report = plot_correlation(data)
    return report
def bivariate_numerical_scatterplot(data, feature1, feature2):
    """Return the ``plot_correlation`` result for one pair of columns.

    ``feature1`` and ``feature2`` are forwarded as the ``x`` and ``y`` keyword
    arguments, and ``k`` is fixed at 5.
    """
    pair_report = plot_correlation(data, x=feature1, y=feature2, k=5)
    return pair_report
# Average FICO score: midpoint of the reported low/high range.
df1['fico_score'] = (df1['fico_range_low'] + df1['fico_range_high']) / 2

# Credit grade as a number: each letter is mapped to its character code
# minus 64.
df1['grade'] = df1['grade'].map(lambda letter: ord(letter) - 64)

# Natural log of annual income.
df1['log_income'] = np.log(df1['annual_inc'])

# Correlations
from dataprep.eda import plot_correlation
cor_features = [
    'emp_length',
    'annual_inc',
    'dti',
    'fico_score',
    'GDP',
    'Unemployment_rate',
    'hc_coverage',
    'education',
    'crime_rate',
]
df_cor = df1[cor_features]
plot_correlation(df_cor)

# Demographic statistics for the accepted dataset: mean, min and max of the
# crime rate.
x = df1['crime_rate'].mean()
print(x)
x2 = df1['crime_rate'].min()
print(x2)
x3 = df1['crime_rate'].max()
print(x3)

"""Scaling the demographics for accepted dataset"""

# Scaler and the list of demographic columns.
scaler = StandardScaler()
features = [
    'GDP',
    'Unemployment_rate',
    'hc_coverage',
    'education',
    'crime_rate',
]
# Work on a copy so the raw frame ``df`` is left untouched.
df2 = df.copy()

# Cap Sales and Discount to fixed bounds. Both bounds are applied in a single
# clip per column: two successive np.where assignments that each read the raw
# ``df`` column let the second overwrite the first, which silently dropped
# the lower cap.
df2["Sales"] = df["Sales"].clip(lower=7.8936, upper=572.949)
df2["Discount"] = df["Discount"].clip(lower=0.0, upper=0.4)

plot(df2, 'Discount', 'Profit')
plot(df2, 'Sales', 'Profit')

"""##Quantity and sales have high correlation with profit --- --- """

plot_correlation(df2)
plot_correlation(df2, 'Profit')
# Loading the Dataset
import plotly.express as px
df = px.data.tips()

# Exploratory Data Analysis
from dataprep.eda import plot
# Whole frame: distribution of each column, plus dataset statistics.
plot(df)
# One column: its distribution in various ways, plus column statistics.
plot(df, 'tip')
# Two columns: the relationship between them.
plot(df, 'tip', 'total_bill')

# Plot corr
from dataprep.eda import plot_correlation
# Whole frame: correlation matrices (correlations between all pairs of columns).
plot_correlation(df)
# One column: the columns most correlated with it.
plot_correlation(df, 'tip')
# Two columns: their joint distribution with a regression line.
plot_correlation(df, 'tip', 'total_bill')

# Plot missing data
from dataprep.eda import plot_missing
# Whole frame: amount and position of missing values, and how they relate
# between columns.
plot_missing(df)
# One column: impact of its missing values on all other columns.
plot_missing(df, 'tip')
# Two columns: impact of the first column's missing values on the second.
plot_missing(df, 'tip', 'total_bill')

# NOTE(review): the report description below is cut off in the visible source;
# it is kept on one line exactly as found.
# Report ''' Overview: detect the types of columns in a dataframe Variables: variable type, unique values, distint count, missing values Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness Text analysis for length, sample and letter Correlations: highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices