def EDA(source_df, reply_df):
    """Run exploratory data analysis on the two dataframes.

    For each frame a pandas-profiling report is rendered inline in the
    notebook and saved as HTML; correlation heatmaps are drawn for both,
    and finally each HTML report is converted to PDF.
    """
    for frame, html_name in ((source_df, "EDA_source_report.html"),
                             (reply_df, "EDA_reply_report.html")):
        report = ProfileReport(frame,
                               title='Profile Report',
                               html={'style': {'full_width': True}})
        report.to_notebook_iframe()
        report.to_file(output_file=html_name)

    correlation_heatmap(source_df)
    correlation_heatmap(reply_df)

    # Imported lazily so the hard pdfkit dependency is only needed when
    # this function actually runs.
    import pdfkit
    pdfkit.from_file('EDA_source_report.html', 'EDA_source_report.pdf')
    pdfkit.from_file('EDA_reply_report.html', 'EDA_reply_report.pdf')
def create_report(df, filename=None):
    """Profile *df* with pandas-profiling.

    When *filename* is falsy the report is rendered inline and the notebook
    iframe is returned; otherwise the report is written to *filename* and
    None is returned.
    """
    report = ProfileReport(df, title='Pandas Profiling Report')
    if not filename:
        return report.to_notebook_iframe()
    report.to_file(output_file=filename)
def simple_report(data: pd.DataFrame):
    """Create a quick pandas-profiling report for *data*.

    If the process looks like a Jupyter kernel (last argv entry is the
    kernel's ``*.json`` connection file — a heuristic) the report is shown
    inline; otherwise it is written to ``simple_report.html``.
    """
    from pandas_profiling import ProfileReport
    import sys

    running_in_jupyter = sys.argv[-1].endswith('json')
    report = ProfileReport(data,
                           title='Report',
                           html={'style': {'full_width': True}})
    if not running_in_jupyter:
        return report.to_file(output_file='simple_report.html')
    return report.to_notebook_iframe()
def profile(self, title: str = 'Dataset profile report', html_path: str = None, show_report_in_notebook: bool = False):
    """Generates a pandas-profiling report of the dataset to be displayed
    in a jupyter notebook. Optionally saves the report as an html file

    :param title: Title shown at the top of the generated report
    :param html_path: If provided, the pandas-profiling report will be saved to disk
    :param show_report_in_notebook: Whether or not to show report in jupyter notebook
    :return: None
    """
    # Fixed: ``os.path.exists(None)`` raises TypeError, so the default
    # html_path=None previously crashed; treat "no path" as "generate".
    if html_path is None or not os.path.exists(html_path):
        logger.info('Generating the profiling report')
        profile_report = ProfileReport(self.data, title=title)
        if html_path is not None:
            profile_report.to_file(html_path)
            logger.info(
                f'Saved the pandas-profiling report to ``{html_path}``')
        # NOTE(review): the report is always rendered inline here —
        # ``show_report_in_notebook`` is currently unused; confirm whether
        # rendering should be gated on that flag before changing it.
        profile_report.to_notebook_iframe()
    else:
        # A cached report file exists: load it instead of regenerating.
        logger.info(
            f'A profiling report was already generated and will be loaded from ``{html_path}``'
        )
        display(IFrame(src=html_path, width=10**3, height=10**3))
# NOTE(review): these keyword arguments close a seaborn plotting call whose
# opening lies in an earlier chunk not visible here.
alpha=0.7,
palette=list(reversed(sns.color_palette("RdYlGn_r", 4))),
)
# Label the scatter plot (legend/axis/title strings are user-facing Spanish
# text and are intentionally left untouched).
plt.legend(title="Conformidad con el salario")
plt.ylabel("Salario bruto")
plt.xlabel("Salario neto")
plt.title('Relación entre salario neto y salario bruto')
plt.show()
# -

# ## References
#
# - [seaborn documentation](http://seaborn.pydata.org/tutorial/color_palettes.html)
# - [Diverging Color Maps for Scientific Visualization - Kenneth Moreland](https://cfwebprod.sandia.gov/cfdocs/CompResearch/docs/ColorMapsExpanded.pdf)
# - [XKCD color survey](https://blog.xkcd.com/2010/05/03/color-survey-results/)
# - [Subtleties of colors series](https://earthobservatory.nasa.gov/blogs/elegantfigures/2013/08/05/subtleties-of-color-part-1-of-6/)
# - [matplotlib documentation](https://matplotlib.org/tutorials/colors/colormaps.html)

# # Pandas profiling

# +
from pandas_profiling import ProfileReport

# Build an explorative, eagerly computed (lazy=False) profiling report of the
# salary-survey dataframe.
report = ProfileReport(df,
    title='Encuesta de sueldos sysarmy 2020.02',
    explorative=True,
    lazy=False)
# -

# Render the profiling report inline in the notebook.
report.to_notebook_iframe()
# Silence warnings and set a default figure size for every plot below.
warnings.simplefilter("ignore")
plt.rcParams['figure.figsize'] = (12,8)

"""#Step 2: Exploratory Data Analysis --- """

# Importing the dataset
df = pd.read_csv('employee_data.csv')
df.head()

# Build a pandas-profiling report, render it inline, and save it to HTML.
profile = ProfileReport(df, title = "Pandas Profiling Report", html = {"style":{'full_width':False}})
profile.to_notebook_iframe()
profile.to_file(output_file="REPORT.html")

"""#Step 3: Encode Categorical Features --- """

# Bar chart: counts of `quit` per salary bracket.
pd.crosstab(df.salary, df.quit).plot(kind='bar')
plt.title('Turnover Frequency on Salary Bracket')
plt.xlabel('Salary')
plt.ylabel('Frequency of Turnover')
plt.show()

# Bar chart: counts of `quit` per department.
# NOTE(review): the labeling calls for this plot appear to continue in a
# later chunk beyond this view.
pd.crosstab(df.department,df.quit).plot(kind='bar')
plt.title('Turnover Frequency for Department')
# Mean of the C_Acu_Dia column (the printed message is user-facing
# Portuguese text and is left untouched).
print("Media de Contaminados:{:.2f} ".format(df['C_Acu_Dia'].mean()))

# Histogram of C_Acu_Dia.
plt.hist('C_Acu_Dia',data = df,bins = 8,density = False)
plt.show()

# Scatter plot of C_Acu_Dia against O_Acu_Dia.
df.plot.scatter(x='C_Acu_Dia', y = 'O_Acu_Dia',s=60)
plt.show()

# Box plots of both columns side by side.
df.boxplot(['C_Acu_Dia','O_Acu_Dia'])

# Correlation between the two columns, shown as a table and as a heatmap.
corr = df[['C_Acu_Dia','O_Acu_Dia']].corr()
display(corr)
sns.heatmap(corr,cmap='RdBu', fmt = '.2f', square = True,linecolor='white', annot=True)

# Install the development build of pandas-profiling (IPython shell magic —
# this cell only runs inside a notebook).
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip -q
import pandas as pd #pandas library
from pandas_profiling import ProfileReport #library for the profile report

report = ProfileReport(df) #create the profile report
report.to_notebook_iframe() #render an overview of the data

! pip install sweetviz -q # INSTALLS SWEETVIZ
import sweetviz

# Build a sweetviz analysis of the same dataframe and export it to HTML.
relatorio_sweet = sweetviz.analyze(df)
relatorio_sweet.show_html('Teste.html')
class HelloDataset():
    '''
    Stores methods for dataset operations on a tabular dataset held in
    ``self.df`` (a pandas DataFrame loaded via ``load``).
    '''

    # Dataloading
    def load(self, path_obj):
        '''
        Loads a .csv into a class attribute.

        param path_obj: Path object pointing to dataset stored as .csv
        '''
        print('Loading data...')
        self.df = pd.read_csv(path_obj)
        print('Data loaded.')

    def explore(self):
        '''
        Renders a minimal pandas-profiling report of the dataset inline in
        the notebook; the report object is cached on ``self.profile``.
        '''
        self.profile = ProfileReport(self.df,
                                     title='Explore the Dataset',
                                     minimal=True,
                                     progress_bar=False)
        return self.profile.to_notebook_iframe()

    # Data cleaning
    def drop_feature(self, feature_list):
        '''
        Removes the given column(s).

        param feature_list: Column name or list of column names to drop.
        '''
        self.df = self.df.drop(feature_list, axis=1)
        print(f'Feature(s) {feature_list} removed.')

    def convert_to_numerical(self, feature_list):
        '''
        Converts the given column(s) to dummies.

        param feature_list: List of column names to one-hot encode; the
        first level of each is dropped to avoid collinearity.
        '''
        print(
            f'Converting features {feature_list} to numerical by creating dummy variables...'
        )
        dummy_cols = []
        for col in feature_list:
            dummies = pd.get_dummies(self.df[col], drop_first=True, prefix=col)
            dummy_cols.extend(dummies.columns)
            self.df = pd.concat([self.df, dummies], axis=1)
            self.df = self.df.drop(col, axis=1)
        print(f'Features removed: {feature_list}.')
        print(f'Dummy features added: {list(dummy_cols)}')

    # Feature engineering
    def sum_feature(self, new_feat, feature_list):
        '''
        Sum columns.

        param new_feat: The name of the new column.
        param feature_list: List of column names (as strings) to sum with each other.
        '''
        self.df[new_feat] = self.df[feature_list].sum(axis=1)
        print(
            f'Created new feature "{new_feat}" by summing features {feature_list}.'
        )

    def subtract_feature(self, new_feat, feat_a, feat_b):
        '''
        Subtract one column from another.

        param new_feat: The name of the new column.
        param feat_a: The column from which to subtract.
        param feat_b: The column to subtract from feat_a.
        '''
        self.df[new_feat] = self.df[feat_a] - self.df[feat_b]
        # Fixed: the message previously interpolated the undefined names
        # ``feat_num``/``feat_denom`` (copy-paste from divide_feature),
        # raising NameError on every call.
        print(
            f'Created new feature "{new_feat}" by subtracting {feat_b} from {feat_a}.'
        )

    def divide_feature(self, new_feat, feat_num, feat_denom):
        '''
        Divide one column by another.

        param new_feat: The name of the new column.
        param feat_num: The column containing the feature in the numerator.
        param feat_denom: The column containing the feature in the denominator.
        '''
        self.df[new_feat] = self.df[feat_num] / self.df[feat_denom]
        print(
            f'Created new feature "{new_feat}" by dividing {feat_num} by {feat_denom}.'
        )

    def multiply_feature(self, new_feat, feature_list):
        '''
        Multiply columns with each other.

        param new_feat: The name of the new column.
        param feature_list: List of column names (as strings) to multiply by each other.
        '''
        self.df[new_feat] = self.df[feature_list].prod(axis=1,
                                                       numeric_only=True)
        print(
            f'Created new feature "{new_feat}" by multiplying features {feature_list}.'
        )

    def set_target_feature(self, target_feature, scale=True):
        '''
        Split the dataset into predictors (self.X) and target (self.Y).

        param target_feature: name of column to predict. All other columns
        will be normalised and used for prediction.
        param scale: Whether to robust-scale the numeric predictor columns.
        '''
        self.X = self.df.drop(target_feature, axis=1)
        self.Y = self.df[target_feature]
        print(f'Target feature set to {target_feature}.')
        print(f'Features used for prediction:')
        for i in list(self.X.columns):
            print(f'- {i}')
        if scale:
            scaler = RobustScaler()
            # Fixed: DataFrame.update() aligns on index AND column labels,
            # so passing the raw ndarray from fit_transform (whose coerced
            # columns are 0..n-1) silently updated nothing. Rebuild a frame
            # with the original labels before updating in place.
            num_cols = self.X.select_dtypes(include=[np.number]).columns
            scaled = pd.DataFrame(scaler.fit_transform(self.X[num_cols]),
                                  columns=num_cols,
                                  index=self.X.index)
            self.X.update(scaled)

    def impute_missing_values(self, feature_list, imputation_type='auto'):
        '''
        Impute missing values in the selected columns of the dataset.

        param feature_list: List of column names (as strings) to impute.
        param imputation_type: What kind of imputation to use (auto, mode, or mean)
        '''
        for feature in feature_list:
            num_na = sum(self.df[feature].isna())
            if num_na == 0:
                print(
                    f'No null values detected in column "{feature}"... skipping to next feature'
                )
                continue
            elif imputation_type == 'auto':
                # Auto mode: mode for object/string columns, mean otherwise.
                if (self.df[feature].dtype == 'O'
                        or self.df[feature].dtype == 'str'):
                    print(
                        f'{num_na} null values detected in column "{feature}"... replacing with the mode'
                    )
                    new_feature = self.df[feature].fillna(
                        self.df[feature].mode()[0])
                else:
                    print(
                        f'{num_na} null values detected in column "{feature}"... replacing with mean'
                    )
                    # Fixed: Series.mean() returns a scalar, so the previous
                    # ``.mean()[0]`` raised TypeError.
                    new_feature = self.df[feature].fillna(
                        self.df[feature].mean())
            elif imputation_type == 'mode':
                print(
                    f'{num_na} null values detected in column "{feature}"... replacing with the mode'
                )
                new_feature = self.df[feature].fillna(
                    self.df[feature].mode()[0])
            elif imputation_type == 'mean' and (self.df[feature].dtype == 'O'
                                                or self.df[feature].dtype
                                                == 'str'):
                print(
                    f'Error: Cannot impute mean of a non-numerical feature. Skipping imputation of feature "{feature}"...'
                )
                continue
            elif imputation_type == 'mean':
                print(
                    f'{num_na} null values detected in column "{feature}"... replacing with mean'
                )
                # Fixed: same scalar-indexing bug as the auto branch.
                new_feature = self.df[feature].fillna(
                    self.df[feature].mean())
            else:
                raise AssertionError(
                    'Error: Incorrect imputation type. Please enter one of the options: "auto", "mean", or "mode" only.'
                )
            self.df[feature] = new_feature
def do_a_pandas_profile(df, name):
    """Profile *df* with pandas-profiling under the title *name*, rendering
    the report both as ipywidgets and as an inline notebook iframe."""
    from pandas_profiling import ProfileReport

    report = ProfileReport(df, title=name, html={'style': {'full_width': True}})
    report.to_widgets()
    report.to_notebook_iframe()