def report_pathology(path=PATHOLOGY_PATH, dropna=False, usecols=None, usedefault=False,
                     removecorr=None, iqrcols=None):
    df = load_pathology(path=path, dropna=dropna, usecols=usecols, usedefault=usedefault,
                        removecorr=removecorr, iqrcols=iqrcols)
    create_report(df).save('pathology_report')


def report_dengue(path=DENGUE_PATH, dropna=False, usecols=None, usedefault=False,
                  removecorr=None, iqrcols=None):
    df = load_dengue(path=path, dropna=dropna, usecols=usecols, usedefault=usedefault,
                     removecorr=removecorr, iqrcols=iqrcols)
    create_report(df).save('dengue_report')
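
# The two wrappers above call load_pathology/load_dengue loaders defined elsewhere in
# the original project. A minimal sketch of what such a loader might look like, inferred
# only from the keyword arguments at the call site; the body below is an assumption, not
# the project's real implementation (usedefault is left as a no-op because its meaning
# is not visible here; df.corr(numeric_only=True) assumes pandas >= 1.5).
import numpy as np
import pandas as pd


def load_pathology(path, dropna=False, usecols=None, usedefault=False,
                   removecorr=None, iqrcols=None):
    """Load the pathology CSV and apply optional, caller-controlled cleanup steps."""
    df = pd.read_csv(path, usecols=usecols)
    if dropna:
        df = df.dropna()
    if removecorr is not None:
        # drop one column from each pair whose absolute correlation exceeds the threshold
        corr = df.corr(numeric_only=True).abs()
        upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
        df = df.drop(columns=[c for c in upper.columns if (upper[c] > removecorr).any()])
    if iqrcols:
        # remove 1.5*IQR outliers in the requested columns
        for col in iqrcols:
            q1, q3 = df[col].quantile([0.25, 0.75])
            iqr = q3 - q1
            df = df[df[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
    return df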
def create_window_display(data_to_display: pd.DataFrame, display_name: str) -> None:
    """Create a window display from a table of data to advertise your dish,
    i.e. create an automatic EDA report for the table data which will be viewable
    through the web API, using dataprep.

    Args:
        data_to_display (pd.DataFrame): The data to display in the EDA report
        display_name (str): Name of the data to display

    Returns:
        None, but saves an HTML report in the STATIC_DIR
    """
    dataprep_report = create_report(data_to_display, title="Window Display")
    dataprep_report.save(filename=f"{display_name}", to=str(STATIC_DIR))
    del dataprep_report
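
# A minimal usage sketch for create_window_display (not part of the original source).
# pandas and create_report are imported here for self-containment, and the STATIC_DIR
# value is a placeholder assumption rather than the project's real static directory.
from pathlib import Path

import pandas as pd
from dataprep.eda import create_report

STATIC_DIR = Path("static")  # placeholder; the real project defines its own STATIC_DIR
STATIC_DIR.mkdir(exist_ok=True)

sample = pd.DataFrame({"dish": ["ramen", "udon", "soba"], "price": [12.0, 9.5, 10.0]})
create_window_display(sample, display_name="menu_preview")
# The report should end up at static/menu_preview.html, ready to be served by the web API.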
from matplotlib.pyplot import xticks
import numpy as np
import pandas as pd

water = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv'
)

import dataprep.eda as eda

eda.create_report(water).show_browser()
eda.plot(water, 'water_source')
eda.plot(water, 'water_source', 'water_tech').show_browser()
eda.plot(water, 'water_source', 'country_name').show_browser()
from matplotlib.pyplot import xticks
import numpy as np
import pandas as pd
from dataprep.eda import plot

# Prepare data
netflix_titles = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-20/netflix_titles.csv'
)
df = netflix_titles.copy()

from dataprep.eda import create_report
create_report(df).show_browser()

df.columns
df2 = df[['show_id', 'date_added', 'release_year']].copy()
df2['added_year'] = df2.date_added.str.strip().str[-4:]
df2.dropna(inplace=True)
plot(df2, 'added_year', 'release_year')

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=df2, x='added_year', y='release_year',
                 order=list(df2.added_year.sort_values(ascending=True).unique()))
ax.set(title='Netflix: how many old titles added?',
       xlabel='Year added',
       ylabel='Release year')
df.describe()

"""
Since this is a time series problem, I'm not really looking for quite the same things
as I would for a regression problem, e.g. skewness.
Interesting to see that the variability differs between the 3 temp metrics.
I don't see any extreme min or max values that would indicate data anomalies.
I see nothing for snow, which can't be right given that I live in the area and
distinctly remember plenty of snow over the years. We'll drop these columns.
"""

# drop unneeded columns pt.3
cols_drop = ['snow', 'snowdepth']
df = df.drop(cols_drop, axis=1)

# auto EDA (dataprep library)
"""I like the pandas-profiling library, but let's try an alternative for this EDA"""
create_report(df, title='Weather_data_EDA')

"""
tempmax is skewed, unlike the other temperature metrics.
precip shows that there is no rain on one day in three; this is confirmed by the
conditions column. Love London!
winddir shows that the prevailing wind comes from the Atlantic, as expected.
cloudcover shows that there will always be some clouds hanging about, though it is
rare for cover to exceed 30%.
Interesting inverse correlation between humidity and solar radiation. Otherwise the
correlations amongst the other variables come as no surprise.
"""

# time series plots
for col in df.columns:
    plotSeries(df, col)

"""
Temperature behaves in accordance with the seasons/time of year, as expected.
Humidity is the inverse of temperature, but noisier.
Patterns in precipitation are not so easy to discern.
"""
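
# The loop above calls a plotSeries helper that is defined elsewhere in the original
# notebook. A minimal sketch of what such a helper might look like, assuming df has a
# DatetimeIndex; the name and signature come from the call site, the body is an
# assumption.
import matplotlib.pyplot as plt
import pandas as pd


def plotSeries(data: pd.DataFrame, column: str) -> None:
    """Plot one column of a time-indexed DataFrame as a simple line chart."""
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(data.index, data[column])
    ax.set_title(column)
    ax.set_xlabel('date')
    ax.set_ylabel(column)
    plt.show()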
import numpy as np
import pandas as pd
from matplotlib.pyplot import xticks
import dataprep.eda as eda

broadband = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband.csv')
broadband.columns = ['ST', 'COUNTY_ID', 'COUNTY_NAME', 'AVAILABILITY', 'USAGE']

eda.create_report(broadband).show_browser()
# plot against the renamed availability/usage columns
eda.plot(broadband, 'ST', 'AVAILABILITY').show_browser()
eda.plot(broadband, 'ST', 'USAGE').show_browser()
eda.plot_correlation(broadband).show_browser()

broadband_zip = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband_zip.csv')

broadband.loc[50:55, ]  # there is a '-' in the data
broadband = broadband.drop(broadband[broadband.AVAILABILITY == '-'].index)
broadband = broadband.drop(broadband[broadband.USAGE == '-'].index)
broadband.info()
broadband.AVAILABILITY = pd.to_numeric(broadband.AVAILABILITY)
broadband.USAGE = pd.to_numeric(broadband.USAGE)

br = broadband.groupby('ST')[['AVAILABILITY', 'USAGE']].mean().reset_index()

import plotly.express as px
fig = px.scatter(br, x='AVAILABILITY', y='USAGE', color='ST', opacity=0,
"""Loading the datasets""" # accepted loans filepath1 = "/content/drive/MyDrive/Master Thesis/Lending Club Loan Data/accepted_sample_new.csv" df1 = pd.read_csv(filepath1) # denied loans filepath2 = "/content/drive/MyDrive/Master Thesis/Lending Club Loan Data/denied_sample_new.csv" df2 = pd.read_csv(filepath2) df1.head() # descriptive statitistics report1 = create_report(df1, title='My Report 1') df2.shape """Data cleaning for the accepted dataset""" # create average fico score df1['fico_score'] = (df1['fico_range_low'] + df1['fico_range_high']) / 2 # create categorical variable for credit grade df1.grade = [ ord(x) - 64 for x in df1.grade ] # create logs for income df1['log_income'] = np.log(df1['annual_inc']) # Correlations
from dataprep.eda import plot_missing

plot_missing(df)                       # plots the amount and position of missing values, and their relationship between columns
plot_missing(df, 'tip')                # plots the impact of the missing values in column x on all other columns
plot_missing(df, 'tip', 'total_bill')  # plots the impact of the missing values from column x on column y in various ways

# Report
'''
Overview: detect the types of columns in a dataframe
Variables: variable type, unique values, distinct count, missing values
Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
Text analysis for length, sample and letter
Correlations: highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices
Missing Values: bar chart, heatmap and spectrum of missing values
'''
from dataprep.eda import create_report
create_report(df, title='My Report')
# report = create_report(df, title='My Report')
# report.show_browser()
# report.save(filename='report_01', to='~/Desktop')

# Clean
'''
DataPrep.Clean provides functions for quickly and easily cleaning and validating your data:
Column Headers
Country Names
Email Addresses
Geographic Coordinates
IP Addresses
Phone Numbers
URLs
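
# A minimal sketch of the dataprep.clean functions described above (not from the
# original snippet). The toy DataFrame and its column names are assumptions;
# clean_headers standardizes column names and clean_email validates/standardizes
# email addresses.
import pandas as pd
from dataprep.clean import clean_email, clean_headers

toy = pd.DataFrame({
    'First Name': ['Ada', 'Grace'],
    'Contact Email': ['ada@example.com', 'not-an-email'],
})
toy = clean_headers(toy)                 # column names standardized to snake_case, e.g. 'contact_email'
toy = clean_email(toy, 'contact_email')  # adds 'contact_email_clean'; invalid addresses become NaN
print(toy)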
import numpy as np
import pandas as pd
from matplotlib.pyplot import xticks
import dataprep.eda as eda

survey = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv'
)

eda.create_report(survey).show_browser()
eda.plot(survey[survey.annual_salary < 1000000], 'annual_salary', 'industry').show_browser()
eda.plot(survey[survey.annual_salary < 500000], 'annual_salary', 'industry').show_browser()
eda.plot(
    survey[survey.annual_salary < 500000].replace(
        to_replace={
            '5-7 years': '05-07 years',
            '2 - 4 years': '02-04 years',
            '21 - 30 years': '21-30 years',
            '11 - 20 years': '11-20 years',
            '1 year or less': '01 year or less',
            '8 - 10 years': '08-10 years',
            '31 - 40 years': '31-40 years',
            '41 years or more': '41 years or more'
        }),
    'annual_salary', 'years_of_experience_in_field').show_browser()
""" Pandas Profile ============== """ # Libraries import pandas as pd # Specific from dataprep.eda import create_report # path path = './data/dataset.csv' # Load csv data = pd.read_csv(path) # Show print(data) # Create report profile = create_report(data, title="Pandas Profiling Report") # Save to file profile.to_file("report.html")