示例#1
0
def report_pathology(path=PATHOLOGY_PATH,
                     dropna=False,
                     usecols=None,
                     usedefault=False,
                     removecorr=None,
                     iqrcols=None):
    """Load the pathology data and save an EDA report for it.

    Every argument is forwarded unchanged, by keyword, to
    ``load_pathology``. The frame it returns is passed to
    ``create_report`` and the resulting report is saved under the name
    ``'pathology_report'``.
    """
    loader_kwargs = {
        'path': path,
        'dropna': dropna,
        'usecols': usecols,
        'usedefault': usedefault,
        'removecorr': removecorr,
        'iqrcols': iqrcols,
    }
    frame = load_pathology(**loader_kwargs)
    report = create_report(frame)
    report.save('pathology_report')
示例#2
0
def report_dengue(path=DENGUE_PATH,
                  dropna=False,
                  usecols=None,
                  usedefault=False,
                  removecorr=None,
                  iqrcols=None):
    """Load the dengue data and save an EDA report for it.

    Every argument is forwarded unchanged, by keyword, to ``load_dengue``.
    The frame it returns is passed to ``create_report`` and the resulting
    report is saved under the name ``'dengue_report'``.
    """
    loader_kwargs = {
        'path': path,
        'dropna': dropna,
        'usecols': usecols,
        'usedefault': usedefault,
        'removecorr': removecorr,
        'iqrcols': iqrcols,
    }
    frame = load_dengue(**loader_kwargs)
    report = create_report(frame)
    report.save('dengue_report')
示例#3
0
def create_window_display(data_to_display: pd.DataFrame,
                          display_name: str) -> None:
    """
    Build a dataprep EDA report for a table and save it under STATIC_DIR.

    The report is created with the fixed title "Window Display".

    Args:
        data_to_display (pd.DataFrame): Table handed to ``create_report``
        display_name (str): Passed as the ``filename`` argument of ``save``

    Returns:
        None; the report is written via ``save`` with ``to=str(STATIC_DIR)``
    """
    report = create_report(data_to_display, title="Window Display")
    save_options = {"filename": f"{display_name}", "to": str(STATIC_DIR)}
    # The local report reference goes out of scope on return, so no
    # explicit cleanup is needed after saving.
    report.save(**save_options)
from matplotlib.pyplot import xticks
import numpy as np
import pandas as pd

import dataprep.eda as eda

# TidyTuesday 2021-05-04 water dataset, read straight from GitHub.
water = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv'
)

# Whole-table report, opened in the browser.
eda.create_report(water).show_browser()

# Single-column plot; the result is neither stored nor shown here.
eda.plot(water, 'water_source')

# water_source against two other columns, each opened in the browser.
for other_column in ('water_tech', 'country_name'):
    eda.plot(water, 'water_source', other_column).show_browser()
示例#5
0
from matplotlib.pyplot import xticks
import numpy as np
import pandas as pd
from dataprep.eda import plot

import matplotlib.pyplot as plt
import seaborn as sns
from dataprep.eda import create_report

# Load the TidyTuesday 2021-04-20 Netflix titles and work on a copy.
netflix_titles = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-20/netflix_titles.csv'
)
df = netflix_titles.copy()

# Whole-table report, opened in the browser.
create_report(df).show_browser()

df.columns
# Keep three columns and derive added_year from the last four characters
# of the whitespace-stripped date_added string.
df2 = df.loc[:, ['show_id', 'date_added', 'release_year']].copy()
df2['added_year'] = df2['date_added'].str.strip().str[-4:]
df2 = df2.dropna()
plot(df2, 'added_year', 'release_year')

# Box plot of release_year per added_year, with the x axis in ascending
# added_year order.
year_order = list(df2['added_year'].sort_values(ascending=True).unique())
plt.figure(figsize=(8, 6))
ax = sns.boxplot(data=df2,
                 x='added_year',
                 y='release_year',
                 order=year_order)
ax.set(
    title='Netflix: how many old titles added?',
    xlabel='Year added',
示例#6
0
# Summary statistics for df; the result is not stored. The bare string
# below is the author's reading of that output.
df.describe()
"""
Since this is a time series problem, I'm not really looking for quite the same things as if it were a regression problem e.g. skewness
Interesting to see that the variability differs between the 3 temp metrics.
I dont see any extreme min or max values that would indicate data anomalies.
See nothing for Snow which cant be right given i live in the area and distinctly remember plenty of snow over the years. We'll drop these.
"""

# Drop unneeded columns, part 3: remove the two snow columns.
cols_drop = ['snow', 'snowdepth']

# axis=1 makes drop() remove columns rather than rows.
df = df.drop(cols_drop, axis=1)

# Automatic EDA report via dataprep's create_report; the returned report
# object is not stored, shown or saved by this statement.
"""I Like pandas profiling library but lets try an alternative for this EDA"""
create_report(df, title='Weather_data_EDA')
"""
tempmax is skewed unlike other temps
precip shows that there is no rain one in three days. this is confirmed by the conditions columns. Love London!
winddir showing that the prevaling wind comes from the Atlantic as expected.
cloudcover showing that the will always be some clouds hanging about. Rare for it to be over 30% though.
Interesting inverse correlation between humidty and solar radiation. Otherwise the correlations amongst the other variables come at no surprise..
"""

# Time series plots: call plotSeries (defined outside this snippet) once
# for every column that remains in df.
for col in df.columns:
    plotSeries(df, col)
"""
Temperature behaving in accordance with seasons/time of year as expected.
Humidity is the inverse of temperature but more noisy.
Patterns in precipitation are not so easy to discern.
import numpy as np
import pandas as pd
from matplotlib.pyplot import xticks
import dataprep.eda as eda

# TidyTuesday 2021-05-11 county-level broadband data.
broadband = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband.csv')
# Replace the CSV headers with short names; everything below must refer to
# the columns by these names.
broadband.columns = ['ST', 'COUNTY_ID', 'COUNTY_NAME', 'AVAILABILITY', 'USAGE']

eda.create_report(broadband).show_browser()

# Plot by the renamed columns: the original CSV headers no longer exist on
# the frame after the rename above.
eda.plot(broadband, 'ST', 'AVAILABILITY').show_browser()
eda.plot(broadband, 'ST', 'USAGE').show_browser()

eda.plot_correlation(broadband).show_browser()

broadband_zip = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband_zip.csv')

broadband.loc[50:55,] # there is a - in data
# Remove rows where either measure is the '-' placeholder so that the
# columns can be converted to numbers.
broadband = broadband.drop(broadband[broadband.AVAILABILITY=='-'].index)
broadband = broadband.drop(broadband[broadband.USAGE=='-'].index)

broadband.info()
broadband['AVAILABILITY'] = pd.to_numeric(broadband['AVAILABILITY'])
broadband['USAGE'] = pd.to_numeric(broadband['USAGE'])

# Per-state means. Select the two numeric columns before aggregating so
# mean() is never applied to the string column COUNTY_NAME (which raises
# on pandas >= 2.0).
br = broadband.groupby('ST')[['AVAILABILITY', 'USAGE']].mean().reset_index()

import plotly.express as px
fig = px.scatter(br, x='AVAILABILITY', 
            y='USAGE', color='ST', opacity=0,
"""Loading the datasets"""

# Accepted loans: read the sample CSV into df1.
filepath1 = "/content/drive/MyDrive/Master Thesis/Lending Club Loan Data/accepted_sample_new.csv"
df1 = pd.read_csv(filepath1)

# Denied loans: read the sample CSV into df2.
filepath2 = "/content/drive/MyDrive/Master Thesis/Lending Club Loan Data/denied_sample_new.csv"
df2 = pd.read_csv(filepath2)

df1.head()

# Descriptive statistics: dataprep report for the accepted loans.
report1 = create_report(df1, title='My Report 1')

df2.shape

"""Data cleaning for the accepted dataset"""

# Average FICO score: midpoint of the low and high ends of the range.
df1['fico_score'] = df1['fico_range_low'].add(df1['fico_range_high']).div(2)

# Credit grade as a number: a single letter becomes its alphabet position
# (ord('A') - 64 == 1).
df1['grade'] = [ord(letter) - 64 for letter in df1['grade']]

# Natural log of annual income.
df1['log_income'] = np.log(df1['annual_inc'])

# Missing values (every call in this section is plot_missing)
plot_missing(df) # plots the amount and position of missing values, and their relationship between columns
plot_missing(df, 'tip') # plots the impact of the missing values in column x on all other columns
plot_missing(df, 'tip', 'total_bill') # plots the impact of the missing values from column x on column y in various ways

# Report: the bare string below is the author's summary of the report
# sections; it is an expression statement with no effect.
'''
Overview: detect the types of columns in a dataframe
Variables: variable type, unique values, distint count, missing values
Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
Text analysis for length, sample and letter
Correlations: highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices
Missing Values: bar chart, heatmap and spectrum of missing values
'''
from dataprep.eda import create_report
# The returned report object is not stored here; the commented lines below
# show how to keep it and then open or save it.
create_report(df, title='My Report')
# report = create_report(df, title='My Report')
# report.show_browser()
# report.save(filename='report_01', to='~/Desktop')


# Clean
'''
DataPrep.Clean provides functions for quickly and easily cleaning and validating your data.
    Column Headers
    Country Names
    Email Addresses
    Geographic Goordinates
    IP Addresses
    Phone Numbers
    URLs
示例#10
0
import numpy as np
import pandas as pd
from matplotlib.pyplot import xticks
import dataprep.eda as eda

# TidyTuesday 2021-05-18 salary survey, read straight from GitHub.
survey = pd.read_csv(
    'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv'
)

# NOTE: an assignment to `broadband.columns` used to sit here. `broadband`
# is never defined in this script, so that line raised NameError before
# any report was produced; it has been removed.

# Whole-table report, opened in the browser.
eda.create_report(survey).show_browser()

# Salary by industry, at two salary cut-offs.
eda.plot(survey[survey.annual_salary < 1000000], 'annual_salary',
         'industry').show_browser()

under_500k = survey[survey.annual_salary < 500000]
eda.plot(under_500k, 'annual_salary', 'industry').show_browser()

# Relabel the experience buckets with zero-padded, uniformly formatted
# names (presumably so they sort in order on the plot axis - confirm).
# '41 years or more' is left out because it mapped to itself.
experience_labels = {
    '5-7 years': '05-07 years',
    '2 - 4 years': '02-04 years',
    '21 - 30 years': '21-30 years',
    '11 - 20 years': '11-20 years',
    '1 year or less': '01 year or less',
    '8 - 10 years': '08-10 years',
    '31 - 40 years': '31-40 years',
}
eda.plot(under_500k.replace(to_replace=experience_labels), 'annual_salary',
         'years_of_experience_in_field').show_browser()
示例#11
0
"""
Pandas Profile
==============
"""
# Libraries
import pandas as pd

# Specific
from dataprep.eda import create_report

# Path of the CSV to profile
path = './data/dataset.csv'

# Load csv
data = pd.read_csv(path)

# Show
print(data)

# Create the dataprep EDA report for the loaded table
profile = create_report(data, title="Pandas Profiling Report")

# Save to file. The object returned by dataprep's create_report exposes
# save(), not pandas-profiling's to_file(), so to_file() raised
# AttributeError here. The name is given without an extension, matching
# dataprep's documented usage; dataprep is expected to add ".html" -
# confirm against the installed version.
profile.save("report")