Пример #1
0
    def compare(df_train, df_test, targetfeat, config):
        """Build a sweetviz train-vs-test comparison and save it as HTML.

        :param df_train: dataframe reported under the 'Train' label
        :param df_test: dataframe reported under the 'Test' label
        :param targetfeat: name of the target feature handed to sweetviz
        :param config: object whose ``modelFileKey`` attribute is used as the
            prefix of the report file name
        """
        # Correlations and other kinds of data associations can take a long
        # time to compute; setting this option to 'on' or 'off' decides
        # explicitly whether that pairwise analysis is performed.
        report = sv.compare(
            [df_train, 'Train'],
            [df_test, 'Test'],
            target_feat=targetfeat,
            pairwise_analysis='on',
        )

        report_path = './report/{}_CompareReport.html'.format(
            config.modelFileKey)
        report.show_html(filepath=report_path)
Пример #2
0
 def sweetviz_compare(self, test_data: pd.DataFrame,
                      target_col: str) -> None:
     """
     Write a sweetviz HTML report comparing ``self.data`` with ``test_data``.
     https://github.com/fbdesignpro/sweetviz
     :param test_data: The dataframe compared against the original dataset
     :param target_col: The name of the target column
     :return: None
     """
     original = [self.data, "Data"]
     other = [test_data, "Test"]
     comparison = sweetviz.compare(original, other, target_col)
     # An explicit file name is passed; leaving it out would make sweetviz
     # fall back to SWEETVIZ_REPORT.html.
     comparison.show_html("Report.html")
Пример #3
0
def make_sweetviz_report(df_train, df_val):
    """Create SweetViz HTML reports for the training set and for train vs. validation.

    Two files are written under the module-level ``reports_dir``:
    ``SweeetViz_train.html`` (training set only) and
    ``SweeetViz_compare.html`` (training vs. validation). Both reports use
    the column ``"rate"`` as the target feature.

    :param df_train: training dataframe
    :param df_val: validation dataframe
    """
    print("\nSweetViz report start...")

    # To inspect the active SweetViz settings, load "Override.ini" with
    # sweetviz.config_parser.read() and iterate over its sections/items.

    # Hide the logo in the generated pages.
    sweetviz.config_parser.set(section="Layout", option="show_logo", value="0")
    feature_config = sweetviz.FeatureConfig(skip=None, force_text=None)

    #%% Create and save SweetViz report for train set
    sv_train = sweetviz.analyze(df_train,
                                target_feat="rate",
                                feat_cfg=feature_config)
    # NOTE: the file names keep their historical "SweeetViz" spelling so that
    # anything looking for the existing output files keeps working.
    sv_train.show_html(reports_dir / "SweeetViz_train.html")

    #%% Comparing two datasets (Validation vs Training sets)
    sv_compare = sweetviz.compare([df_train, "TRAIN"], [df_val, "VALID"],
                                  target_feat="rate",
                                  feat_cfg=feature_config)
    sv_compare.show_html(reports_dir / "SweeetViz_compare.html")
Пример #4
0
 def generate_sv(train, test):
     """Compare ``train`` with ``test`` via sweetviz and save ``sv.html`` under ``save_path``."""
     labelled_train = [train, "train"]
     labelled_test = [test, "test"]
     comparison = sv.compare(labelled_train, labelled_test)
     destination = Path(save_path) / "sv.html"
     comparison.show_html(destination)
Пример #5
0
Файл: titan.py Проект: lokcyi/AI
# Per-feature overrides shared by every report below.
feature_config = sv.FeatureConfig(
    skip='Name',  # feature to leave out
    force_cat=[  # features treated as categorical
        'Pclass',
        'Sex',
        'SibSp',
        'Parch',
        'Ticket',
        'Cabin',
    ],
    force_num=['Age', 'Fare'],  # features treated as numerical
    force_text=None,  # features treated as text
)

# Report on the training data alone, with 'Survived' as the target feature.
report_train_with_target = sv.analyze([train, 'train'],
                                      target_feat='Survived',
                                      feat_cfg=feature_config)
report_train_with_target.show_html('Basic_train_report_with_target.html')

# Training data vs. test data, using compare().
compare_report = sv.compare([train, 'Training Data'],
                            [test, 'Test Data'],
                            target_feat='Survived',
                            feat_cfg=feature_config)
compare_report.show_html('Compare_train_test_report.html')

# Split the training data with a boolean condition and compare both halves;
# the two names label the resulting sub-datasets.
compare_subsets_report = sv.compare_intra(train,
                                          train['Sex'] == 'male',
                                          ['Male', 'Female'],
                                          target_feat='Survived',
                                          feat_cfg=feature_config)
compare_subsets_report.show_html('Compare_male_female_report.html')
Пример #6
0
def _render_sweetviz_report(report):
    """Write ``report`` to sweetviz's default HTML file and embed it in the page."""
    report.show_html(layout='vertical', open_browser=True)

    # show_html() is called without a file path, so the report is read back
    # from the default output file. The context manager closes the handle.
    with open("SWEETVIZ_REPORT.html", 'r', encoding='utf-8') as report_file:
        source_code = report_file.read()

    # display html page in streamlit
    components.html(source_code, height=600, scrolling=True)


def structured_data_app():
    """Streamlit page that analyses one input file or compares two CSV files.

    The user picks the input method in the sidebar and the kind of analysis
    in the main area. A preview of the data is shown, a sweetviz report is
    generated and the resulting HTML is embedded in the page.
    """
    st.write("Welcome to the DQW for structured data analysis. ",
             "Structured data analysis is an important step ",
             "in AI model development or Data Analysis. This app ",
             "offers visualisation of descriptive statistics of a ",
             "csv input file by using the sweetviz package.",
             " You can pick to analyse only 1 file or compare 2.")

    # Side panel setup
    # Step 1 includes Uploading
    display_app_header(main_txt="Step 1",
                       sub_txt="Upload data",
                       is_sidebar=True)

    data_input_mthd = st.sidebar.radio(
        "Select Data Input Method",
        ('Upload a CSV file', 'Upload a json file'))

    selected_structure = st.selectbox("Choose type of analysis",
                                      ("Analyse 1 file", "Compare 2 files"))

    if selected_structure == "Compare 2 files":

        st.subheader('Choose data to analyse :alembic:')

        uploaded_files = st.file_uploader("Upload CSVs to compare",
                                          type="csv",
                                          accept_multiple_files=True)

        data = []
        for uploaded in uploaded_files or []:
            data.append(pd.read_csv(uploaded))
            # Rewind the upload buffer after pandas has consumed it.
            uploaded.seek(0)

        # The page is rendered before the user has uploaded anything, so
        # wait for two files instead of failing on data[0] / data[1].
        if len(data) < 2:
            st.warning("Please upload two CSV files to compare.")
            return

        st.subheader(
            'A preview of input files is below, please wait for data to be compared :bar_chart:'
        )
        st.write(data[0].head(5))
        st.write(data[1].head(5))

        # Only the first two uploaded files take part in the comparison.
        my_report = sv.compare([data[0], "Input file 1"],
                               [data[1], "Input file 2"])
        _render_sweetviz_report(my_report)

    if selected_structure == "Analyse 1 file":

        st.subheader('Choose data to analyse :alembic:')
        data, txt = check_input_method(data_input_mthd)

        st.subheader(
            'A preview of input data is below, please wait for data to be analyzed :bar_chart:'
        )
        st.write(data.head(5))

        my_report = sv.analyze(data)
        _render_sweetviz_report(my_report)
Пример #7
0
def test_train_split(csv):
    """Split a CSV file 80/20 and write a sweetviz train-vs-test report.

    The report is saved as ``splitreport.html``.

    :param csv: path (or file-like object) accepted by ``pd.read_csv``
    """
    df = pd.read_csv(csv)
    train, test = train_test_split(df, test_size=0.20, random_state=42)
    # ``train`` is the 80% split and ``test`` the 20% split; each frame is
    # paired with the label that matches its role.
    my_report = sv.compare([train, "Training Data"], [test, "Test Data"])
    my_report.show_html("splitreport.html")
Пример #8
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 17 20:17:22 2020

@author: hirokawaeiji
"""

import pandas as pd
import sweetviz as sv

# Load the data; the first column of each file becomes the index.
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

# Compare both sets with "SalePrice" as the target feature, then write the
# report (no output path is passed to show_html()).
my_report = sv.compare(
    [train, "Training Data"],
    [test, "Test Data"],
    target_feat="SalePrice",
)
my_report.show_html()
Пример #9
0
########################################################################################################################

### Want to compare 
# 	provider_id	hospital_name
#   	 330393	   SUNY/STONY BROOK UNIVERSITY HOSPITAL

# Create stonybrook datasets: keep only the rows whose provider_id is 330393.
# NOTE(review): the hospital frame is filtered with the string '330393' while
# the inpatient/outpatient frames use the integer 330393 - presumably the
# column dtypes differ between the frames; confirm against how they are loaded.
sb_hospital = df_hospital_2[df_hospital_2['provider_id'] == '330393']
sb_inpatient = df_inpatient_2[df_inpatient_2['provider_id'] == 330393]
sb_outpatient = df_outpatient_2[df_outpatient_2['provider_id'] == 330393]


# Single-dataset sweetviz report for the Stony Brook inpatient rows.
sb_inpatient_analysis = sv.analyze(sb_inpatient)
sb_inpatient_analysis.show_html('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_df_inpatient_sb.html')

# Single-dataset sweetviz report for the Stony Brook outpatient rows.
sb_outpatient_analysis = sv.analyze(sb_outpatient)
sb_outpatient_analysis.show_html('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_df_outpatient_sb.html')


########################################################################################################################
########################################################################################################################
########################################################################################################################


# Complementary datasets: every row whose provider_id is NOT 330393.
nonsb_hospital = df_hospital_2[df_hospital_2['provider_id'] != '330393']
nonsb_inpatient = df_inpatient_2[df_inpatient_2['provider_id'] != 330393]
nonsb_outpatient = df_outpatient_2[df_outpatient_2['provider_id'] != 330393]


# Compare Stony Brook inpatient rows against all other inpatient rows.
my_report = sv.compare([sb_inpatient, "Inpatient_SB"], [nonsb_inpatient, "Inpatient_NonSB"])
my_report.show_html('/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_inpatient_compare.html')
Пример #10
0
# load data
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# generate report
# Only the three selected columns of the training data are analysed;
# 'PassengerId' is additionally skipped through the feature config.
sel_cols = ['PassengerId', 'Survived', 'Age']
in_args = {
    'source': [df_train[sel_cols], 'Train'],
    'target_feat': 'Survived',
    'feat_cfg': sv.FeatureConfig(skip="PassengerId"),
    'pairwise_analysis': 'on'
}
titanic_report = sv.analyze(**in_args)
titanic_report.show_html('doc/titanic_report.html')

# Compare
# Full training frame vs. full test frame with 'Survived' as target.
my_report = sv.compare([df_train, "Train"], [df_test, "Test"], "Survived")
my_report.show_html('doc/titanic_report_train_test.html')

# Some eda
# The expressions below are not assigned or printed; they only display a
# result when evaluated interactively (e.g. in a notebook or REPL).
# Training set: passenger count and mean of 'Survived' per group.
df_train.groupby(['Pclass']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['Sex']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['Fare']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['SibSp']).agg({'PassengerId': 'count', 'Survived': 'mean'})
# Number of training rows in each of three 'Fare' bins.
pd.cut(df_train['Fare'], bins=3).value_counts()

# Test set: passenger count per group.
df_test.groupby(['SibSp']).agg({'PassengerId': 'count'})
df_test.groupby(['Pclass']).agg({'PassengerId': 'count'})
df_test.groupby(['Sex']).agg({'PassengerId': 'count'})
Пример #11
0
def output_sweetviz(tra, val, output_filename="Report"):
    """Save a sweetviz comparison of ``tra`` and ``val`` as ``<output_filename>.html``.

    ``tra`` is labelled "Train" and ``val`` is labelled "Test" in the report.
    """
    labelled_frames = ([tra, "Train"], [val, "Test"])
    comparison = sv.compare(*labelled_frames)
    html_path = output_filename + ".html"
    comparison.show_html(html_path)
Пример #12
0
# Load the data
from sklearn.datasets import load_iris
iris_dataset = load_iris()
X = iris_dataset.data
Y = iris_dataset.target

import pandas as pd
# Only the feature matrix goes into the dataframe; Y is not used below.
df = pd.DataFrame(X)

# Automated EDA libraries
## 1. Sweetviz

import sweetviz as sv

# Report on the slice df[0:100] alone, labelled 'train'.
advert_report = sv.analyze([df[0:100], 'train'])  # sv.analyze([dataframe, name])
advert_report.show_html('./sweetviz_iris.html')

# Compare the slice df[0:100] ('train') with the slice df[100:150] ('test').
advert_report2 = sv.compare([df[0:100], 'train'], [df[100:150], 'test'])
advert_report2.show_html('./sweetviz_iris2.html')

## 2. Pandas_profiling

import pandas_profiling

# Profile the whole dataframe and save the result as an HTML file.
profile = df.profile_report()
profile.to_file(output_file="./profiling_iris.html")
Пример #13
0
drive.mount('/content/drive/')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install sweetviz==1.0a7
import sweetviz
train=pd.read_csv("/content/drive/My Drive/data/server/Train.csv")
test=pd.read_csv("/content/drive/My Drive/data/server/Test.csv")
#from google.colab import output
#output.clear()
#Performing sweetviz which will show all eda process
##Analyzing a single dataframe (and its optional target feature)
my_report1 = sweetviz.analyze([train, "Train"],target_feat='MULTIPLE_OFFENSE')
my_report1.show_html("Report.html") 
#Comparing two dataframes (e.g. Test vs Training sets)
my_report2 = sweetviz.compare([train, "Train"], [test, "Test"], "MULTIPLE_OFFENSE")
my_report2.show_html("Report.html")
train.head()
test.head()
#feature preprocessing
train.drop(train.columns[[0,1]], axis = 1, inplace = True)
test.drop(test.columns[[0,1]], axis = 1, inplace = True)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(train))
print(scaler.fit(train))
train.X_12.value_counts
train.X_12.isna().sum()
test.X_12.value_counts
test.X_12.isna().sum()
#imputing missing values
Пример #14
0
    if ".pdf" in i:
        pdf_name = i

# Only act when a PDF file name has been found beforehand.
if pdf_name is not None:
    if pdf_name in os.listdir():
        # The PDF is in the current directory: extract its tables and also
        # convert them to "output.csv".
        df = tabula.read_pdf(pdf_name, pages='all')
        tabula.convert_into(pdf_name,
                            "output.csv",
                            output_format="csv",
                            pages='all')
        print(df)
    else:
        # The PDF is not in the current directory: analyse "output.csv".
        df = pd.read_csv('output.csv')
        bank_report = sv.analyze(df)
        # NOTE(review): both show_html() calls are commented out, so this
        # branch does not write 'bank.html' / 'Compare.html' itself -
        # presumably those files were generated once and kept; confirm.
        # bank_report.show_html('bank.html')
        # Rows from position 5000 onwards compared with the first 5000 rows.
        df1 = sv.compare(df[5000:], df[:5000])
        # df1.show_html('Compare.html')


@app.route('/')
def home():
    """Serve the 'Compare.html' template at the site root."""
    page = render_template('Compare.html')
    return page


@app.route('/bank')
def bank():
    """Serve the 'bank.html' template at '/bank'."""
    page = render_template('bank.html')
    return page


# Start the web application only when this file is executed as a script.
# NOTE(review): debug=True is passed to app.run(); presumably this is meant
# for local development only - confirm it is not used for deployment.
if __name__ == "__main__":
    app.run(debug=True)
import sweetviz as sv

# Load the training and test sets from the input directory.
data_dir = './input'
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# English column names, in file order; the last entry is the target column.
col_name = [
    'ID',
    'Type',
    'Region',
    'City Code',
    'Prefecture Name',
    'City Name',
    'District Name',
    'Nearest Station: Name',
    'Nearest station: distance',
    'Floor plan',
    'Area',
    'Land shape',
    'Frontage',
    'Total floor area',
    'Year built',
    'Building structure',
    'Use',
    'Future use',
    'Frontal road: Direction',
    'Frontal road: Type',
    'Frontal road: Width',
    'City planning',
    'Building coverage',
    'Floor-area ratio',
    'Date of transaction',
    'Renovation',
    'Circumstances of the transaction',
    'Transaction price_log',
]

# The test set gets every name except the trailing target column.
train.columns = col_name
test.columns = col_name[:-1]

# Replace the textual top bucket with the number 2000, then cast to int.
train['Area'] = train['Area'].replace('2000㎡以上', 2000).astype(int)

skip_cols = ["ID"]
target_col = 'Transaction price_log'

# Compare train and test, skipping the ID column.
feature_config = sv.FeatureConfig(skip=skip_cols)
my_report = sv.compare(
    [train, "Training Data"],
    [test, "Test Data"],
    target_feat=target_col,
    feat_cfg=feature_config,
)

my_report.show_html()
import pandas as pd

# In[12]:

# Load the data
df = pd.read_csv(r'D:\pythonworkspace\breastcancer.csv')

# In[21]:

# Check the number of rows and columns in the data
df.shape

# In[17]:

# Analyze the data loaded in df with sweetviz
snsreport = sv.analyze(df)

# In[18]:

# Write the report of the analyzed data to 'breastcancer.html'
snsreport.show_html('breastcancer.html')

# In[19]:

# Subsets of rows of the same data can also be compared to extract insights:
# here the rows from position 100 onwards are compared with the first 100 rows.
# NOTE: despite its name, df1 holds the sweetviz report, not a dataframe.

df1 = sv.compare(df[100:], df[:100])
df1.show_html('compare.html')

# In[ ]:
Пример #17
0
# Comparison within a single dataframe (`compare_intra()` function)

# Add a month label: 4 for rows dated before 2021-05-01, otherwise 5
df2['月'] = np.where(df2['日時'] < '2021-05-01', 4, 5)

# Keep only the numeric columns of the dataframe
# (per the original author, compare_intra() raises an error when
# string-typed columns are present)
df2 = df2.select_dtypes(include='number')

# Create the Sweetviz report object, splitting the rows on month == 4
my_report3 = sv.compare_intra(df2, df2['月'] == 4, ['4月', '5月'])

# Generate the report
my_report3.show_html(filepath='sweetviz_report03.html',
                     open_browser=False, layout='vertical')


# Comparison between dataframes (`compare()` function)

# Split the dataframe: a 70% sample for training, the remaining rows for test
df1_train = df1.sample(frac=0.7, random_state=334)
df1_test = df1.drop(df1_train.index)

# Create the Sweetviz report object with 'target' as the target feature
my_report4 = sv.compare([df1_train, 'Training'], [df1_test, 'Test'], 'target')

# Generate the report
my_report4.show_html(filepath='sweetviz_report04.html',
                     open_browser=False, layout='vertical')
Пример #18
0
    x_train_txt_label_encode[:, i-1] = labelencoder_X.fit_transform(x_train_txt_label_encode[:, i-1])

# Label-encode the columns of the test text features, one column at a time.
# The column index is i-1, so the first iteration (i = 0) addresses column -1.
# NOTE(review): fit_transform() re-fits labelencoder_X on every test column,
# so the integer codes are not guaranteed to match the ones produced for the
# training columns - confirm this is intended.
for i in range(np.shape(x_test_txt_label_encode)[1]):
    x_test_txt_label_encode[:, i-1] = labelencoder_X.fit_transform(x_test_txt_label_encode[:, i-1])

# Hot Encode Categorical Variables
#x_train_txt_hot_encode = pd.get_dummies(data=x_train_txt_avg.iloc[:, list(range(x_train_txt_encode_split, len(x_train_txt_avg.columns)))], columns=x_train_txt_hot_encode_col)
#x_train_txt_hot_encoded_col = list(x_train_txt_hot_encode.columns)
#x_train_txt_hot_encode = x_train_txt_hot_encode.values
#x_test_txt_hot_encode = pd.get_dummies(data=x_test_txt_avg.iloc[:, list(range(x_test_txt_encode_split, len(x_test_txt_avg.columns)))], columns=x_test_txt_hot_encode_col)
#x_test_txt_hot_encoded_col = list(x_test_txt_hot_encode.columns)
#x_test_txt_hot_encode = x_test_txt_hot_encode.values

# Put the numeric features and the label-encoded text features side by side.
x_train_Imp_En = pd.concat([pd.DataFrame(x_train_num_avg), pd.DataFrame(x_train_txt_label_encode)], axis=1)  # Update with Hot Encode Data if available
x_test_Imp_En = pd.concat([pd.DataFrame(x_test_num_avg), pd.DataFrame(x_test_txt_label_encode)], axis=1)  # Update with Hot Encode Data if available


# Feature Scaling
# The feature scaler is fitted on the training features and then applied,
# without re-fitting, to the test features.
sc_x = StandardScaler()
x_train_Imp_En_Fs = sc_x.fit_transform(x_train_Imp_En)
x_test_Imp_En_Fs = sc_x.transform(x_test_Imp_En)
# A separate scaler is fitted on the training target.
sc_y = StandardScaler()
y_train_Imp_En = y_train
y_train_Imp_En_Fs = sc_y.fit_transform(y_train_Imp_En)


# EDA Analysis with required data (update with necessary processed data based on the need)
# The report is built from train_data / test_data, not from the encoded and
# scaled arrays created above.
EDA = sweetviz.compare([train_data, "Train"], [test_data, "Test"], "Survived")
EDA.show_html("Titanic Data EDA.html")