def compare(df_train, df_test, targetfeat, config):
    """Build a Sweetviz train-vs-test comparison and save it as an HTML file.

    :param df_train: training dataset, labelled 'Train' in the report
    :param df_test: test dataset, labelled 'Test' in the report
    :param targetfeat: target feature name forwarded to ``sv.compare``
    :param config: object whose ``modelFileKey`` attribute is used as the
        prefix of the output file name
    """
    train_source = [df_train, 'Train']
    test_source = [df_test, 'Test']

    # Correlations and other pairwise associations can take a long time to
    # compute. Per the original note, past a certain threshold this argument
    # has to be set explicitly to 'on' or 'off'; it is always 'on' here.
    report = sv.compare(
        train_source,
        test_source,
        target_feat=targetfeat,
        pairwise_analysis='on',
    )

    output_path = './report/{}_CompareReport.html'.format(config.modelFileKey)
    report.show_html(filepath=output_path)
def sweetviz_compare(self, test_data: pd.DataFrame, target_col: str) -> None:
    """
    Write a sweetviz HTML report comparing ``self.data`` with ``test_data``.

    https://github.com/fbdesignpro/sweetviz

    :param test_data: The dataset compared against ``self.data`` - Dataframe
    :param target_col: The name of the target column - str
    :return: None
    """
    original = [self.data, "Data"]
    other = [test_data, "Test"]
    comparison = sweetviz.compare(original, other, target_col)

    # An explicit file name is given; without one sweetviz falls back to
    # SWEETVIZ_REPORT.html.
    comparison.show_html("Report.html")
def make_sweetviz_report(df_train, df_val):
    """Write two SweetViz HTML reports into ``reports_dir``.

    One report analyzes ``df_train`` on its own; the other compares
    ``df_train`` (labelled "TRAIN") with ``df_val`` (labelled "VALID").
    Both use "rate" as the target feature.
    """
    print("\nSweetViz report start...")

    # To inspect the effective SweetViz configuration, iterate over
    # sweetviz.config_parser.sections() / .items(section) after loading an
    # override file with sweetviz.config_parser.read("Override.ini").

    # Set Layout.show_logo to "0" before any report is rendered.
    sweetviz.config_parser.set(section="Layout", option="show_logo", value="0")

    feature_config = sweetviz.FeatureConfig(skip=None, force_text=None)
    target = "rate"

    # NOTE: the "SweeetViz" spelling in the file names below is kept as-is;
    # other code may look for these exact paths.

    # Report for the training set alone.
    train_report = sweetviz.analyze(
        df_train, target_feat=target, feat_cfg=feature_config
    )
    train_report.show_html(reports_dir / "SweeetViz_train.html")

    # Training set vs. validation set.
    comparison_report = sweetviz.compare(
        [df_train, "TRAIN"],
        [df_val, "VALID"],
        target_feat=target,
        feat_cfg=feature_config,
    )
    comparison_report.show_html(reports_dir / "SweeetViz_compare.html")
def generate_sv(train, test):
    """Compare ``train`` and ``test`` with Sweetviz and save ``sv.html``.

    The report is written inside the directory named by ``save_path``.
    """
    comparison = sv.compare([train, "train"], [test, "test"])
    destination = Path(save_path) / "sv.html"
    comparison.show_html(destination)
# Per-feature handling rules shared by every report below.
feature_config = sv.FeatureConfig(
    skip='Name',  # feature to leave out
    force_cat=['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin'],  # categorical features
    force_num=['Age', 'Fare'],  # numerical features
    force_text=None,  # text features
)

# 1) Training set on its own, with 'Survived' as the target feature.
report_train_with_target = sv.analyze(
    [train, 'train'],
    target_feat='Survived',
    feat_cfg=feature_config,
)
report_train_with_target.show_html('Basic_train_report_with_target.html')

# 2) Training set vs. test set.
compare_report = sv.compare(
    [train, 'Training Data'],
    [test, 'Test Data'],
    target_feat='Survived',
    feat_cfg=feature_config,
)
compare_report.show_html('Compare_train_test_report.html')

# 3) Two subsets of the training set, split by a boolean condition.
compare_subsets_report = sv.compare_intra(
    train,
    train['Sex'] == 'male',  # condition used to split the rows
    ['Male', 'Female'],  # names of the two sub-datasets
    target_feat='Survived',
    feat_cfg=feature_config,
)
compare_subsets_report.show_html('Compare_male_female_report.html')
def _show_sweetviz_report(report):
    """Render a sweetviz report to HTML and embed it in the Streamlit page."""
    # No file path is passed, so the page is read back from the
    # SWEETVIZ_REPORT.html file that show_html() writes.
    report.show_html(layout='vertical', open_browser=True)
    # Context manager so the file handle is closed after reading.
    with open("SWEETVIZ_REPORT.html", 'r', encoding='utf-8') as report_file:
        source_code = report_file.read()
    # display html page in streamlit
    components.html(source_code, height=600, scrolling=True)


def structured_data_app():
    """Streamlit page for descriptive statistics of structured data.

    Lets the user either analyse a single input file or compare two uploaded
    CSV files; the resulting sweetviz report is embedded in the page.
    """
    st.write("Welcome to the DQW for structured data analysis. ",
             "Structured data analysis is an important step ",
             "in AI model development or Data Analysis. This app ",
             "offers visualisation of descriptive statistics of a ",
             "csv input file by using the sweetviz package.",
             " You can pick to analyse only 1 file or compare 2.")

    # Side panel setup
    # Step 1 includes Uploading
    display_app_header(main_txt="Step 1",
                       sub_txt="Upload data",
                       is_sidebar=True)

    data_input_mthd = st.sidebar.radio(
        "Select Data Input Method",
        ('Upload a CSV file', 'Upload a json file'))

    selected_structure = st.selectbox("Choose type of analysis",
                                      ("Analyse 1 file", "Compare 2 files"))

    if selected_structure == "Compare 2 files":
        st.subheader('Choose data to analyse :alembic:')
        uploaded_files = st.file_uploader("Upload CSVs to compare",
                                          type="csv",
                                          accept_multiple_files=True)

        data = []
        for uploaded in uploaded_files:
            dataframe = pd.read_csv(uploaded)
            # Rewind so the upload can be read again later.
            uploaded.seek(0)
            data.append(dataframe)

        # Nothing (or only one file) has been uploaded yet: indexing
        # data[0] / data[1] below would raise IndexError, so ask for the
        # files instead of crashing the page.
        if len(data) < 2:
            st.warning("Please upload two CSV files to compare.")
            return

        st.subheader(
            'A preview of input files is below, please wait for data to be compared :bar_chart:'
        )
        st.write(data[0].head(5))
        st.write(data[1].head(5))

        # Only the first two uploads take part in the comparison.
        my_report = sv.compare([data[0], "Input file 1"],
                               [data[1], "Input file 2"])
        _show_sweetviz_report(my_report)

    elif selected_structure == "Analyse 1 file":
        st.subheader('Choose data to analyse :alembic:')
        data, _ = check_input_method(data_input_mthd)

        st.subheader(
            'A preview of input data is below, please wait for data to be analyzed :bar_chart:'
        )
        st.write(data.head(5))

        my_report = sv.analyze(data)
        _show_sweetviz_report(my_report)
def test_train_split(csv):
    """Split a CSV dataset 80/20 and write a Sweetviz train-vs-test report.

    :param csv: path or buffer accepted by ``pd.read_csv``
    :return: None; the report is saved as ``splitreport.html``
    """
    df = pd.read_csv(csv)
    train, test = train_test_split(df, test_size=0.20, random_state=42)

    # Each split is paired with the label that matches its role. Previously
    # the frames were swapped, so the 20% test split was reported as
    # "Training Data" and the 80% train split as "Test Data".
    my_report = sv.compare([train, "Training Data"], [test, "Test Data"])
    my_report.show_html("splitreport.html")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 17 20:17:22 2020

@author: hirokawaeiji
"""

import pandas as pd
import sweetviz as sv

# Load the data; the first CSV column becomes the index of each frame.
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

# Compare the two frames with "SalePrice" as the target feature and render
# the report with show_html()'s default arguments.
my_report = sv.compare(
    [train, "Training Data"],
    [test, "Test Data"],
    target_feat="SalePrice",
)
my_report.show_html()
########################################################################################################################
### Provider of interest
#   provider_id    hospital_name
#   330393         SUNY/STONY BROOK UNIVERSITY HOSPITAL

# Stony Brook subsets of each table.
# NOTE(review): provider_id is matched as a string in the hospital table but
# as an integer in the inpatient/outpatient tables - presumably the column
# dtypes differ between the tables; confirm.
sb_hospital = df_hospital_2.loc[df_hospital_2['provider_id'] == '330393']
sb_inpatient = df_inpatient_2.loc[df_inpatient_2['provider_id'] == 330393]
sb_outpatient = df_outpatient_2.loc[df_outpatient_2['provider_id'] == 330393]

# Stand-alone reports for the Stony Brook inpatient and outpatient data.
sb_inpatient_analysis = sv.analyze(sb_inpatient)
sb_inpatient_analysis.show_html(
    filepath='/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_df_inpatient_sb.html'
)

sb_outpatient_analysis = sv.analyze(sb_outpatient)
sb_outpatient_analysis.show_html(
    filepath='/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_df_outpatient_sb.html'
)

########################################################################################################################
# Every other provider, for comparison against Stony Brook.
nonsb_hospital = df_hospital_2.loc[df_hospital_2['provider_id'] != '330393']
nonsb_inpatient = df_inpatient_2.loc[df_inpatient_2['provider_id'] != 330393]
nonsb_outpatient = df_outpatient_2.loc[df_outpatient_2['provider_id'] != 330393]

# Inpatient data: Stony Brook vs. all other providers.
my_report = sv.compare(
    [sb_inpatient, "Inpatient_SB"],
    [nonsb_inpatient, "Inpatient_NonSB"],
)
my_report.show_html(
    filepath='/Users/hantswilliams/Dropbox/Biovirtua/Python_Projects/ahi/AHI_STATS_507/Week13_Summary/output/sweet_report_inpatient_compare.html'
)
# Read the train and test CSV files.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Single-dataset report restricted to a few columns of the training data.
sel_cols = ['PassengerId', 'Survived', 'Age']
in_args = dict(
    source=[df_train[sel_cols], 'Train'],
    target_feat='Survived',
    feat_cfg=sv.FeatureConfig(skip="PassengerId"),
    pairwise_analysis='on',
)
titanic_report = sv.analyze(**in_args)
titanic_report.show_html(filepath='doc/titanic_report.html')

# Train vs. test comparison with 'Survived' as the target feature.
my_report = sv.compare(
    [df_train, "Train"],
    [df_test, "Test"],
    target_feat="Survived",
)
my_report.show_html(filepath='doc/titanic_report_train_test.html')

# Ad-hoc EDA. The results are not assigned, so they are only visible when
# these lines are run interactively.
# Training set: passenger count and mean of 'Survived' per group.
df_train.groupby(['Pclass']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['Sex']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['Fare']).agg({'PassengerId': 'count', 'Survived': 'mean'})
df_train.groupby(['SibSp']).agg({'PassengerId': 'count', 'Survived': 'mean'})

# Number of training rows in each of three 'Fare' bins.
pd.cut(df_train['Fare'], bins=3).value_counts()

# Test set: passenger count per group.
df_test.groupby(['SibSp']).agg({'PassengerId': 'count'})
df_test.groupby(['Pclass']).agg({'PassengerId': 'count'})
df_test.groupby(['Sex']).agg({'PassengerId': 'count'})
def output_sweetviz(tra, val, output_filename="Report"):
    """Write a Sweetviz comparison of ``tra`` and ``val`` to an HTML file.

    ``tra`` is labelled "Train" and ``val`` is labelled "Test" in the report.
    The output path is ``output_filename`` with ".html" appended.
    """
    labelled_train = [tra, "Train"]
    labelled_val = [val, "Test"]
    comparison = sv.compare(labelled_train, labelled_val)

    html_path = output_filename + ".html"
    comparison.show_html(html_path)
import pandas as pd
import pandas_profiling
import sweetviz as sv
from sklearn.datasets import load_iris

# Load the data
iris_dataset = load_iris()
X = iris_dataset.data
Y = iris_dataset.target

df = pd.DataFrame(X)

# Automated EDA libraries

## 1. Sweetviz
# sv.analyze([dataframe, name]): report on the first 100 rows only.
advert_report = sv.analyze([df.iloc[:100], 'train'])
advert_report.show_html(filepath='./sweetviz_iris.html')

# First 100 rows ('train') compared with rows 100-149 ('test').
advert_report2 = sv.compare([df.iloc[:100], 'train'], [df.iloc[100:150], 'test'])
advert_report2.show_html(filepath='./sweetviz_iris2.html')

## 2. Pandas_profiling
profile = df.profile_report()
profile.to_file(output_file="./profiling_iris.html")
drive.mount('/content/drive/') import pandas as pd import numpy as np import matplotlib.pyplot as plt !pip install sweetviz==1.0a7 import sweetviz train=pd.read_csv("/content/drive/My Drive/data/server/Train.csv") test=pd.read_csv("/content/drive/My Drive/data/server/Test.csv") #from google.colab import output #output.clear() #Performing sweetviz which will show all eda process ##Analyzing a single dataframe (and its optional target feature) my_report1 = sweetviz.analyze([train, "Train"],target_feat='MULTIPLE_OFFENSE') my_report1.show_html("Report.html") #Comparing two dataframes (e.g. Test vs Training sets) my_report2 = sweetviz.compare([train, "Train"], [test, "Test"], "MULTIPLE_OFFENSE") my_report2.show_html("Report.html") train.head() test.head() #feature preprocessing train.drop(train.columns[[0,1]], axis = 1, inplace = True) test.drop(test.columns[[0,1]], axis = 1, inplace = True) from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() print(scaler.fit(train)) print(scaler.fit(train)) train.X_12.value_counts train.X_12.isna().sum() test.X_12.value_counts test.X_12.isna().sum() #imputing missing values
if ".pdf" in i: pdf_name = i if pdf_name != None: if pdf_name in os.listdir(): df = tabula.read_pdf(pdf_name, pages='all') tabula.convert_into(pdf_name, "output.csv", output_format="csv", pages='all') print(df) else: df = pd.read_csv('output.csv') bank_report = sv.analyze(df) # bank_report.show_html('bank.html') df1 = sv.compare(df[5000:], df[:5000]) # df1.show_html('Compare.html') @app.route('/') def home(): return render_template('Compare.html') @app.route('/bank') def bank(): return render_template('bank.html') if __name__ == "__main__": app.run(debug=True)
import sweetviz as sv

data_dir = './input'
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# Column names assigned to both frames, in file order. The last entry is the
# target and is only applied to the training data.
col_name = [
    'ID',
    'Type',
    'Region',
    'City Code',
    'Prefecture Name',
    'City Name',
    'District Name',
    'Nearest Station: Name',
    'Nearest station: distance',
    'Floor plan',
    'Area',
    'Land shape',
    'Frontage',
    'Total floor area',
    'Year built',
    'Building structure',
    'Use',
    'Future use',
    'Frontal road: Direction',
    'Frontal road: Type',
    'Frontal road: Width',
    'City planning',
    'Building coverage',
    'Floor-area ratio',
    'Date of transaction',
    'Renovation',
    'Circumstances of the transaction',
    'Transaction price_log',
]
train.columns = col_name
test.columns = col_name[:-1]

# Replace the literal '2000㎡以上' ("2000 m2 or more") in 'Area' with 2000,
# then cast the column to int.
# NOTE(review): only the training frame is converted; confirm whether the
# test frame needs the same treatment.
train['Area'] = train['Area'].replace('2000㎡以上', 2000).astype(int)

skip_cols = ["ID"]
target_col = 'Transaction price_log'
feature_config = sv.FeatureConfig(skip=skip_cols)

# Train vs. test comparison, rendered with show_html()'s default arguments.
my_report = sv.compare(
    [train, "Training Data"],
    [test, "Test Data"],
    target_feat=target_col,
    feat_cfg=feature_config,
)
my_report.show_html()
import pandas as pd


# In[12]:


# Read the dataset from disk.
df = pd.read_csv(r'D:\pythonworkspace\breastcancer.csv')


# In[21]:


# (rows, columns) of the loaded data.
df.shape


# In[17]:


# Let sweetviz analyze the whole frame.
snsreport = sv.analyze(df)


# In[18]:


# Write the analysis report to an HTML file.
snsreport.show_html(filepath='breastcancer.html')


# In[19]:


# Row subsets of the same frame can be compared too: everything from row 100
# onward against the first 100 rows.
df1 = sv.compare(df.iloc[100:], df.iloc[:100])
df1.show_html(filepath='compare.html')


# In[ ]:
# Comparison within a single DataFrame (`compare_intra()`)

# Add a month label: 4 where '日時' is earlier than '2021-05-01', otherwise 5.
df2['月'] = np.where(df2['日時'] < '2021-05-01', 4, 5)

# Keep only the numeric columns. Per the original note, compare_intra()
# raises an error when string-typed columns are present.
df2 = df2.select_dtypes(include='number')

# Build the report: rows with month label 4 versus the remaining rows.
my_report3 = sv.compare_intra(df2, df2['月'] == 4, ['4月', '5月'])

# Write the report to disk without opening a browser.
my_report3.show_html(
    filepath='sweetviz_report03.html',
    open_browser=False,
    layout='vertical',
)

# Comparison between two DataFrames (`compare()`)

# Split: a reproducible 70% sample for training, the remaining rows for test.
df1_train = df1.sample(frac=0.7, random_state=334)
df1_test = df1.drop(df1_train.index)

# Build the report with 'target' as the target feature.
my_report4 = sv.compare(
    [df1_train, 'Training'],
    [df1_test, 'Test'],
    target_feat='target',
)

# Write the report to disk without opening a browser.
my_report4.show_html(
    filepath='sweetviz_report04.html',
    open_browser=False,
    layout='vertical',
)
x_train_txt_label_encode[:, i-1] = labelencoder_X.fit_transform(x_train_txt_label_encode[:, i-1]) for i in range(np.shape(x_test_txt_label_encode)[1]): x_test_txt_label_encode[:, i-1] = labelencoder_X.fit_transform(x_test_txt_label_encode[:, i-1]) # Hot Encode Categorical Variables #x_train_txt_hot_encode = pd.get_dummies(data=x_train_txt_avg.iloc[:, list(range(x_train_txt_encode_split, len(x_train_txt_avg.columns)))], columns=x_train_txt_hot_encode_col) #x_train_txt_hot_encoded_col = list(x_train_txt_hot_encode.columns) #x_train_txt_hot_encode = x_train_txt_hot_encode.values #x_test_txt_hot_encode = pd.get_dummies(data=x_test_txt_avg.iloc[:, list(range(x_test_txt_encode_split, len(x_test_txt_avg.columns)))], columns=x_test_txt_hot_encode_col) #x_test_txt_hot_encoded_col = list(x_test_txt_hot_encode.columns) #x_test_txt_hot_encode = x_test_txt_hot_encode.values x_train_Imp_En = pd.concat([pd.DataFrame(x_train_num_avg), pd.DataFrame(x_train_txt_label_encode)], axis=1) # Update with Hot Encode Data if available x_test_Imp_En = pd.concat([pd.DataFrame(x_test_num_avg), pd.DataFrame(x_test_txt_label_encode)], axis=1) # Update with Hot Encode Data if available # Feature Scaling sc_x = StandardScaler() x_train_Imp_En_Fs = sc_x.fit_transform(x_train_Imp_En) x_test_Imp_En_Fs = sc_x.transform(x_test_Imp_En) sc_y = StandardScaler() y_train_Imp_En = y_train y_train_Imp_En_Fs = sc_y.fit_transform(y_train_Imp_En) # EDA Analysis with required data (update with necessary processed data based on the need) EDA = sweetviz.compare([train_data, "Train"], [test_data, "Test"], "Survived") EDA.show_html("Titanic Data EDA.html")