# In[ ]:

# Build a copy of the training frame in which the dataset's -1 placeholder is
# replaced by a real NaN, so that missingno treats those entries as missing.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_copy = train.replace(-1, np.nan)

# Next, we can use the "Missingno" package by resident Kaggler
# [Aleksey Bilogur](https://www.kaggle.com/residentmario), which is a most
# useful and convenient tool for visualising missing values in the dataset,
# so check it out.

# In[ ]:

import missingno as msno

# Nullity or missing values by columns
msno.matrix(df=train_copy.iloc[:, 2:39], figsize=(20, 14), color=(0.42, 0.1, 0.05))

# As we can see, the missing values now become much more apparent and clear
# when we visualise them, where the empty white bands (data that is missing)
# superposed on the vertical dark red bands (non-missing data) reflect the
# nullity of the data in that particular column. In this instance, we can
# observe that there are 7 features out of the 59 total features (although, as
# rightly pointed out by Justin Nafe in the comments section, there are really
# a grand total of 13 columns with missing values) that actually contain null
# values. This is due to the fact that the missingno matrix plot can only
# comfortably fit approximately 40-odd features in one plot, after which some
# columns may be excluded, and hence the remaining 6 null columns have been
# excluded. To visualize all nulls, try changing the figsize argument as well
# as tweaking how we slice the dataframe.
#
# The 7 null columns that we are able to observe are listed here:
#
# **ps_ind_05_cat | ps_reg_03 | ps_car_03_cat | ps_car_05_cat | ps_car_07_cat | ps_car_09_cat | ps_car_14**
#
# Most of the missing values occur in the columns suffixed with _cat. One
# should really take further note of the columns ps_reg_03, ps_car_03_cat and
# ps_car_05_cat. As evinced by the ratio of white to dark bands, it is very
# apparent that a big majority of values are missing from these 3 columns, and
# therefore a blanket replacement of -1 for the nulls might not be a very good
# strategy.
# **Target variable inspection**
#
# Another standard check normally conducted on the data concerns our target
# variable, which in this case is the column conveniently titled "target". The
# target value also goes by the names class, label or correct answer. It is
# used in supervised learning models together with the corresponding data (in
# our case all of our train data except the id column) to learn the function
# that best maps the data to the target, in the hope that this learned function
# generalizes and predicts well on new, unseen data.

# In[ ]:
print(draw)
draw.plot(kind='bar', color=['r', 'b'], label='Survived')
plt.legend(['0', '1'])
print(
    "The family size has a considerable impact on our outcome whether family")

# Embarked: turn empty strings into real NaN (the previous code inserted the
# literal string "NAN", which fillna() then left untouched) and fill with 'S'.
# The result is assigned back to the column instead of relying on a chained
# `inplace=True` call on the attribute accessor.
train['Embarked'] = train['Embarked'].replace("", np.nan).fillna('S')
train.Embarked.isnull().sum()  # check

train['Age'].hist(bins=10)
# Age: same treatment. np.random.randint is called once, so every missing age
# receives the same single value drawn from 20..30.
train['Age'] = train['Age'].replace("", np.nan).fillna(np.random.randint(20, 31))
train.Age.isnull().sum()
msno.matrix(train)

# Extract the title: the capitalised word that is preceded by a space and
# followed by a dot in the Name column. Raw string so that `\.` reaches the
# regex engine unchanged and does not trigger an invalid-escape warning.
train['Title'] = train.Name.apply(
    lambda x: re.search(r' ([A-Z][a-z]+)\.', x).group(1))
sns.countplot(x='Title', data=train)
plt.xticks(rotation=45)

# Merge spelling variants of the common titles
train['Title'] = train['Title'].replace({
    'Mlle': 'Miss',
    'Mme': 'Mrs',
    'Ms': 'Miss'
})
# Group the remaining listed titles into one 'Special' bucket
train['Title'] = train['Title'].replace([
    'Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
    'Countess', 'Jonkheer'
], 'Special')
def visualize(df_train, df_labels):
    """Print summaries of the training data and draw the exploratory plots.

    Args:
        df_train: feature DataFrame. The plots read the columns
            construction_year, longitude, latitude, population, gps_height,
            num_private, amount_tsh, permit, public_meeting, source_class and
            the categorical columns named in the `variables` lists below.
        df_labels: DataFrame whose 'status_group' column holds the label; it
            is concatenated column-wise onto `df_train`.

    Returns:
        None. Output goes to stdout and to matplotlib figures.
    """
    print(df_train.head().T)
    # info() prints its report itself and returns None, so it is not wrapped
    # in print().
    df_train.info()
    msno.matrix(df_train)

    # Numerical features
    print(df_train.describe())

    # Let's inspect now the categorical features
    cat_df = pd.DataFrame(columns=["Feature", "Cardinality", "% Missings"])
    total_cardinality = 0
    i = 0
    for col in df_train.columns:
        # Plain `object` replaces the np.object alias removed in NumPy 1.24.
        if df_train[col].dtype == object:
            cardinality = len(df_train[col].unique())
            cat_df.loc[i, "Feature"] = col
            cat_df.loc[i, "Cardinality"] = cardinality
            total_cardinality += cardinality
            pct_of_missing_values = float(
                (len(df_train[col]) - df_train[col].count()) /
                len(df_train[col]))
            cat_df.loc[i, "% Missings"] = pct_of_missing_values * 100
            i += 1

    print("Total cardinality of categorical features:", total_cardinality)
    print(cat_df)

    # Visualizations
    data_viz = pd.concat([df_train, df_labels['status_group']], axis=1)
    # Rows with year 0 are filtered out here; they are taken care of in the
    # Data Preparation part.
    with_year = data_viz[data_viz['construction_year'] > 0]

    # Label distribution
    plt.figure(figsize=(14, 7))
    sns.countplot(x='status_group', data=data_viz, palette="Greens_d")
    plt.show()

    # Construction year distribution
    # NOTE(review): distplot is deprecated in recent seaborn releases; kept so
    # that the function still runs on the seaborn version this file targets.
    plt.figure(figsize=(14, 7))
    sns.distplot(with_year['construction_year'])
    plt.show()

    # Water pump geographical distribution with population proportional circles
    # and year of pump color bar. Instances with year 0, longitude 0 and
    # latitude 0 are filtered out with one combined mask, and the marker sizes
    # are taken from the same filtered rows so they line up with the points.
    located = data_viz[(data_viz['longitude'] > 0)
                       & (data_viz['latitude'] < 0)
                       & (data_viz['construction_year'] > 0)]
    located.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                 s=located["population"] / 10, label="population",
                 figsize=(14, 10), c="construction_year",
                 cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
    plt.legend()
    plt.show()

    # Correlation heatmap of the numerical features (non-numeric columns are
    # excluded explicitly; newer pandas raises on them instead of dropping them)
    cor = data_viz.select_dtypes(include=['number', 'bool']).corr()
    plt.figure(figsize=(14, 13))
    sns.heatmap(cor, square=True, annot=True, cbar=False)
    plt.show()

    # Boxplot of label distribution by pump construction year
    plt.figure(figsize=(14, 7))
    sns.boxplot(x='status_group', y="construction_year", data=with_year)
    plt.show()

    # A different way of seeing this same concept, with proportions within the
    # distribution plot, using violin plots
    fig, ax = plt.subplots(figsize=(14, 12))
    ax = sns.violinplot(x='status_group', y="construction_year",
                        data=with_year, split=True)
    plt.show()

    # Mosaics of permit, public meeting and source class distribution per label
    mosaics = (
        ('permit', "Permit distribution per label"),
        ('public_meeting', "Public meeting distribution"),
        ('source_class', "Source class distribution per label"),
    )
    for column, title in mosaics:
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16, 8))
        fig = mosaic(data_viz, ['status_group', column], axes, title=title)
        plt.show()

    # Bar charts of some relevant categorical features per label
    variables = ['quantity', 'payment', 'source_type', 'waterpoint_type']
    label = 'status_group'
    plot_tables(data_viz, label, variables)
    plt.show()

    # Value distribution for some important features with low cardinality
    variables = ['basin', 'extraction_type_class', 'management',
                 'management_group', 'water_quality', 'source', 'source_class']
    plot_proportions(data_viz, variables)

    # Height distribution per label (only rows with gps_height > 0)
    plt.figure(figsize=(14, 10))
    has_height = data_viz.gps_height > 0
    for status, colour in (('functional', 'g'),
                           ('non functional', 'r'),
                           ('functional needs repair', 'y')):
        heights = data_viz.loc[(data_viz.status_group == status) & has_height,
                               'gps_height']
        p1 = sns.kdeplot(heights, shade=True, color=colour, label=status)
    plt.show()

    # Pair plot of the relevant numerical features against each other,
    # differentiating by label value
    sns.set(style="ticks")
    sns.pairplot(data_viz[['population', 'num_private', 'amount_tsh',
                           'status_group']],
                 hue="status_group", diag_kind="kde")
    plt.show()
# Survey schema (one row per question)
question = pd.read_csv('C:\\Users\\Lenovo\\Downloads\\data\\schema.csv')
print(question.shape)
print(question.tail(10))

# MultipleChoiceQuestions
mcq = pd.read_csv('C:\\Users\\Lenovo\\Downloads\\data\\multipleChoiceResponses.csv',
                  encoding="ISO-8859-1", low_memory=False)
print(mcq.shape)
print(mcq.head(10))

# nan data visualization - missingno
import missingno as msno

# Draw first, then show: plt.show() does not take the Axes as an argument
# (its only parameter is the `block` flag).
msno.matrix(mcq, figsize=(12, 5))
plt.show()

# Survey statistics
# 1. Gender
print(sns.countplot(y='GenderSelect', data=mcq))  # sns.countplot(y="column", data=<DataFrame>)

# 2. Country
con_df = pd.DataFrame(mcq['Country'].value_counts())
print(con_df)
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sn
import matplotlib.pyplot as plt

# Reading the data
data = pd.read_csv('winequality-white.csv', sep=';')

# Missing data detection
msno.matrix(data, figsize=(10, 3))

# Distribution
fig, axes = plt.subplots(nrows=2, ncols=1)
fig.set_size_inches(10, 20)
sn.boxplot(data=data, orient="v", ax=axes[0])
# `orient` only accepts an orientation, not a column name, so the previous
# "pH" value is replaced by "v" (a vertical box of the quality column).
sn.boxplot(data=data, y="quality", orient="v", ax=axes[1])

# Correlation analysis
corrMatt = data.corr()
# Boolean mask that hides the strictly upper triangle, so the heatmap shows
# the lower triangle and the diagonal.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sn.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True)
def test_freq_matrix(self):
    """Draw the nullity matrix of ``self.freq_df`` with ``freq='BQ'`` and return the current figure."""
    msno.matrix(self.freq_df, freq='BQ')
    figure = plt.gcf()
    return figure
import seaborn as sns
from statistics import mode
import matplotlib.pyplot as plt
import numpy as np
from datetime import date

trab = pd.read_excel("C:\\Users\\eduar\\Downloads\\GroupDatasets\\dataset.xlsx")

#----------------------------------------------------------------------------------------
#STEP 1
#----------------------------------------------------------------------------------------
# draw a chart that shows the missing values
import missingno as msno
msno.matrix(trab, figsize=(12, 5))

# fill the missing values of each numeric column with that column's mean.
# numeric_only=True keeps the mean from failing on non-numeric columns;
# missing values in non-numeric columns are left as they are.
trab = trab.fillna(trab.mean(numeric_only=True))

#----------------------------------------------------------------------------------------
#STEP 2
#----------------------------------------------------------------------------------------
# Creating new variables
# 1 - total purchases per customer
trab['MntTotal'] = (trab['MntAcessories'] + trab['MntClothing'] + trab['MntBags']
                    + trab['MntAthletic'] + trab['MntShoes'])
# Make sure we set the correct maximum for rating column out of range values # Isolate rows of rating > 5.0 airbnb[airbnb['rating'] > 5.0] airbnb[airbnb['rating'] > 5.0]['rating'] # Drop these rows and make sure we have effected changes airbnb.drop(airbnb[airbnb['rating'] > 5.0].index, inplace=True) # airbnb['rating'] = airbnb[airbnb['rating'] > 5.0].replace(5) # Visualize the rating column again sns.distplot(airbnb['rating'], bins=20) plt.show() # Get the maximum airbnb['rating'].max() # Dealing with missing data # Visualize the missingness msno.matrix(airbnb) plt.show() # Visualize the missingness on sorted values msno.matrix(airbnb.sort_values(by='rating')) plt.show() # Missingness barplot msno.bar(airbnb) plt.show() # Understand DataFrame with missing values in rating, number_of_stays, 5_stars, reviews_per_month airbnb[airbnb['rating'].isna()].describe() # Understand DataFrame with NO missing values in rating, number_of_stays, 5_stars, reviews_per_month airbnb[~airbnb['rating'].isna()].describe() # Impute missing data airbnb = airbnb.fillna({ 'reviews_per_month': 0,
st_time = time.time() while np.where(masks == 0)[0].shape[0] < miss_size: coordi_x = np.random.randint(0, masks.shape[1]) coordi_y = np.random.randint(0, masks.shape[0]) burst_len = np.random.randint(options.burst_min, options.burst_max) judge_res = (masks[coordi_y:coordi_y + burst_len, coordi_x] == [1]) if judge_res.all() == True: #如果都是1,即都没有缺失或人工缺失 data_noisy[coordi_y:coordi_y + burst_len, coordi_x] = np.nan masks[coordi_y:coordi_y + burst_len, coordi_x] = 0 np.save('./coalmill-mask/mask_{}.npy'.format(name_list[j]), masks) print('Save mask success.') en_time = time.time() print('Successful masking, time cosumed: {:.2f}s'.format(en_time - st_time)) msno.matrix(pd.DataFrame(data_noisy[:5000, :]), labels=False) plt.savefig('visual/matrix_{}.pdf'.format(name_list[j]), dpi=300, bbox_inches='tight') msno.matrix(pd.DataFrame(data_ground[:5000, :]), labels=False) plt.savefig('visual/matrix_{}_origin.pdf'.format(name_list[j]), dpi=300, bbox_inches='tight') # 储存最终结果 list_final = [] data_noisy = scaler.transform(data_noisy) data_ground = scaler.transform(data_ground) #### 减少数据量 small_or_not = 'medium' # ? normal, medium, small if small_or_not == 'small': # ! 为了测试专用,加快加载速度
# In[ ]: DimDf.head(20) # In[ ]: print(DimDf.dtypes) # In[ ]: pivot_ui(DimDf) # In[ ]: get_ipython().run_line_magic('matplotlib', 'inline') msno.matrix(Df) # In[288]: sns.pairplot(Df) # In[ ]: sns.pairplot(Df, hue="Day") # In[ ]: sns.pairplot(DimDf, hue="DeviceName") # # Feature selection methods ( based on importance for ML)
#Missing data part
print("Number of missing values per feature")

# Share of missing entries per feature column, in column order
#if is_string_dtype(df_dig[col]):
missingValueShare = [
    features[col].isna().sum() / numSamples for col in features.columns
]

#Print missing value graph
vis.paintBarChartForMissingValues(features.columns, missingValueShare)

# In[30]:

#Visualize missing data with missingno
import missingno as msno
get_ipython().run_line_magic('matplotlib', 'inline')
msno.matrix(features)

# In[31]:

# The nullity heatmap is only drawn when at least one value is missing
if features.isnull().values.any():
    msno.heatmap(features)

# #### View Prepared Binary Features
#
# We need some more plots for the binary data types.

# In[32]:

#vis.plotBinaryValues(df_dig, df_dig.columns) #0:-1
#plt.savefig(image_save_directory + "/BinaryFeatures.png", dpi=70)
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

# Toy columns of 20 values each: column1 is complete, column2 and column3
# contain NaN entries
dictionary = {
    "column1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "column2": [1, 2, 3, 4, np.nan, 6, 7, 8, np.nan, 10, np.nan, 12, 13, 14, 15, 16, np.nan, 18, np.nan, 20],
    "column3": [1, 2, 3, 4, np.nan, 6, 7, 8, 9, 10, 11, 12, 13, np.nan, 15, 16, 17, 18, np.nan, 20],
}

# Turn the dictionary into a data frame
data_missingno = pd.DataFrame(dictionary)

# missingno nullity matrix
import missingno as msno
msno.matrix(data_missingno)
plt.show()

# missingno bar plot
msno.bar(data_missingno)
plt.show()

# Load the iris data and discard its Id column
data = pd.read_csv('../input/Iris.csv')
data = data.drop(columns=['Id'])

# Parallel-coordinates plot with one line colour per species
plt.figure(figsize=(15, 10))
parallel_coordinates(data, 'Species', colormap=plt.get_cmap("Set1"))
plt.title("Iris data class visualization according to features (setosa, versicolor, virginica)")
plt.xlabel("Features of data set")
plt.ylabel("cm")
plt.savefig('graph.png')
# Columns shown in the preview below
columns = [
    "Location",
    "name",
    "Date",
    "Result",
    "Belligerents.allies",
    "Belligerents.axis",
    "Casualties and losses.allies",
    "Casualties and losses.axis",
]

# %%
battles[columns].head(3)

# %%
msno.matrix(battles, labels=True, sparkline=False)

# %%
# Rows in which both Date and Location are missing
mask = battles[["Date", "Location"]].isna().all(axis=1)

# %%
print(battles.loc[mask, ["name", "url"]].to_string())

# %%
# Keep only rows that have both a Date and a Location
battles = battles.dropna(subset=["Date", "Location"])

# %%
# Two capture groups of digits/dots (the class also admits '|'), separated by
# "; " and preceded by "/ "
pattern = r"/ ([\d|\.]+); ([\d|\.]+)"

# %%
battles["Location"].head(10).str.extract(pattern)
import pandas as pd
import matplotlib.pyplot as plt  #plot data
import seaborn as sns  #plot data
import missingno as ms  #plot missing data
"""2. Data Cleaning"""

# Candidate data sources; only url2 is loaded below
url = 'https://raw.githubusercontent.com/nachi-hebbar/Forest-Fire-Prediction-Website/master/Forest_fire.csv'
url1 = 'https://raw.githubusercontent.com/hiyabose/Depression/master/depressed.csv'
url2 = 'https://raw.githubusercontent.com/hiyabose/Depression/master/newsurvey.csv'

df = pd.read_csv(url2)

# Quick look at the frame: first rows, dtypes/non-null counts, nullity matrix
df.head()
df.info()
ms.matrix(df)

# Summary statistics and size
df.max()
df.describe()
df.shape

# Swarm plot of Age per value of the " Risk" column
# NOTE(review): the column name is written with a leading space — confirm it
# matches the CSV header.
sns.swarmplot(y="Age", x=" Risk", data=df)
plt.show()
"""Here also we can see that the majority are depressed in their mid life."""

from sklearn.linear_model import LogisticRegression
from sklearn import svm
def test_no_sparkline_matrix(self):
    """Draw the nullity matrix of ``self.simple_df`` with ``sparkline=False`` and return the current figure."""
    msno.matrix(self.simple_df, sparkline=False)
    figure = plt.gcf()
    return figure
def _export_current_plot(stem, *stale_figures):
    """Save the current figure as Plots/<stem>.pdf, close figures, open the PDF.

    Args:
        stem: file name without directory and without the ".pdf" extension.
        stale_figures: additional figures to close after saving (for example
            an unused figure created before a library opened its own).
    """
    plt.savefig('Plots/' + stem + '.pdf')
    plt.close()  # closes the figure that was just written
    for figure in stale_figures:
        plt.close(figure)
    # The backslash is written as an explicit escape; the previous literals
    # such as 'Plots\hist.pdf' relied on invalid escape sequences.
    os.startfile('Plots\\' + stem + '.pdf')


def plot_us():
    """Draw the plot selected in the GUI, export it to PDF and open the file.

    Reads the module-level widgets/variables `type_combo`, `var_x`, `var_y`,
    `var_by`, `combo_palette` and the DataFrame `data`. Rows containing
    missing values are dropped for every plot except the three
    missing-data plots, which use `data` itself. Nothing is returned.
    """
    fig, ax = plt.subplots(1, 1)
    by = var_by.get()
    if by == 'None':
        by = None
    data_dropped_na = data.dropna()
    plot_type = type_combo.get()

    if plot_type == 'Histogram':
        g = sns.distplot(data_dropped_na[var_x.get()], rug=True,
                         rug_kws={'color': '#777777', 'alpha': 0.2},
                         hist_kws={'edgecolor': 'black', 'color': '#6899e8',
                                   'label': 'розподіл'},
                         kde_kws={'color': 'black', 'alpha': 0.2,
                                  'label': 'ядрова оцінка густини'})
        sns.despine(left=True, bottom=True)  # remove the axes entirely
        g.set_xlabel(var_x.get(), color='black', fontsize=15, alpha=0.5)
        g.set_ylabel('Густина', color='black', fontsize=15, alpha=0.5)
        plt.legend(loc='upper right')
        _export_current_plot('hist', fig)
        return

    if plot_type == 'Scatter plot':
        # jointplot opens its own figure; the one created above stays empty
        # and is closed by the export helper.
        a = sns.jointplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                          kind='reg', color='#5394d6',
                          annot_kws={'fontsize': 14, 'loc': [-0.1, 0.85]},
                          marginal_kws={'rug': True, 'bins': 25,
                                        'hist_kws': {'edgecolor': 'black'}},
                          joint_kws={'scatter_kws': {'alpha': 0.7}})
        plt.setp(a.ax_marg_x.patches, linewidth=1.0, color='#a9c8e8')
        plt.setp(a.ax_marg_y.patches, linewidth=1.0, color='#a9c8e8')
        a.ax_joint.set_xlabel(var_x.get(), fontsize=15, alpha=0.7)
        a.ax_joint.set_ylabel(var_y.get(), fontsize=15, alpha=0.7)
        _export_current_plot('scatter', fig)
        return

    if plot_type == 'Bar plot':
        ax = sns.barplot(x=var_x.get(), y=var_y.get(), hue=by,
                         data=data_dropped_na, palette=combo_palette.get(),
                         errcolor='0.4', errwidth=1.1)
        ax.set_ylabel('Середнє значення ' + var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc=[0.8, 0.9])
        sns.despine()
        _export_current_plot('barplot', fig)
        return

    if plot_type == 'Count bar':
        ax = sns.countplot(x=var_x.get(), hue=by, data=data_dropped_na,
                           palette=combo_palette.get())
        ax.set_ylabel('Кількість', color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc=[0.8, 0.9])
        sns.despine()
        _export_current_plot('countbar', fig)
        return

    if plot_type == 'Boxplot':
        # x/y are passed by keyword; recent seaborn no longer accepts them
        # positionally.
        ax = sns.boxplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                         hue=by, width=0.4, palette=combo_palette.get())
        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc='upper right')
        sns.despine()
        _export_current_plot('Boxplot', fig)
        return

    if plot_type == 'Violin plot':
        ax = sns.violinplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                            hue=by, scale='count', split=True,
                            palette=combo_palette.get())
        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc='upper right')
        sns.despine()
        _export_current_plot('violin', fig)
        return

    if plot_type == 'Beeswarm plot':
        ax = sns.swarmplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                           hue=by, alpha=0.7, palette=combo_palette.get())
        # Overlay a short horizontal line at the mean of each x category
        mean_width = .5
        for tick, text in zip(ax.get_xticks(), ax.get_xticklabels()):
            sample_name = text.get_text()
            in_sample = data_dropped_na[var_x.get()] == sample_name
            mean_val = data_dropped_na[in_sample][var_y.get()].mean()
            ax.plot([tick - mean_width / 2, tick + mean_width / 2],
                    [mean_val, mean_val], lw=2, color='#777777')
        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        sns.despine()
        _export_current_plot('beeswarm', fig)
        return

    if plot_type in ('Missing data with matrix', 'Missing data with bars'):
        # Larger canvas for wide frames; at most 500 rows are plotted
        figsize = (30, 27) if len(data.columns) > 10 else (25, 10)
        shown = data if len(data) < 500 else data.sample(500)
        # NOTE(review): `inline=False` is kept from the original call; confirm
        # the installed missingno version still accepts it.
        if plot_type == 'Missing data with matrix':
            ax = missingno.matrix(shown, inline=False, figsize=figsize)
            _export_current_plot('missing matrix', fig)
        else:
            ax = missingno.bar(shown, inline=False, figsize=figsize)
            _export_current_plot('missing bars', fig)
        return

    if plot_type == 'Missing data correlations':
        ax = missingno.heatmap(data, inline=False, figsize=(25, 25))
        _export_current_plot('missing correlations', fig)
        return

    # Unknown plot type: nothing was drawn, so release the empty figure
    plt.close(fig)
def test_color_matrix(self):
    """Draw the nullity matrix of ``self.simple_df`` in a custom RGB colour and return the current figure."""
    rgb = (70 / 255, 130 / 255, 180 / 255)
    msno.matrix(self.simple_df, color=rgb)
    return plt.gcf()
""" Created on Sat May 9 19:49:30 2020 @author: Surraj """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import missingno as msno test = pd.read_csv('test.csv') df = pd.read_csv('train.csv') msno.matrix(df, figsize=(12, 5)) msno.matrix(test, figsize=(12, 5)) def null(df): null_value = df.isnull().sum() per_null = 100 * df.isnull().sum() / len(df) unique = pd.DataFrame(columns=['unique']) for i in df.columns: nunique = df[i].nunique() unique.loc[i] = [nunique] miss_val = pd.concat([null_value, per_null, unique], 1) miss_val_rename = miss_val.rename(columns={
# Drop a column df.drop("PassengerId", axis = 1) # Drop columns where there are greater than 10 missing values df.dropna(axis = 1, thresh = len(df)-10) # Compute number of missing values in a column df['Cabin'].isnull().sum() # Return all rows where 'Cabin' has a value (i.e. non-null) df[df['Cabin'].notnull()] # Use missingno msno.matrix(df); plt.show()
# # nullity analysis # In[ ]: import missingno as msno # In[ ]: msno.bar(dftrain.sample(890)) # # now nultity correlation wehave to see between age , cabin and embarked # In[ ]: msno.matrix(dftrain) # In[ ]: msno.heatmap(dftrain) # In[ ]: msno.dendrogram(dftrain) # # our finding says that when cabin and age values will come and will be null together where as in case of emabarked it is reverse # # from this we are concluding a fact that only 38.8 % people survived , and even most young people died in this disaster about age of 30 # # Now a pie chart percentage of Categories of people travelling survived
{col: 'max' for col in data_cols})) players = get_subgroup(data, player_index, player_cols) # 球员和裁判的关系 dyad_index = ['refNum', 'playerShort'] dyad_cols = [ 'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', ] dyads = get_subgroup(data, dyad_index, dyad_cols) # 3、对于缺失值数据的处理 msno.matrix(players.sample(1000), labels=True) # 无效数据密度显示 msno.bar(players.sample(1000)) # 条形图显示 msno.heatmap(players.sample(1000)) # 热图相关性显示 msno.dendrogram(players.sample(1000)) # 树状图显示 players['rater1'] = players[['rater1'].notnull()] players['rater2'] = players[['rater2'].notnull()]
def main():
    """Write Streamlit commands here to display text and data in the app.

    Replace the code within this function with your own data workflow and UI.
    The example below shows a sample of the dataset chosen in the sidebar and
    offers three buttons: a profiling report, missing-value plots, and a run
    of the 'e2e_pipeline' Prefect flow whose results are then displayed.

    Streamlit API reference:
    https://docs.streamlit.io/en/stable/api.html
    """
    # Configures the default settings
    st.set_page_config(page_title='datathon-starter',
                       page_icon='🛠️',
                       layout='wide')

    # Page title and header
    st.title('🛠️📊')
    st.title('Starter code for data applications')
    st.subheader('MIT License')
    st.markdown("""
---

🙌 Build your own data app

Modify pre-existing code and implement empty functions:\n
1. Data tasks are found in `server/tasks.py`
2. Data workflows are found in `server/pipeline.py`
3. The Streamlit app's UI code is found in `app.py`

---

🚀 Try a quick example

From the sidebar *(click on > if closed)*:\n
1. Select a dataset
2. Select all categorical variables in the multiselect widget
3. Select an endogenous variable in the chosen dataset

From the main UI below:\n
4. Press the "Run workflow" button

---
""")

    # Example app
    params = sidebar()  # Display sidebar in Streamlit app
    # Drop `data` and return its value
    data = params.pop('data')
    # Drop dataset `item` code and return its value
    item = params.pop('item')
    title = DATASET_TITLES[item]
    st.subheader(f'{title}')
    st.text('A random sample of 5 rows:')
    # Display random sample as a static table; capped at the number of rows so
    # that frames with fewer than 5 rows do not make sample() fail
    st.table(data.sample(min(5, len(data))))

    # Column container for buttons. `st.columns` replaced `st.beta_columns`;
    # fall back to the beta name on Streamlit versions that predate it.
    make_columns = getattr(st, 'columns', None) or st.beta_columns
    col1, col2, col3 = make_columns(3)

    # Data profiling
    if col1.button('🔬 Data profiling report'):
        profile_report = ProfileReport(data, explorative=True)
        st_profile_report(profile_report)

    # Missing value analysis
    if col2.button('🔎 Missing value plots'):
        # Check if there are any missing values
        if pd.notna(data).all().all():
            st.warning('No missing values in dataset')
        else:
            fig1 = msno.matrix(data).get_figure()
            st.pyplot(fig1)
            fig2 = msno.heatmap(data).get_figure()
            st.pyplot(fig2)
            fig3 = msno.dendrogram(data).get_figure()
            st.pyplot(fig3)

    # Run data workflow
    if col3.button('✨ Run workflow!'):
        st.write('---')
        # Stop execution until a valid endogenous variable is selected
        if not params.get('endog'):
            st.warning('Please select an endogenous variable')
            st.stop()

        flow_name = 'e2e_pipeline'
        project_name = 'datathon-starter'
        task_refs = ['wrangle_na']
        # Parameters forwarded to the flow, taken from the sidebar selection
        flow_params = {
            'url': params.get('url'),
            'sep': params.get('sep'),
            'strategy': params.get('na_strategy')
        }
        results, state_msg = create_prefect_flow_run(flow_name, project_name,
                                                     task_refs, flow_params)

        # Check if all tasks were successfully executed
        if 'fail' in state_msg:
            # List of each state's (name, state message) in the workflow
            st.warning(state_msg)
            # The link carries an explicit scheme; without it Markdown treats
            # the target as a path relative to the app
            st.info('Please view the Flow logs on the Prefect Server\'s'
                    ' [UI](http://localhost:8080).')
        # If all tasks were successfully executed
        else:
            # Unpack results
            preprocessed_data, conf_int_chart = results
            # Success!
            st.balloons()
            st.success(state_msg)
            # Retrieve results from prefect flow run
            st.subheader('Pre-processed Data')
            st.dataframe(preprocessed_data)
            st.subheader('Regression Results')
            st.text('Dot and whisker plot of coefficients'
                    ' and their confidence intervals:')
            # Plot regression coefficient's confidence intervals
            st.altair_chart(conf_int_chart, use_container_width=True)
#Préparation des données #Chargement des packages import numpy as np import matplotlib.pyplot as plt import pandas as pd import missingno as msno #Chargements des données dataset = pd.read_csv("hcvdat0.csv") #Visualisation des données manquantes msno.matrix(dataset) x = dataset.iloc[:, 2:].values y = dataset.iloc[:, 1].values #Traitements des données manquantes from sklearn.impute import SimpleImputer imputer = SimpleImputer(missing_values=np.nan, strategy="mean") imputer = imputer.fit(x[:, 2:-1]) x[:, 2:-1] = imputer.transform(x[:, 2:-1]) #concatener les deux tableaux pour généré un fichier csv avec les donées complétées z = np.c_[y, x] #Généré fichier csv entetes = [ u'Category', u'Age', u'Sex', u'ALB', u'ALP', u'ALT', u'AST', u'BIL', u'CHE', u'CHOL', u'CREA', u'GGT', u'PROT'
# > Values of -1 indicate that the feature was missing from the observation.
#
# So, we need to find the null values by looking for the value -1.

# ## 2.2 Find Null data

# We need to find the features that contain null data.<br>
# reference: [Anisotropic's work](https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial)

# In[ ]:

import missingno as msno

# replace() returns a new frame, so `train` itself is left untouched.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_null = train.replace(-1, np.nan)
msno.matrix(df=train_null, figsize=(20, 14), color=(0.8, 0.5, 0.2))

# In[ ]:

test_null = test.replace(-1, np.nan)
msno.matrix(df=test_null, figsize=(20, 14), color=(0.8, 0.5, 0.2))

# In[ ]:

# Extract columns with null data
train_null = train_null.loc[:, train_null.isnull().any()]
test_null = test_null.loc[:, test_null.isnull().any()]
print(train_null.columns)
"""## 1.2 ) Reading the data from a CSV file""" df = pd.read_csv(r'WA_Fn-UseC_-HR-Employee-Attrition.csv') df.head() df.shape df.columns """## 1.3 ) Missing Values Treatment""" df.info() # no null or Nan values. df.isnull().sum() msno.matrix(df) # just to visualize. """## 1.4 ) The Features and the 'Target'""" df.columns df.head() """## 1.5 ) Univariate Analysis""" df.describe() """Let us first analyze the various numeric features. To do this we can actually plot a boxplot showing all the numeric features.""" sns.factorplot(data=df, kind='box', size=10, aspect=3) """Note that all the features have pretty different scales and so plotting a boxplot is not a good idea. Instead what we can do is plot histograms of various continuously distributed features. > We can also plot a kdeplot showing the distribution of the feature. Below I have plotted a kdeplot for the 'Age' feature. Similarly we plot for other numeric features also. Similarly we can also use a distplot from seaborn library.
# data['REGION_CODE'] = data['REGION_CODE'].astype(float)
# Convert REGION_CODE to numbers; entries that cannot be parsed become NaN
data['REGION_CODE'] = pd.to_numeric(data['REGION_CODE'], errors='coerce')
# F:/ML_Project_April_2020/SD_Sales_Predict_ML_Projects/Sales_SD_Sample_ML_Projects/Month_Sales_JasonBrownie_Dataset.csv
# , header=0, index_col=['BILLING_DATE']
print('Data Shape')
print('\n-----------------')
# info() is called (it prints its own report); printing the attribute
# `data.info` only showed the bound-method representation
data.info()
print(data.head(10))
print('Shape:', data.shape)
print('\nAnalyzing missing Values in Dataset')
print('\n-------------------------------------')

# Visualize missing values as a matrix
msno.matrix(data)
# Visualize the number of missing values as a bar chart
msno.bar(data)
# Visualize the correlation between the number of missing values in different columns as a heatmap
msno.heatmap(data)

# fill missing values with mean column values; the means are restricted to
# numeric columns so that text or date columns do not make mean() fail
data.fillna(data.mean(numeric_only=True), inplace=True)

# count the number of NaN values in each column
print('\nSummary on Null Values')
print('\n----------------------------')
print(data.isnull().sum())
data.head(20)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
import category_encoders as ce
import lightgbm as lgbm
import re

# Load the train and test sets from a local checkout of the competition data
train = pd.read_csv(
    "C:/Users/10188/local_git/tabular-playground-series-apr-2021/train.csv")
test = pd.read_csv(
    "C:/Users/10188/local_git/tabular-playground-series-apr-2021/test.csv")

# First look: head, dtypes/non-null counts, summary statistics
train.head()
train.info()
train.describe()

# Missing-value overview
msno.matrix(train)
msno.bar(train)  #Age, Ticket, Fare, Cabin, Embarked have null
msno.bar(test)  #Age, Ticket, Fare, Cabin(to drop), Embarked have null

# Column groups used for the later preprocessing
numeric_v = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
string_v = ['Survived', 'Sex', 'Embarked', 'Ticket', 'Cabin', 'Name']
delete_v = ['PassengerId']

##################################################################
# - > try using name (family name), ticket (leading characters), cabin (a, b, ...): Name is not meaningful; use ticket and cabin
# - > try using famsize and name freq as categorical variables & model men and women separately
##################################################################
# label encoding
def missing_value_vis(df):
    """Draw the missingno nullity matrix of ``df`` and return the result of ``mn.matrix``."""
    nullity_plot = mn.matrix(df)
    # Drop this function's own reference to the frame, then run a garbage
    # collection pass before handing the plot back.
    del df
    gc.collect()
    return nullity_plot
# # atemp - "feels like" temperature in Celsius
# # humidity - relative humidity
# # windspeed - wind speed
# # casual - number of non-registered user rentals initiated
# # registered - number of registered user rentals initiated */

# In[90]:

# Missing values per column, then the nullity matrix
df.isnull().sum()
import missingno as msno
msno.matrix(df)

# In[93]:

df.season.value_counts()

# In[94]:

df.weather.value_counts()

# In[95]:

# Count of rows per season. catplot is the current name of factorplot, which
# has been removed from recent seaborn releases.
sns.catplot(x='season', data=df, kind='count')

# In[96]:
# In[ ]:

#TitanicSubmission.head()
TitanicTrain.info()
TitanicTrain.describe()

# # 2 | Data Analysis and Visualisation
#
# **2.1 Missing values**

# In[ ]:

msno.matrix(TitanicTrain)
TitanicTrain.isna().sum()
#msno.bar(TitanicTrain)
#msno.heatmap(TitanicTrain)

# It looks like there are a lot of missing values for Age and Cabin and only 2 for Embarked.
# This is useful to know: imputing these missing values later may improve our prediction model.
#
# **2.2 Individual features**

# In[ ]:

# Names of the numeric columns, leaving out PassengerId, Age and Fare
numeric_frame = TitanicTrain.select_dtypes(include=[np.number])
columns = numeric_frame.drop(columns=['PassengerId', 'Age', 'Fare']).columns.tolist()
def test_simple_matrix(self):
    """Draw the nullity matrix of ``self.simple_df`` with default options and return the current figure."""
    msno.matrix(self.simple_df)
    figure = plt.gcf()
    return figure
## Closeness Centrality
#draw_graph(G, pos, nx.closeness_centrality(G, distance='weight'), 'Closeness Centrality')
draw_graph(G, pos,
           nx.closeness_centrality(G, distance=None, wf_improved=True),
           'closeness Centrality',
           './figures/closeness_credit_transaction.png')

# PART 2: IN THIS PART WE WILL BE WORKING WITH REAL-WORLD DATA. WE WILL VISUALIZE THE CREDIT MOVEMENTS
# OF MTN-BENIN FOR THE SOLE PURPOSE OF IDENTIFYING SUSPICIOUS LINKS.
import missingno as mn

# Read in the data we will need to build the network
df = pd.read_csv("./Data/CDRtestdata.csv")
df.head()  # We begin by taking a look at the first five rows of the data
mn.matrix(df)  # Visualize missing values in each column of the dataset

# One directed edge per row, from the value in the first column to the value
# in the second. `.iloc` makes the positional access explicit; plain `row[0]`
# on a label-indexed Series is deprecated.
# NOTE(review): with networkx 2+ the `attr_dict=` keyword stores the whole
# dict under a single edge attribute called 'attr_dict' — confirm that this is
# what the downstream code expects.
Graph = nx.DiGraph()
for i, elrow in df.iterrows():
    Graph.add_edge(elrow.iloc[0], elrow.iloc[1], attr_dict=elrow.to_dict())

# Here our program assigns color to nodes based on their type.
# The type recorded in the first row where a partner appears as sender
# (respectively receiver) is looked up once, instead of rebuilding the id
# lists for every node.
first_sender_type = {}
for partner, rs_type in zip(df.ers_from_partner_id, df.ers_sender_rs_type):
    first_sender_type.setdefault(partner, rs_type)
first_receiver_type = {}
for partner, rs_type in zip(df.ers_to_partner_id, df.ers_receiver_rs_type):
    first_receiver_type.setdefault(partner, rs_type)

node_col = []
NodeSet = list(Graph.nodes())
for node in NodeSet:
    if (first_sender_type.get(node) == 'SC'
            or first_receiver_type.get(node) == 'SC'):
        node_col.append('red')
def test_width_ratios_matrix(self):
    """Draw the nullity matrix of ``self.simple_df`` with ``width_ratios=(30, 1)`` and return the current figure."""
    ratios = (30, 1)
    msno.matrix(self.simple_df, width_ratios=ratios)
    return plt.gcf()
#Replace NaN values in 'tot_cur_bal' column with median
curbal_median = data_df['tot_cur_bal'].median()
data_df['tot_cur_bal'] = data_df['tot_cur_bal'].fillna(curbal_median)

#Replace NaN values in 'total_rev_hi_lim' column with median
revlimit_median = data_df['total_rev_hi_lim'].median()
data_df['total_rev_hi_lim'] = data_df['total_rev_hi_lim'].fillna(revlimit_median)

#Bad Customer Definition: 1 when the loan status is one of the listed values
# NOTE(review): these two statements write to `df`, while every other
# statement here uses `data_df` — confirm which frame is intended.
df['BadLoan'] = np.where(
    np.isin(data_df['loan_status'], [
        'Charged Off', 'Default', 'Late (31-120 days)', 'In Grace Period',
        'Late (16-30 days)',
        'Does not meet the credit policy. Status:Charged Off'
    ]), 1, 0)
df.drop(['loan_status'], axis=1, inplace=True)

#Lets see if there are any missing values left
# The size is passed to missingno directly: it opens its own figure, so a
# preceding plt.figure(figsize=...) only left an empty figure behind.
msno.matrix(data_df, figsize=(16, 6), labels=True, color=(0.2, 0.15, 0.45))

#Correlation Matrix for new dataset
fig, ax = plt.subplots(figsize=(8, 8))
# Only numeric/boolean columns take part in the correlation
corr = data_df.select_dtypes(include=['number', 'bool']).corr()
# Hide the upper triangle including the diagonal. Plain `bool` replaces the
# np.bool alias removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, square=False, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)

#Lets see how the interest rate varies by grade
data_df.boxplot(column='int_rate', by='grade', rot=90)

#Lets assign input and output values to the data
y = data_df['int_rate'].values
def test_fontsize_matrix(self):
    """Draw the nullity matrix of ``self.simple_df`` with ``fontsize=8`` and return the current figure."""
    msno.matrix(self.simple_df, fontsize=8)
    figure = plt.gcf()
    return figure
# NOTE(review): `Imputer` is no longer available in recent scikit-learn
# releases (SimpleImputer in sklearn.impute replaced it) — confirm the
# installed version before running.
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV, LassoLarsIC
import os
#%%
# Retrieve current working directory (`cwd`)
cwd = os.getcwd()
cwd
# Change directory
os.chdir(r"C:\Users\yehadji\Documents\MCS\MCS 02\Arrhythmia Data Set")
#%%
# Load the data set; '?' entries are read as missing values
df_original = pd.read_csv(r"C:\Users\yehadji\Documents\MCS\MCS 02\Arrhythmia Data Set\arrhythmia.csv", na_values=['?'], delimiter = ";")
#%%
# Names of the columns that contain at least one missing value
missingdata = df_original.columns[df_original.isnull().any()].tolist()
# Each plot below is restricted to those columns and saved from the current figure.
# NOTE(review): the `.plot()` calls pass no data, so they add nothing to the
# plots — presumably left over; confirm before removing.
fig1 = msno.matrix(df_original[missingdata], figsize=(30,20)) #nullity matrix
fig1.plot()
plt.savefig('missing1.jpg')
#
fig2 = msno.bar(df_original[missingdata], color="blue", log=True, figsize=(30,20))#bar chart visualization of the data nullity
fig2.plot()
plt.savefig('missing2.jpg')
#%
fig3 = msno.heatmap(df_original[missingdata], figsize=(30,20)) #correlation heatmap
fig3.plot()
plt.savefig('missing3.jpg')
#%%
#%%
def test_large_matrix(self):
    """Draw the nullity matrix of ``self.large_df`` and return the current figure."""
    msno.matrix(self.large_df)
    figure = plt.gcf()
    return figure
# Download the processed data. A timeout keeps the cell from hanging, and
# raise_for_status() stops on an HTTP error instead of parsing an error page
# as CSV.
response = requests.get(PROCESSED_DATA_URL, timeout=60)
response.raise_for_status()
s = response.content
immigration_df = pd.read_csv(io.StringIO(s.decode('utf-8')))

# ## A general exploration of the immigration data

# In[21]:

immigration_df.head()

# In[31]:

# Build the profiling report for the whole frame
immigration_report = pandas_profiling.ProfileReport(immigration_df)

# In[32]:

immigration_report.to_file('immigration_data_exploration_report.html')

# In[30]:

msno.matrix(immigration_df)

# In[ ]: