# Drop rows recorded before the actual session started (iMotions always begins
# logging earlier, producing leading rows with NaN in 'Mode'/'Speed').
dataFrame_Eye_Car = dataFrame_Eye_Car[dataFrame_Eye_Car['Mode'].notna()]
dataFrame_Eye_Car = dataFrame_Eye_Car[dataFrame_Eye_Car['Speed'].notna()]

# Check how many features still contain NaN values.
pd.isnull(dataFrame_Eye_Car).sum()
# Almost all Car data DON'T have NaN value
###############################################
tmpDataFrame = dataFrame_Eye_Car
msno.matrix(tmpDataFrame)

# Keep only columns whose completeness is at least 20 percent
# (nullity_filter with filter='top' keeps columns with completeness >= p).
atLeast80Data = msno.nullity_filter(tmpDataFrame, filter='top', p=0.20)
#msno.matrix(atLeast80Data, labels=True, fontsize=10)

# Show the value range of each remaining column.
# .copy() so the mean-imputation below writes into an independent frame
# instead of a view (avoids SettingWithCopyWarning / silent no-op writes).
sample_m = atLeast80Data.copy()
sample_m.describe().T.loc[:, ('count', 'mean', 'std', 'min', 'max')]

# Mean-impute every numeric column except the categorical-like flags.
impute_cols = set(sample_m.select_dtypes(include='number').columns)
# discard() (unlike remove()) does not raise if the column was filtered out above.
impute_cols.discard('Alarm')
impute_cols.discard('Mode')

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# SimpleImputer computes per-column means on a 2-D input, so one fit_transform
# over all target columns replaces the previous per-column loop.
cols = list(impute_cols)
if cols:
    sample_m[cols] = imputer.fit_transform(sample_m[cols])
#msno.matrix(sample_m, labels=True, fontsize=10)
# Just as an ordinary matplotlib heatmap shows correlations between features,
# msno.heatmap shows correlations between the *missingness* of features.
msno.heatmap(df)
plt.show()
# Because the NaN values here were generated randomly, there is no strong
# correlation between features. With systematically missing data, the heatmap
# shows which features tend to be missing together — worth trying on real data.

# 5. bar — functionally close to matplotlib's bar chart: msno plots the
# proportion of present values, while the pandas version plots raw NaN counts.
msno.bar(df, labels=True)
plt.show()
df.isnull().sum().plot(kind='bar')
plt.show()

# 6. dendrogram — tree diagram of missingness.
msno.dendrogram(df)
plt.show()
# a) The more monotonically related two features' missingness is, the closer
#    their linkage distance is to 0; here the features show no such relation.
# b) From a hierarchical-clustering view, the missing-value features fall into
#    essentially a single cluster.

# 7. nullity_filter — as "filter" suggests, a column selector: keep the top-2
# columns whose share of missing values is below 10% (completeness >= 0.9).
df1 = msno.nullity_filter(df, filter='top', p=0.9, n=2)
print(df.shape)
print(df1.shape)

# That wraps up the tour of missingno's methods; corrections welcome.
# Nullity matrix for the national rainfall index, one row per country,
# sorted by completeness.
msno.matrix(variable_slice(data, 'national_rainfall_index'),
            inline=False, sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title(
    'Missing national rainfall index data across countries and time periods \n \n \n \n'
)
# The rainfall index has been inspected; drop it from further analysis.
data = data.loc[~(data.variable == 'national_rainfall_index')]

# By country
north_america = subregion(data, 'North America')
msno.matrix(msno.nullity_sort(time_slice(north_america, '2013-2017'),
                              sort='descending').T,
            inline=False)
plt.show()
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter='bottom', p=0.1)

# By country for a single variable
geo = r'world.json'
# 1 where the value is present, 0 where it is missing.
null_data = recent['agg_to_gdp'].notnull() * 1
# Renamed from `map`, which shadowed the builtin.
world_map = folium.Map(location=[48, -102], zoom_start=2)
world_map.choropleth(
    geo_data=geo,
    data=null_data,
    columns=['country', 'agg_to_gdp'],
    key_on='feature.properties.name',
    reset=True,
    fill_color='GnBu',
    fill_opacity=1,
    line_opacity=0.2,
    # Typo fixed: 'agricltural' -> 'agricultural'
    legend_name='Missing agricultural contribution to GDP data 2013-2017')
def test_combined_cutoff_bottom(self):
    """Combining n=2 with p=0.4 on a bottom filter keeps only column A."""
    filtered = msno.nullity_filter(self.df, n=2, p=0.4, filter='bottom')
    assert filtered.equals(self.df.loc[:, ['A']])
def test_combined_cutoff_top(self):
    """Combining n=2 with p=0.7 on a top filter keeps only column C."""
    filtered = msno.nullity_filter(self.df, n=2, p=0.7, filter='top')
    assert filtered.equals(self.df.loc[:, ['C']])
def test_percentile_cutoff_top_n(self):
    """A bottom count cutoff of n=1 keeps only column A.

    NOTE(review): despite the name ("percentile"/"top"), this exercises the
    count cutoff (n=1) with filter='bottom'; consider renaming.
    """
    filtered = msno.nullity_filter(self.df, n=1, filter='bottom')
    assert filtered.equals(self.df.loc[:, ['A']])
def test_percentile_cutoff_top_p(self):
    """A top completeness cutoff of p=0.6 keeps columns B and C."""
    filtered = msno.nullity_filter(self.df, p=0.6, filter='top')
    assert filtered.equals(self.df.loc[:, ['B', 'C']])
def test_no_op(self):
    """Without a count or percentile cutoff, every filter mode is a no-op."""
    for mode in (None, 'top', 'bottom'):
        assert msno.nullity_filter(self.df, filter=mode).equals(self.df)