예제 #1
0
# Drop the rows recorded before the actual session began: iMotions always
# starts logging early, so those rows have NaN in 'Mode' and/or 'Speed'.
dataFrame_Eye_Car = dataFrame_Eye_Car.dropna(subset=['Mode', 'Speed'])

# Count the NaN values per feature (notebook-style inspection: the value is
# only displayed when run interactively).
pd.isnull(dataFrame_Eye_Car).sum()
# Almost all car features have no NaN values.

###############################################
tmpDataFrame = dataFrame_Eye_Car
msno.matrix(tmpDataFrame)

# Keep the columns that are at least 20 percent populated (p=0.20).
# NOTE: despite its name, `atLeast80Data` is built with a 20 percent threshold.
# The explicit copy makes the imputation below write into an independent frame
# instead of a slice of `tmpDataFrame` (avoids pandas' SettingWithCopyWarning).
atLeast80Data = msno.nullity_filter(tmpDataFrame, filter='top', p=0.20).copy()
#msno.matrix(atLeast80Data, labels=True, fontsize=10)

# Show the value range of each column (notebook-style inspection).
sample_m = atLeast80Data
sample_m.describe().T.loc[:, ['count', 'mean', 'std', 'min', 'max']]

# Impute every numeric column except 'Alarm' and 'Mode'. Building the list
# from the column index keeps a deterministic order and does not raise when
# one of the excluded names is not among the numeric columns.
impute_cols = [
    col for col in sample_m.select_dtypes(include='number').columns
    if col not in ('Alarm', 'Mode')
]
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# The "mean" strategy works column by column, so a single call over all the
# selected columns fills each one with its own mean. Skip the call when there
# is nothing to impute, because fitting on zero columns is an error.
if impute_cols:
    sample_m[impute_cols] = imputer.fit_transform(sample_m[impute_cols])

#msno.matrix(sample_m, labels=True, fontsize=10)
예제 #2
0
在正常的matplotlib中使用热力图看特征之间的相关性,
这里也同样的,使用heatmap看缺失特征之间的相关性
代码同样非常简单,如下所示'''
msno.heatmap(df)
plt.show()
# The NaN values here were generated at random, so the features show no strong
# correlation. With systematically missing values the heatmap reveals which
# features influence each other and go missing together; interested readers
# can try it on real data.

# 5. bar
# The bar chart is functionally close to the plain matplotlib one:
# msno plots proportions, while the DataFrame plot uses counts.
msno.bar(df, labels=True)
plt.show()

df.isna().sum().plot.bar()
plt.show()

# 6. dendrogram
# Tree diagram.
msno.dendrogram(df)
plt.show()
# Reading the dendrogram:
# a. the more monotonic the features are, the more correlated they are and the
#    closer their distance is to 0; in this plot the features are unrelated.
# b. from a hierarchical-clustering point of view, the features with missing
#    values here essentially fall into a single cluster.

# 7. nullity_filter
# As the name suggests, this method filters columns: select the top 2 features
# whose share of missing values is below 10%.
df1 = msno.nullity_filter(df, n=2, p=0.9, filter='top')
print(df.shape, df1.shape, sep='\n')
# That concludes this introduction to missingno's methods.
# My knowledge is limited, so corrections are welcome.
예제 #3
0
# Nullity matrix of the national rainfall index (rows sorted in descending
# order), labelled by time period and country.
msno.matrix(variable_slice(data, 'national_rainfall_index'),
            inline=False,
            sort='descending')
plt.xlabel('Time period')
plt.ylabel('Country')
plt.title(
    'Missing national rainfall index data across countries and time periods \n \n \n \n'
)

# Remove the rainfall index rows from the working data set.
data = data.loc[~(data.variable == 'national_rainfall_index')]

# By country: nullity matrix of the North America subregion for 2013-2017,
# sorted in descending order and transposed before plotting.
north_america = subregion(data, 'North America')
msno.matrix(msno.nullity_sort(time_slice(north_america, '2013-2017'),
                              sort='descending').T,
            inline=False)
plt.show()

# Columns of the transposed Bahamas slice that are at most 10 percent
# populated (notebook-style inspection: the result is not stored).
msno.nullity_filter(country_slice(data, 'Bahamas').T, filter='bottom', p=0.1)

# By country for a single variable: draw a world choropleth of where
# 'agg_to_gdp' is available.
geo = r'world.json'

# 1 where 'agg_to_gdp' is present, 0 where it is missing.
null_data = recent['agg_to_gdp'].notnull() * 1

# NOTE(review): `map` shadows the builtin of the same name; the name is kept
# because code outside this section may refer to it.
map = folium.Map(location=[48, -102], zoom_start=2)

# NOTE(review): newer folium releases replace `Map.choropleth` with
# `folium.Choropleth(...).add_to(map)` -- confirm against the installed version
# before upgrading.
map.choropleth(
    geo_data=geo,
    data=null_data,
    columns=['country', 'agg_to_gdp'],
    key_on='feature.properties.name',
    reset=True,
    fill_color='GnBu',
    fill_opacity=1,
    line_opacity=0.2,
    # Fixed the spelling of "agricultural" in the user-visible legend.
    legend_name='Missing agricultural contribution to GDP data 2013-2017')
예제 #4
0
 def test_combined_cutoff_bottom(self):
     """Bottom filter with both ``p=0.4`` and ``n=2`` leaves only column 'A'."""
     wanted = self.df[['A']]
     filtered = msno.nullity_filter(self.df, filter='bottom', p=0.4, n=2)
     assert filtered.equals(wanted)
예제 #5
0
 def test_combined_cutoff_top(self):
     """Top filter with both ``p=0.7`` and ``n=2`` leaves only column 'C'."""
     wanted = self.df[['C']]
     filtered = msno.nullity_filter(self.df, filter='top', p=0.7, n=2)
     assert filtered.equals(wanted)
예제 #6
0
 def test_percentile_cutoff_top_n(self):
     """Top filter with ``n=1`` keeps only the single most complete column.

     The test name targets the 'top' filter, so the body exercises
     ``filter='top'`` rather than ``filter='bottom'``.
     """
     # Assumes 'C' is the fixture's most complete column, as the expectation
     # in test_combined_cutoff_top implies -- confirm against setUp.
     expected = self.df.loc[:, ['C']]
     result = msno.nullity_filter(self.df, n=1, filter='top')
     assert result.equals(expected)
예제 #7
0
 def test_percentile_cutoff_top_p(self):
     """Top filter with ``p=0.6`` keeps columns 'B' and 'C'."""
     wanted = self.df[['B', 'C']]
     filtered = msno.nullity_filter(self.df, filter='top', p=0.6)
     assert filtered.equals(wanted)
예제 #8
0
 def test_no_op(self):
     """Without ``p`` or ``n`` the frame comes back unchanged for every filter mode."""
     for options in ({}, {'filter': 'top'}, {'filter': 'bottom'}):
         assert self.df.equals(msno.nullity_filter(self.df, **options))