def M_data_scan(df: pd.DataFrame) -> None:
    """Print an overview of ``df`` and draw its missingno bar chart.

    The report covers the shape, the total memory usage in MB, the dtype
    of every column, and one line for each column that contains missing
    values, followed by a one-line summary.

    Args:
        df: The DataFrame to describe.
    """
    separator = '# -------------------------'

    # Section 1: dimensions and memory footprint.
    print(separator)
    print('# 维度与数据大小 ')
    print(separator)
    n_rows, n_cols = df.shape[0], df.shape[1]
    print("数据 {} 行 {} 列".format(n_rows, n_cols))
    megabytes = df.memory_usage().sum() / 1024**2
    print("数据占内存:{:.2f}MB".format(megabytes))
    print("数据集的特征类型:\n", df.dtypes)

    # Section 2: per-column dtype and missing count (incomplete columns only).
    print(separator)
    print('# 特征类型和缺失情况 ')
    print(separator)
    columns_with_gaps = 0
    for name in df.columns:
        series = df[name]
        kind = series.dtypes
        n_missing = series.isna().sum()
        if not n_missing > 0:
            continue
        print("特征名称:{}, 特征类型:{}, 缺失数量:{}".format(name, kind, n_missing))
        columns_with_gaps += 1

    # The summary counts columns that have gaps, not individual missing cells.
    if columns_with_gaps == 0:
        summary = '所有变量均无缺失'
    else:
        summary = '总共 %d 个缺失' % (columns_with_gaps)
    print(summary)

    msn.bar(df)
    plt.show()
def clean_solve():
    """Load ``test2.xlsx`` and return a cleaned table.

    Steps: draw a missingno bar chart of the raw data, drop columns whose
    non-null count is below half the row count, drop duplicate rows, keep
    four columns, then derive ``province``, ``city`` and ``sales``.

    Returns:
        DataFrame with ``raw_title``, ``view_price``, ``province``, ``city``
        (both categorical) and ``sales`` (int).
    """
    datatmsp = pd.read_excel('test2.xlsx')

    import missingno as msno  # only needed for the missing-data bar chart

    # Visualise missing data; the rows are shuffled first via sample().
    msno.bar(datatmsp.sample(len(datatmsp)), figsize=(10, 4))

    # Drop columns where more than half of the values are missing.
    half_count = len(datatmsp) / 2
    datatmsp = datatmsp.dropna(thresh=half_count, axis=1)

    # Drop duplicate rows.
    datatmsp = datatmsp.drop_duplicates()

    # Work on an explicit copy: the columns added below would otherwise be
    # written to a slice of ``datatmsp`` (chained assignment).
    data = datatmsp[['item_loc', 'raw_title', 'view_price', 'view_sales']].copy()

    # Province is the first space-separated token of the location.
    data['province'] = data.item_loc.apply(lambda x: x.split(' ')[0])

    # Locations shorter than 5 characters are treated as having province and
    # city identical, so the first token is used; otherwise the second token.
    data['city'] = data.item_loc.apply(
        lambda x: x.split()[0] if len(x) < 5 else x.split()[1])

    # Keep the text before '人' in the sales column and convert it to int.
    data['sales'] = data.view_sales.apply(lambda x: x.split('人')[0])
    data['sales'] = data.sales.astype('int')

    for column in ('province', 'city'):
        data[column] = data[column].astype('category')

    # The source columns are no longer needed once the derived ones exist.
    return data.drop(['item_loc', 'view_sales'], axis=1)
def missing_vals_vis(df, figsize=(8, 4)):
    """Draw the missingno matrix, dendrogram and bar chart of ``df``.

    Each chart is drawn on the axes of its own newly created figure.

    Args:
        df: Data to visualise.
        figsize: Size passed to ``plt.subplots`` for every figure.
    """
    for draw in (mn.matrix, mn.dendrogram, mn.bar):
        _, axis = plt.subplots(figsize=figsize)
        draw(df, ax=axis)
def plot_missing_bar(df):
    """Draw the missingno bar chart of ``df`` and hand the figure to ``st.pyplot``.

    The figure is resized to 3 x 2 inches before plotting.
    """
    width, height = 3, 2
    figure, axis = plt.subplots()
    figure.set_size_inches(width, height, forward=True)
    msno.bar(df, ax=axis)
    st.pyplot(figure)
def showMissValue(self):
    """Draw the missingno bar chart for ``self.data``.

    Nothing is written to disk.

    :return: None
    """
    frame = self.data
    msno.bar(frame)
def missing_values(self, dataframe, byclass=False):
    """
    Creates a bar plot for the count of missing values

    Parameters
    ----------
    dataframe : a Dask dataframe
        A Dask dataframe for which missing values are to be visualized.
        It is materialised with ``.compute()`` before plotting.

    byclass: bool, default = False
        Specifies whether separate plots should be made for each class
        of the target feature (``self.target_feature``)
    """
    if byclass:
        classes = dataframe[self.target_feature].unique()
        for c in classes:
            # Materialise only the rows belonging to this class.
            graph = dataframe[dataframe[self.target_feature] == c].compute()
            plt.figure()
            plt.title('Missing Values for Class - ' + str(c))
            plt.xlabel('Features')
            plt.ylabel('Missing Observations')
            missingno.bar(graph)
    else:
        # The whole dataset is plotted; ``graph`` was previously never
        # assigned on this path, which raised NameError.
        graph = dataframe.compute()
        plt.figure()
        plt.title('Missing Values in Dataset')
        plt.xlabel('Features')
        plt.ylabel('Missing Observations')
        missingno.bar(graph)
def missing_bar(df):
    """Show a missingno bar chart restricted to columns that contain nulls."""
    has_null = df.isnull().any()
    incomplete_columns = df.columns[has_null].tolist()
    subset = df[incomplete_columns]
    msno.bar(
        subset,
        figsize=(20, 8),
        color="#34495e",
        fontsize=12,
        labels=True,
    )
    plt.show()
def nan_analysis(df, figure_size=(12, 5)):
    """Draw missingno's bar, matrix and heatmap charts for ``df`` in turn.

    A 0.2 second pause is inserted between consecutive charts.

    Args:
        df: Data to inspect.
        figure_size: ``figsize`` forwarded to every chart.
    """
    pause_seconds = 0.2

    # Per-column view.
    msno.bar(df, figsize=figure_size)
    time.sleep(pause_seconds)

    # Per-row view.
    msno.matrix(df, figsize=figure_size)
    time.sleep(pause_seconds)

    # Nullity correlation between columns.
    msno.heatmap(df, figsize=figure_size)
def bar_missing(sample_df, title):
    """Plot, save and show a missingno bar chart of the columns with nulls.

    The figure is saved as ``graphs/<title>.png`` before being shown.

    Args:
        sample_df: Data to inspect.
        title: Plot title, also used as the output file name.
    """
    null_mask = sample_df.isnull().any()
    columns_with_nulls = sample_df.columns[null_mask].tolist()
    msno.bar(sample_df[columns_with_nulls],
             color="black", log=False, figsize=(30, 18))
    plt.title(title, fontsize=24, y=1.05)
    current_figure = plt.gcf()
    output_path = 'graphs/' + title + '.png'
    current_figure.savefig(output_path)
    plt.show()
def missings_viz(self, df, visualizar=True, escolhido_tipo=None, df_missings=False):
    '''
    Visualise missing values and optionally return a summary DataFrame.

    : param df: pd.DataFrame to inspect
    : param visualizar: bool, whether to draw a missingno chart
    : param escolhido_tipo: int selecting the chart: 1 = bar (totals),
        2 = matrix (order of appearance), 3 = heatmap (correlation),
        4 = dendrogram. When None, the choice is read from standard input.
        Any other value draws nothing.
    : param df_missings: bool, whether to return the exploration DataFrame
    : return: the DataFrame produced by ``dfExploracao`` sorted by
        ``tipos``, ``na_perct`` and ``quantUnicos`` when ``df_missings`` is
        true, otherwise None
    '''
    if visualizar:
        # For users with a dark IDE theme: force the classic style.
        from matplotlib.pyplot import style
        style.use('classic')

        # Only columns that contain at least one missing value are plotted.
        cols_miss = df.columns[df.isnull().any()]

        if escolhido_tipo is None:
            print('Tipo de visualizacao: ', '\n', 'total de missings - 1',
                  '\n', 'ordem de aparição - 2', '\n', 'correlação - 3',
                  '\n', 'dendograma - 4')
            escolhido_tipo = int(input())

        print('Visualização missings')
        # missingno is imported lazily, and only the plotter that is needed.
        if escolhido_tipo == 1:
            from missingno import bar
            bar(df[cols_miss])
        elif escolhido_tipo == 2:
            from missingno import matrix
            matrix(df[cols_miss])
        elif escolhido_tipo == 3:
            from missingno import heatmap
            heatmap(df[cols_miss])
        elif escolhido_tipo == 4:
            from missingno import dendrogram
            dendrogram(df[cols_miss])

    if df_missings:
        from funcoesProprias import dfExploracao
        print('Cálculo do percentual de missings num DataFrame')
        explora = dfExploracao(df)
        explora = explora.sort_values(['tipos', 'na_perct', 'quantUnicos'])
        return explora

    return None
def main(args):
    """Plot missing-data statistics for all train and test subjects.

    Reads ``timeseries.csv`` from every subject directory, keeps the
    variables listed under ``"variables"`` in the JSON config, and saves two
    missingno bar charts as PDFs in ``args.plots_path``:

    * ``missing_data_bar_plot.pdf`` - over all rows of all subjects;
    * ``no_variable_recording_per_subject.pdf`` - one row per subject, where
      a variable counts as missing if the subject has no value for it.

    Args:
        args: Object with ``subjects_path``, ``plots_path`` and ``config``
            attributes.

    Raises:
        FileNotFoundError: If neither the train nor the test directory exists.
    """
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    # NOTE(review): this only raises when BOTH directories are missing;
    # confirm whether a single missing directory should also be an error.
    if not (os.path.exists(train_path) or os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    elif not (os.path.exists(args.plots_path)):
        os.makedirs(args.plots_path)

    subject_directories_train = get_subject_dirs(train_path)
    subject_directories_test = get_subject_dirs(test_path)
    subject_directories = subject_directories_train + subject_directories_test

    with open(args.config) as f:
        config = json.load(f)
    variables = config['variables']

    # Collect per-subject pieces and build each DataFrame once at the end;
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every iteration.
    timeseries_frames = []
    has_values_rows = []
    for sd in tqdm(subject_directories):
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        ts = ts[variables]
        # True for each variable with at least one recorded value.
        has_values_rows.append(ts.notnull().any())
        timeseries_frames.append(ts)

    # All data in a single dataframe (original row indices are kept).
    if timeseries_frames:
        complete_data_df = pd.concat(timeseries_frames)
    else:
        complete_data_df = pd.DataFrame(columns=variables)

    # One row per subject: which variables have any value in the time series.
    subject_no_values_df = pd.DataFrame(
        has_values_rows, columns=variables).reset_index(drop=True)

    # Visualize the percentage of missing values per variable for all data
    ax = missingno.bar(complete_data_df,
                       color=(31 / 256, 119 / 256, 180 / 256))
    ax.figure.savefig(
        os.path.join(args.plots_path, 'missing_data_bar_plot.pdf'),
        format="pdf", bbox_inches='tight', pad_inches=0)

    # For each variable, visualize the percentage of subjects that have no
    # recorded measurement. False is turned into NaN so that missingno
    # counts "no values at all" as a missing entry.
    subject_no_values_df = subject_no_values_df.replace(False, np.nan)
    ax = missingno.bar(subject_no_values_df,
                       color=(31 / 256, 119 / 256, 180 / 256))
    ax.figure.savefig(
        os.path.join(args.plots_path, 'no_variable_recording_per_subject.pdf'),
        format="pdf", bbox_inches='tight', pad_inches=0)
def _plot_missing_numbers(dframe):
    """Show a missingno bar chart of ``dframe`` with fixed styling options."""
    import missingno as msno

    bar_options = {
        "sort": True,
        "figsize": (30, 8),
        "color": "#34495e",
        "fontsize": 15,
        "labels": True,
    }
    msno.bar(dframe, **bar_options)
    plt.show()
def missing_visualization(self):
    """Save three missing-value plots of ``self.data`` under ``../resources``.

    Files written: ``bar.png`` (missingno bar chart), ``correlation.png``
    (missingno nullity heatmap) and ``heat_map.png`` (seaborn heatmap of the
    boolean ``isnull()`` mask).
    """
    # Bar chart.
    msno.bar(self.data)
    plt.savefig('../resources/bar.png', bbox_inches='tight')

    # Nullity correlation.
    msno.heatmap(self.data)
    plt.savefig('../resources/correlation.png', bbox_inches='tight')

    # sns.heatmap is called without ``ax`` and therefore draws on the current
    # axes; start a fresh figure so the mask is not painted over the
    # correlation plot produced just above.
    plt.figure()
    sns.heatmap(self.data.isnull(), cbar=False)
    plt.savefig('../resources/heat_map.png', bbox_inches='tight')
def visualize_missing(df=None):
    """Display a heading, then the missingno matrix, bar chart and heatmap.

    All three charts use a 6 x 4 figure and font size 12; ``plt.show()`` is
    called once after they have been drawn.
    """
    print("")
    display(HTML('<h4>Visualize Missing Data ...</h4>'))
    print("")

    shared_options = {"figsize": (6, 4), "fontsize": 12}
    for plotter in (msno.matrix, msno.bar, msno.heatmap):
        plotter(df, **shared_options)

    plt.show()
def clean(file):
    """Load a CSV, plot its missing values, impute two columns, return a slice.

    ``Product_Category_2`` and ``Product_Category_3`` have their missing
    values replaced by the respective column mean, all column names are
    upper-cased, and progress messages are printed along the way.

    Side effect: sets ``pd.options.display.max_rows`` to 10 and
    ``pd.options.display.max_columns`` to 999.

    Args:
        file: Path (or buffer) accepted by ``pd.read_csv``.

    Returns:
        Rows at positions 1 through 9 of the cleaned DataFrame.
    """
    pd.options.display.max_rows = 10
    pd.options.display.max_columns = 999

    black = pd.read_csv(file)

    # Nullity matrix.
    ms.matrix(black)
    plt.show()

    # Bar plot.
    ms.bar(black)
    plt.show()

    # ANSI escape codes used to style the console messages.
    bold = '\033[1m'
    underline = '\033[4m'
    end = '\033[0m'

    # NOTE(review): the counts below are hard-coded in the messages, not
    # computed from ``black``.
    print(bold + "From visualization and isnull command I found out:\n")
    print("Product_Category_2 has 166986\t\t")
    print("Product_Category_3 has 3732299\n")

    # Fill gaps with the column mean. The mean must be taken of the column
    # and passed to fillna; calling .mean() on the fillna result would
    # collapse the whole column to a single scalar.
    for column in ("Product_Category_2", "Product_Category_3"):
        black[column] = black[column].fillna(black[column].mean())

    print(black.head())
    print(
        bold + underline +
        "replacing the na values with mean and Removing inconsistency from the data set"
        + end + '\n')

    print("here I am changing all the lower case column names to uppercase\n")
    black.columns = black.columns.str.upper()
    print(black.columns)

    # NOTE(review): the slice starts at position 1, so the first row is
    # skipped - confirm this is intended.
    return black.iloc[1:10]
def data_clean(self, datatmsp):
    """Plot missing values, prune the frame, and store the result on ``self``.

    Sets ``self.data`` to the ``title``, ``province``, ``region``,
    ``discount_price`` and ``sale`` columns of the pruned frame, and
    ``self.title`` to the list of titles.
    """
    shuffled = datatmsp.sample(len(datatmsp))
    msno.bar(shuffled, figsize=(10, 4), color='purple')

    # Drop columns where more than half of the values are missing,
    # then drop duplicate rows.
    min_non_null = len(datatmsp) / 2
    pruned = datatmsp.dropna(thresh=min_non_null, axis=1)
    pruned = pruned.drop_duplicates()

    wanted_columns = ['title', 'province', 'region', 'discount_price', 'sale']
    self.data = pruned[wanted_columns]
    self.title = self.data.title.values.tolist()
def visualizing_nulls(df, graph):
    '''
    Visualise nulls in a dataframe using the missingno package.

    Parameters
    ----------
    df : the dataframe to inspect
    graph : str
        'nullity' draws the matrix, 'bar' draws a log-scaled bar chart,
        'corr' draws the heatmap. Any other value draws nothing.

    Nothing is returned; the figure is displayed with ``plt.show()``.
    '''
    if graph == 'nullity':
        mno.matrix(df)
    elif graph == 'bar':
        # ``log`` is a boolean flag; the string 'True' only worked because
        # any non-empty string is truthy.
        mno.bar(df, color='purple', log=True, figsize=(30, 18))
    elif graph == 'corr':
        mno.heatmap(df, figsize=(20, 20))
    plt.show()
def plot_md_percent(data, **kwargs):
    """Plot the percentage of missing data by column within a DataFrame.

    Args:
        data (pd.DataFrame): DataFrame to plot.
        **kwargs: Keyword arguments for plot. Passed to missingno.bar and to
            ``_default_plot_args``. ``figsize`` is always taken from the
            ``"figure.figsize"`` default, so it must not be supplied here.

    Returns:
        The value returned by ``missingno.bar`` (the plot's axes).

    NOTE(review): the previous docstring stated that a decorator raises
    TypeError when ``data`` is not a DataFrame; no decorator is visible on
    this definition - confirm.
    """
    defaults = _default_plot_args(**kwargs)
    # Return the axes so the documented return value is actually delivered.
    return msno.bar(data, figsize=defaults["figure.figsize"], **kwargs)
def plot_missing_values(df, kind="matrix"):
    """Draw one missingno chart for ``df`` and return the plotter's result.

    Args:
        df: Data to inspect.
        kind: ``"matrix"``, ``"bar"`` or ``"heatmap"``.

    Returns:
        Whatever the selected missingno function returns, or None when
        ``kind`` is not one of the supported names.
    """
    if kind == "matrix":
        return msno.matrix(df)
    if kind == "bar":
        return msno.bar(df)
    if kind == "heatmap":
        # Returned like the other two kinds (the result used to be dropped).
        return msno.heatmap(df)
    return None
def view_missingvalue(df):
    """Show the missingno bar chart and dendrogram, then return the DataFrame.

    The input is wrapped in ``pd.DataFrame`` first, and that DataFrame is
    what gets returned. The seaborn heatmap, ``msno.matrix`` and
    ``msno.heatmap`` views are currently disabled.
    """
    df = pd.DataFrame(df)

    import seaborn as sns
    import missingno as msno

    # Bar chart, drawn with a logarithmic scale.
    msno.bar(df, log=True)
    plt.title('msno.bar')
    plt.show()

    # Dendrogram of the columns.
    msno.dendrogram(df)
    plt.title('msno.dendrogram')
    plt.show()

    return df
def missing_values_analysis(df: pd.DataFrame) -> None:
    """
    Analyse missing values in dataframe attributes.

    Prints ``<column>: <count> (<percentage>%)`` for every column that has
    at least one missing value, then draws the missingno bar chart.

    Args:
        df: Input dataframe.
    """
    total_rows = len(df)
    for column in df.columns:
        missing_count = int(df[column].isna().sum())
        if missing_count == 0:
            continue
        # Computed only for columns with gaps: a positive count implies
        # total_rows > 0, so a zero-row frame cannot divide by zero.
        missing_percentage = round(missing_count / total_rows * 100, 2)
        print(f'{column}: {missing_count} ({missing_percentage}%)')

    print(
        '\nMissing values plot (inverse logic, plot is showing how many values are not NaN):'
    )
    msno.bar(df)
def miss_value():
    """Read ``model.csv`` and draw four missingno charts for it.

    Requires the ``missingno`` package (``pip install missingno``).
    """
    import missingno as msno
    import pandas as pd
    import numpy as ny

    source_file = "model.csv"
    data = pd.read_csv(source_file)

    # Nullity matrix: a dense display of where values are missing.
    matrix_options = {"labels": True, "inline": False, "sort": 'descending'}
    msno.matrix(data, **matrix_options)

    # Bar chart.
    msno.bar(data)

    # Heatmap: how strongly the presence or absence of one variable relates
    # to that of another. A value of 1 means both are missing together;
    # -1 means one is missing exactly when the other is present.
    msno.heatmap(data)

    # Dendrogram: hierarchical clustering of the variables by their nullity
    # correlation; the more closely a set of variables matches, the nearer
    # to zero they join on the y axis.
    msno.dendrogram(data)
def plot_miss(self, filename: str, asc=0, figsize=(10, 6)):
    """
    Draw a missing-value chart for ``self._df`` and save it to a file.

    :param filename: str, output path and file name (passed through
        ``check_str`` first)
    :param asc: int, chart type: matrix (asc=0), bar chart (asc=1),
        heatmap (any other value)
    :param figsize: tuple, figure size; used by the bar chart and heatmap
    :return: None; the figure is saved with ``plt.savefig``
    """
    target = check_str(filename)
    frame = self._df

    if asc == 1:
        msno.bar(df=frame, figsize=figsize)
    elif asc == 0:
        msno.matrix(df=frame)
    else:
        msno.heatmap(df=frame, figsize=figsize)

    plt.savefig(target)
def show_NaN(data, features_list, nplots):
    """Plot the missingno matrix alone, or matrix and bar chart stacked.

    Args:
        data: Data to inspect.
        features_list: Not used by this function.
        nplots: 1 draws only the matrix; any other value draws the matrix
            and the bar chart on two rows of one figure.
    """
    if nplots == 1:
        # Matrix only.
        msno.matrix(data, labels=True, fontsize=8, figsize=(9, 10))
        return

    # Matrix on the top row, bar chart on the bottom row.
    figure, (top_axis, bottom_axis) = plt.subplots(nrows=2, figsize=(9, 12))
    msno.matrix(data, labels=True, fontsize=8, ax=top_axis, sparkline=False)
    msno.bar(data, labels=True, fontsize=8, ax=bottom_axis)
    # Fit the figure to its content.
    figure.tight_layout()
def label_distribution(data):
    """Show three plots: Outcome counts, a pair plot, and the missingno bar chart."""
    # Bar chart of how often each Outcome value occurs.
    outcome_counts = data.Outcome.value_counts()
    outcome_counts.plot(kind='bar')
    plt.show()

    # Pairwise feature distributions coloured by Outcome. Per the original
    # note, some zero values in this data actually stand for missing entries.
    seaborn.pairplot(data, hue='Outcome')
    plt.show()

    # Missing values per column.
    msn.bar(data)
    plt.show()
def PlotMissingBar(self, df, start, end):
    """
    Show a missingno bar chart for a positional range of columns.

    input:
        df: dataframe
        start: position of the first column to include
        end: position one past the last column to include (``iloc`` slice)
    """
    msno.bar(df.iloc[:, start:end], figsize=(20, 14))
    # plt.show() displays all open figures; its only argument is the
    # keyword-only ``block``, so the axes must not be passed to it.
    plt.show()
def missing_bar(data: pd.DataFrame) -> str:
    """Generate missing values bar plot.

    Args:
        data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
        The result of ``plot_360_n0sc0pe`` for the finished plot (the plot
        encoded as a string).
    """
    bar_color = hex_to_rgb(config["style"]["primary_color"].get(str))
    label_size = get_font_size(data)
    missingno.bar(data, figsize=(10, 5), color=bar_color, fontsize=label_size)

    # Turn the grid off on every axes of the current figure.
    current_figure = plt.gcf()
    for axes in current_figure.get_axes():
        axes.grid(False)

    plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
    return plot_360_n0sc0pe(plt)
def missing_value_plotting(self):
    '''
    Draw missing-value plots for ``self.dataframe``.

    Four missingno plots are drawn: bar chart, matrix, heatmap and
    dendrogram. The first three are given a title.

    Parameters
    ----------
    None.

    Returns
    -------
    None. The plots are drawn but not written to disk by this method.
    '''
    print("Plotting Missing Values...")

    # Bar chart (was previously titled as the matrix).
    plt.figure()
    msno.bar(self.dataframe)
    plt.title("Diagramme à barres des valeurs manquantes des données\n",
              fontsize=18)

    # Matrix. Per the original note, the sparkline at the right summarises
    # the general shape of the data completeness and points out the rows
    # with the maximum and minimum nullity in the dataset.
    plt.figure()
    msno.matrix(self.dataframe)
    plt.title("Matrice des valeurs manquantes des données\n", fontsize=18)

    # Nullity correlation heatmap:
    #   near -1: if one variable appears, the other is very likely missing;
    #   near  0: no dependence between the missing values of the two;
    #   near  1: if one variable appears, the other is very likely present.
    plt.figure()
    msno.heatmap(self.dataframe)
    plt.title("Carte de chaleur des valeurs manquantes des données\n",
              fontsize=18)

    plt.figure()
    msno.dendrogram(self.dataframe)
def na_plots(self, df, name):
    """Draw and publish the missing-value plots for ``df``.

    Each plot is handed to ``self._plot_show`` under the section
    ``dataset_explore.<self.name>``. The ``name`` argument is not used.
    """
    # Matrix of missing values across the data frame.
    msno.matrix(df)
    n_columns = len(df.columns)
    self._plot_show("Missing value dataFrame plot",
                    f'dataset_explore.{self.name}',
                    count_vars_x=n_columns)

    # Bar plot per column; a ValueError here is logged and skipped.
    try:
        msno.bar(df)
        self._plot_show("Missing value by column",
                        f'dataset_explore.{self.name}',
                        count_vars_x=n_columns)
    except ValueError as ve:
        self._debug(f"Exception when invoking missingno.bar: {ve}")

    # Heatmap: correlation of missing values between columns.
    msno.heatmap(df)
    self._plot_show("Nullity correlation",
                    f'dataset_explore.{self.name}',
                    count_vars_x=n_columns,
                    count_vars_y=n_columns)
def missing_values(df: pd.DataFrame):
    '''
    Print the number of missing values for every column that has any,
    then draw the missingno bar chart.

    Parameters:
    -----------
    (1) df --> given DataFrame;
    -----------

    Returns the result of ``missingno.bar(df)``.
    '''
    has_nulls = df.isnull().any(axis=0)
    for column in df.columns[has_nulls]:
        n_missing = df[column].isnull().sum()
        print(f'Column "{column}" has {n_missing} missing values.')

    return missingno.bar(df)
def test_log_bar(self):
    """Draw a log-scaled bar chart of ``self.simple_df``; return the current figure."""
    frame = self.simple_df
    msno.bar(frame, log=True)
    figure = plt.gcf()
    return figure
def test_simple_bar(self):
    """Draw the default bar chart of ``self.simple_df``; return the current figure."""
    frame = self.simple_df
    msno.bar(frame)
    figure = plt.gcf()
    return figure