Exemplo n.º 1
0
def M_data_scan(df: pd.DataFrame) -> None:
    """Print an overview of ``df`` and draw its missing-value bar chart.

    The report lists the shape, the memory footprint in MB, the dtype of
    every column, and each column that contains missing values together
    with its dtype and missing count.
    """
    banner = '# -------------------------'

    # Section 1: dimensions and memory usage.
    print(banner)
    print('#        维度与数据大小     ')
    print(banner)
    n_rows, n_cols = df.shape[0], df.shape[1]
    print("数据 {} 行 {} 列".format(n_rows, n_cols))
    megabytes = df.memory_usage().sum() / 1024**2
    print("数据占内存:{:.2f}MB".format(megabytes))
    print("数据集的特征类型:\n", df.dtypes)

    # Section 2: per-column dtype and missing counts.
    print(banner)
    print('#   特征类型和缺失情况 ')
    print(banner)
    columns_with_gaps = 0
    for name in df.columns:
        series = df[name]
        kind = series.dtypes
        n_missing = series.isna().sum()
        if n_missing > 0:
            print("特征名称:{}, 特征类型:{}, 缺失数量:{}".format(name, kind, n_missing))
            columns_with_gaps += 1

    if columns_with_gaps == 0:
        summary = '所有变量均无缺失'
    else:
        summary = '总共 %d 个缺失' % (columns_with_gaps)
    print(summary)

    msn.bar(df)
    plt.show()
Exemplo n.º 2
0
def clean_solve():
    """Load ``test2.xlsx``, plot its missingness and return a cleaned frame.

    Steps: draw a missingno bar chart, drop columns whose non-null count is
    below half the row count, drop duplicate rows, then derive ``province``,
    ``city`` and ``sales`` from ``item_loc`` and ``view_sales``.

    Returns:
        DataFrame with the columns ``raw_title``, ``view_price``,
        ``province``, ``city`` and ``sales``.
    """
    import missingno as msno  # draws the missing-data distribution chart

    datatmsp = pd.read_excel('test2.xlsx')
    msno.bar(datatmsp.sample(len(datatmsp)), figsize=(10, 4))

    # Drop columns in which more than half of the values are missing.
    half_count = len(datatmsp) / 2
    datatmsp = datatmsp.dropna(thresh=half_count, axis=1)
    # Drop duplicated rows.
    datatmsp = datatmsp.drop_duplicates()

    # Work on an explicit copy: the column assignments below would otherwise
    # target a slice of ``datatmsp`` (SettingWithCopyWarning / lost writes).
    data = datatmsp[['item_loc', 'raw_title', 'view_price',
                     'view_sales']].copy()

    # Split the location column into province and city.
    data['province'] = data.item_loc.apply(lambda x: x.split(' ')[0])
    # Municipalities use the same name for province and city, so short
    # location strings (fewer than 5 characters) reuse the first token.
    data['city'] = data.item_loc.apply(lambda x: x.split()[0]
                                       if len(x) < 5 else x.split()[1])
    # Keep the part of the sales text before '人' and convert it to int.
    data['sales'] = data.view_sales.apply(
        lambda x: x.split('人')[0]).astype('int')

    for col in ('province', 'city'):
        data[col] = data[col].astype('category')

    # Remove the raw columns that have been replaced by derived ones.
    data = data.drop(['item_loc', 'view_sales'], axis=1)
    return data
Exemplo n.º 3
0
def missing_vals_vis(df, figsize=(8, 4)):
    """Draw the missingno matrix, dendrogram and bar chart of ``df``.

    Each chart gets its own freshly created figure of size ``figsize``.
    """
    _, matrix_ax = plt.subplots(figsize=figsize)
    mn.matrix(df, ax=matrix_ax)

    _, dendrogram_ax = plt.subplots(figsize=figsize)
    mn.dendrogram(df, ax=dendrogram_ax)

    _, bar_ax = plt.subplots(figsize=figsize)
    mn.bar(df, ax=bar_ax)
Exemplo n.º 4
0
def plot_missing_bar(df):
    """Draw the missingno bar chart of ``df`` and pass the figure to ``st.pyplot``."""
    figure, axes = plt.subplots()
    # Resize the figure to 3x2 inches before drawing.
    figure.set_size_inches(3, 2, forward=True)
    msno.bar(df, ax=axes)
    st.pyplot(figure)
Exemplo n.º 5
0
 def showMissValue(self):
     """Draw the missingno bar chart for ``self.data``.

     The figure is only drawn, not saved.

     :return: None
     """
     frame = self.data
     msno.bar(frame)
Exemplo n.º 6
0
    def missing_values(self, dataframe, byclass=False):
        """
        Creates a bar plot for the count of missing values

        Parameters
        ----------
        dataframe : a Dask dataframe
            A Dask dataframe for which missing values are to be visualized
        byclass: bool, default = False
            Specifies whether separate plots should be made for each class
            of the target feature (``self.target_feature``)

        """
        if byclass:
            classes = dataframe[self.target_feature].unique()
            for c in classes:
                # Materialise only the rows that belong to this class.
                graph = dataframe[dataframe[self.target_feature] ==
                                  c].compute()
                plt.figure()
                plt.title('Missing Values for Class - ' + str(c))
                plt.xlabel('Features')
                plt.ylabel('Missing Observations')
                missingno.bar(graph)
        else:
            # The original referenced ``graph`` here without ever assigning
            # it, which raised NameError; plot the whole computed frame.
            graph = dataframe.compute()
            plt.figure()
            plt.title('Missing Values in Dataset')
            plt.xlabel('Features')
            plt.ylabel('Missing Observations')
            missingno.bar(graph)
Exemplo n.º 7
0
def missing_bar(df):
    """Show a missingno bar chart restricted to columns that contain nulls."""
    has_nulls = df.isnull().any()
    incomplete_columns = df.columns[has_nulls].tolist()
    bar_options = {
        'figsize': (20, 8),
        'color': "#34495e",
        'fontsize': 12,
        'labels': True,
    }
    msno.bar(df[incomplete_columns], **bar_options)
    plt.show()
Exemplo n.º 8
0
def nan_analysis(df, figure_size=(12, 5)):
    """Draw the missingno bar, matrix and heatmap charts of ``df`` in turn.

    A 0.2 second pause follows the bar chart and the matrix chart.
    """
    pause_seconds = 0.2

    # Completeness of each feature.
    msno.bar(df, figsize=figure_size)
    time.sleep(pause_seconds)

    # Nullity pattern across rows.
    msno.matrix(df, figsize=figure_size)
    time.sleep(pause_seconds)

    # Nullity correlation between features.
    msno.heatmap(df, figsize=figure_size)
Exemplo n.º 9
0
def bar_missing(sample_df, title):
    """Plot, save and show a bar chart of the columns that contain nulls.

    The figure is titled ``title`` and written to ``graphs/<title>.png``
    before being shown.
    """
    has_nulls = sample_df.isnull().any()
    incomplete_columns = sample_df.columns[has_nulls].tolist()

    msno.bar(
        sample_df[incomplete_columns],
        color="black",
        log=False,
        figsize=(30, 18),
    )
    plt.title(title, fontsize=24, y=1.05)

    output_path = 'graphs/' + title + '.png'
    current_figure = plt.gcf()
    current_figure.savefig(output_path)
    plt.show()
    def missings_viz(self,
                     df,
                     visualizar=True,
                     escolhido_tipo=None,
                     df_missings=False):
        '''
        Visualize missing values and/or return a missing-value summary.

        : param df: pd.DataFrame to inspect
        : param visualizar: bool, whether to draw a missingno plot
        : param escolhido_tipo: int selecting the plot (1 = bar, 2 = matrix,
            3 = heatmap, 4 = dendrogram); when None the choice is read
            from standard input
        : param df_missings: bool, whether to return the summary DataFrame
        : return: the sorted result of ``dfExploracao(df)`` when
            ``df_missings`` is true, otherwise None
        '''

        if visualizar:
            # for anyone using a dark theme in the IDE
            from matplotlib.pyplot import style
            style.use('classic')

            # keep only the columns that contain missing values
            cols_miss = df.isnull().any()
            cols_miss = df.columns[cols_miss]

            # Identity check: ``== None`` can be hijacked by objects that
            # overload ``__eq__`` and is not the idiomatic None test.
            if escolhido_tipo is None:
                print('Tipo de visualizacao: ', '\n', 'total de missings - 1',
                      '\n', 'ordem de aparição - 2', '\n', 'correlação - 3',
                      '\n', 'dendograma - 4')
                escolhido_tipo = int(input())

            print('Visualização missings')
            # totals
            if escolhido_tipo == 1:
                from missingno import bar
                bar(df[cols_miss])
            # order of appearance
            elif escolhido_tipo == 2:
                from missingno import matrix
                matrix(df[cols_miss])
            # correlation
            elif escolhido_tipo == 3:
                from missingno import heatmap
                heatmap(df[cols_miss])
            # dendrogram
            elif escolhido_tipo == 4:
                from missingno import dendrogram
                dendrogram(df[cols_miss])

        if df_missings:
            from funcoesProprias import dfExploracao

            print('Cálculo do percentual de missings num DataFrame')
            explora = dfExploracao(df)
            explora = explora.sort_values(['tipos', 'na_perct', 'quantUnicos'])
            return explora
Exemplo n.º 11
0
def main(args):
    """Plot missing-data statistics for all subjects' time series.

    Reads ``timeseries.csv`` from every subject directory under
    ``<args.subjects_path>/train`` and ``<args.subjects_path>/test``, keeps
    the variables listed under ``"variables"`` in the JSON file
    ``args.config``, and writes two PDF bar plots into ``args.plots_path``:

    * ``missing_data_bar_plot.pdf`` - completeness per variable over all rows
    * ``no_variable_recording_per_subject.pdf`` - per variable, the share of
      subjects that have at least one recorded value

    Raises:
        FileNotFoundError: if neither the train nor the test directory exists.
    """
    train_path = os.path.join(args.subjects_path, 'train')
    test_path = os.path.join(args.subjects_path, 'test')

    # NOTE(review): this only fails when BOTH directories are missing, yet
    # both are read below - confirm whether a single missing one is allowed.
    if not (os.path.exists(train_path) or os.path.exists(test_path)):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                train_path)
    os.makedirs(args.plots_path, exist_ok=True)

    subject_directories = (get_subject_dirs(train_path) +
                           get_subject_dirs(test_path))

    with open(args.config) as f:
        config = json.load(f)
    variables = config['variables']

    # Collect the pieces and build the frames once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    frames = []
    presence_rows = []
    for sd in tqdm(subject_directories):
        ts = pd.read_csv(os.path.join(sd, 'timeseries.csv'))
        ts = ts[variables]
        frames.append(ts)
        # True for every variable with at least one value for this subject.
        presence_rows.append(ts.notnull().any().to_dict())

    # All time series rows of all subjects in a single dataframe.
    if frames:
        complete_data_df = pd.concat(frames)
    else:
        complete_data_df = pd.DataFrame(columns=variables)
    # One row per subject, one column per variable.
    subject_no_values_df = pd.DataFrame(presence_rows,
                                        columns=variables,
                                        dtype=object)

    bar_color = (31 / 256, 119 / 256, 180 / 256)

    # Visualize the percentage of missing values per variable for all data
    ax = missingno.bar(complete_data_df, color=bar_color)
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'missing_data_bar_plot.pdf'),
                      format="pdf",
                      bbox_inches='tight',
                      pad_inches=0)

    # For each variable, visualize the percentage of subjects that have no
    # recorded measurement: turn "no value" (False) into NaN so that
    # missingno counts it as missing.
    subject_no_values_df = subject_no_values_df.replace(False, np.nan)
    ax = missingno.bar(subject_no_values_df, color=bar_color)
    ax.figure.savefig(os.path.join(args.plots_path,
                                   'no_variable_recording_per_subject.pdf'),
                      format="pdf",
                      bbox_inches='tight',
                      pad_inches=0)
Exemplo n.º 12
0
def _plot_missing_numbers(dframe):
    """Show the missingno bar chart of ``dframe`` on a wide figure."""
    import missingno as msno

    bar_options = dict(
        sort=True,
        figsize=(30, 8),
        color="#34495e",
        fontsize=15,
        labels=True,
    )
    msno.bar(dframe, **bar_options)
    plt.show()
Exemplo n.º 13
0
    def missing_visualization(self):
        """Save three missing-value plots of ``self.data`` as PNG files.

        Files written under ``../resources/``: ``bar.png`` (missingno bar
        chart), ``correlation.png`` (missingno nullity heatmap) and
        ``heat_map.png`` (seaborn heatmap of the null mask).
        """
        # bar chart
        msno.bar(self.data)
        plt.savefig('../resources/bar.png', bbox_inches='tight')

        # nullity correlation
        msno.heatmap(self.data)
        plt.savefig('../resources/correlation.png', bbox_inches='tight')

        # heat map of the raw null mask; seaborn draws on the current axes
        # when no ``ax`` is given, so start a new figure to avoid painting
        # over the correlation plot above
        plt.figure()
        sns.heatmap(self.data.isnull(), cbar=False)
        plt.savefig('../resources/heat_map.png', bbox_inches='tight')
Exemplo n.º 14
0
def visualize_missing(df=None):
    """Display a heading, then the missingno matrix, bar chart and heatmap.

    All three charts are drawn with a 6x4 figure and font size 12, and
    ``plt.show()`` is called once at the end.
    """
    heading = HTML('<h4>Visualize Missing Data ...</h4>')
    print("")
    display(heading)
    print("")

    chart_options = {'figsize': (6, 4), 'fontsize': 12}
    msno.matrix(df, **chart_options)
    msno.bar(df, **chart_options)
    msno.heatmap(df, **chart_options)
    plt.show()
Exemplo n.º 15
0
def clean(file):
    """Load a CSV, plot its missingness, fill two columns and return a slice.

    The missing values of ``Product_Category_2`` and ``Product_Category_3``
    are replaced by the respective column mean, all column names are
    upper-cased, and progress messages are printed along the way.

    Args:
        file: path or buffer accepted by ``pd.read_csv``.

    Returns:
        Rows 1 to 9 (``iloc[1:10]``) of the cleaned DataFrame.
    """
    pd.options.display.max_rows = 10
    pd.options.display.max_columns = 999
    black = pd.read_csv(file)

    # Nullity matrix
    ms.matrix(black)
    plt.show()
    # Bar plot
    ms.bar(black)
    plt.show()

    class color:
        # ANSI escape sequences for terminal output

        PURPLE = '\033[95m'
        CYAN = '\033[96m'
        DARKCYAN = '\033[36m'
        BLUE = '\033[94m'
        GREEN = '\033[92m'
        YELLOW = '\033[93m'
        RED = '\033[91m'
        BOLD = '\033[1m'
        UNDERLINE = '\033[4m'
        END = '\033[0m'

    print(color.BOLD + "From visualization and isnull command I found out:\n")
    print("Product_Category_2 has 166986\t\t")
    print("Product_Category_3 has 3732299\n")
    black["Product_Category_2"] = black.Product_Category_2.fillna(
        black['Product_Category_2'].mean())
    # The original closed the parenthesis too early and assigned the scalar
    # mean of the column to every row; fill only the missing entries.
    black['Product_Category_3'] = black.Product_Category_3.fillna(
        black['Product_Category_3'].mean())
    # Call head(); printing the bound method only shows its repr.
    print(black.head())
    print(
        color.BOLD + color.UNDERLINE +
        "replacing  the na values with mean and Removing inconsistency from the data set"
        + color.END + '\n')
    print("here I am changing all the lower case column names to uppercase\n")
    black.columns = black.columns.str.upper()
    print(black.columns)

    b = black.iloc[1:10]
    return b
    def data_clean(self, datatmsp):
        """Plot missingness, prune the frame and store the selected columns.

        Sets ``self.data`` to the five kept columns and ``self.title`` to
        the list of values of the ``title`` column.
        """
        shuffled = datatmsp.sample(len(datatmsp))
        msno.bar(shuffled, figsize=(10, 4), color='purple')

        # Drop columns whose non-null count is below half the row count.
        min_non_null = len(datatmsp) / 2
        pruned = datatmsp.dropna(thresh=min_non_null, axis=1)
        # Drop duplicated rows.
        pruned = pruned.drop_duplicates()

        kept_columns = [
            'title', 'province', 'region', 'discount_price', 'sale'
        ]
        self.data = pruned[kept_columns]

        self.title = self.data.title.values.tolist()
Exemplo n.º 17
0
def visualizing_nulls(df, graph):
    '''
    Visualize nulls using the missingno package.

    Takes a dataframe and the type of graph wanted - 'nullity' (matrix),
    'bar' (log-scaled bar chart) or 'corr' (heatmap) - draws it and calls
    ``plt.show()``. Any other value draws nothing. Returns None.
    '''

    if graph == 'nullity':
        mno.matrix(df)
    elif graph == 'bar':
        # ``log`` is a boolean flag; the original passed the string 'True',
        # which only worked because non-empty strings are truthy.
        mno.bar(df, color='purple', log=True, figsize=(30, 18))
    elif graph == 'corr':
        mno.heatmap(df, figsize=(20, 20))

    plt.show()
Exemplo n.º 18
0
def plot_md_percent(data, **kwargs):
    """Plot the percentage of missing data by column within a DataFrame.

    Args:
        data (pd.DataFrame): DataFrame to plot.
        **kwargs: Keyword arguments for plot. Passed to missingno.bar.

    Returns:
        The object returned by ``missingno.bar`` for the missingness plot.
        (The original documented a return value but returned nothing.)

    Raises:
        TypeError: if data is not a DataFrame. Error raised through decorator.
    """
    defaults = _default_plot_args(**kwargs)
    return msno.bar(data, figsize=defaults["figure.figsize"], **kwargs)
Exemplo n.º 19
0
def plot_missing_values(df, kind="matrix"):
    """Draw one missingno plot of ``df`` and return what missingno returns.

    Args:
        df: DataFrame to visualize.
        kind: ``"matrix"``, ``"bar"`` or ``"heatmap"``.

    Returns:
        The result of the selected missingno function, or None when
        ``kind`` is not one of the supported names.
    """
    if kind == "matrix":
        return msno.matrix(df)
    if kind == "bar":
        return msno.bar(df)
    if kind == "heatmap":
        # The original dropped this result, unlike the other two branches.
        return msno.heatmap(df)
    return None
Exemplo n.º 20
0
def view_missingvalue(df):
    """Show missingno bar (log scale) and dendrogram plots; return the frame.

    ``df`` is first wrapped in ``pd.DataFrame``; that wrapped frame is what
    gets plotted and returned.
    """
    import seaborn as sns
    import missingno as msno

    frame = pd.DataFrame(df)

    # The seaborn heatmap, msno.matrix and msno.heatmap variants that used
    # to live here are disabled; only the two plots below are drawn.

    # Bar chart of per-column counts, drawn with log=True.
    msno.bar(frame, log=True)
    plt.title('msno.bar')
    plt.show()

    # Dendrogram of the columns' nullity.
    msno.dendrogram(frame)
    plt.title('msno.dendrogram')
    plt.show()

    return frame
Exemplo n.º 21
0
def missing_values_analysis(df: pd.DataFrame) -> None:
    """
    Analyse missing values in dataframe attributes.

    Prints ``<column>: <count> (<percentage>%)`` for every column that has
    missing values, then draws the missingno bar chart.

    Args:
        df: Input dataframe.
    """
    total_rows = len(df)
    for column in df.columns:
        missing_count = int(df[column].isna().sum())
        if missing_count > 0:
            # Computed only here: a positive count implies total_rows > 0,
            # so a zero-row frame no longer raises ZeroDivisionError.
            missing_percentage = round(missing_count / total_rows * 100, 2)
            print(f'{column}: {missing_count} ({missing_percentage}%)')

    print(
        '\nMissing values plot (inverse logic, plot is showing how many values are not NaN):'
    )
    msno.bar(df)
Exemplo n.º 22
0
def miss_value():
    """Read ``model.csv`` and draw four missingno charts for it.

    The charts are, in order: matrix, bar, heatmap and dendrogram.
    Requires the ``missingno`` package (``pip install missingno``).
    """
    import missingno as msno
    import pandas as pd
    import numpy as ny

    frame = pd.read_csv("model.csv")

    # Nullity matrix, labelled and sorted in descending order.
    msno.matrix(frame, labels=True, inline=False, sort='descending')

    # Bar chart.
    msno.bar(frame)

    # Nullity correlation heatmap: how strongly the presence or absence of
    # one variable relates to that of another.
    msno.heatmap(frame)

    # Dendrogram: variables grouped by hierarchical clustering of their
    # nullity correlation.
    msno.dendrogram(frame)
Exemplo n.º 23
0
 def plot_miss(self, filename: str, asc=0, figsize=(10, 6)):
     """
     Draw a missing-value plot of ``self._df`` and save it.

     :param filename: str, path and file name; passed through ``check_str``
     :param asc: int, plot selector: matrix (0), bar chart (1),
         heatmap (any other value)
     :param figsize: tuple, figure size for the bar chart and the heatmap
     :return: None; the figure is written with ``plt.savefig``
     """
     target = check_str(filename)

     if asc == 0:
         msno.matrix(df=self._df)
     elif asc == 1:
         msno.bar(df=self._df, figsize=figsize)
     else:
         msno.heatmap(df=self._df, figsize=figsize)

     plt.savefig(target)
Exemplo n.º 24
0
def show_NaN(data, features_list, nplots):
    """Draw the missingno matrix alone, or matrix and bar chart stacked.

    ``nplots == 1`` draws only the matrix; any other value draws the matrix
    and the bar chart on two rows of one figure. ``features_list`` is
    accepted but not used.
    """
    if nplots == 1:
        msno.matrix(data, labels=True, fontsize=8, figsize=(9, 10))
        return

    figure, (top_ax, bottom_ax) = plt.subplots(nrows=2, figsize=(9, 12))
    msno.matrix(data, labels=True, fontsize=8, ax=top_ax, sparkline=False)
    msno.bar(data, labels=True, fontsize=8, ax=bottom_ax)
    # Fit the figure to its content.
    figure.tight_layout()
Exemplo n.º 25
0
def label_distribution(data):
    """Show three plots: label counts, a pair plot and the missingno bar chart."""
    # Bar chart of the ``Outcome`` value counts.
    outcome_counts = data.Outcome.value_counts()
    outcome_counts.plot(kind='bar')
    plt.show()

    # Pairwise feature distributions coloured by ``Outcome``; per the
    # original author, some zeros in the data actually stand for missing values.
    seaborn.pairplot(data, hue='Outcome')
    plt.show()

    # Bar chart of the missing values.
    msn.bar(data)
    plt.show()
Exemplo n.º 26
0
    def PlotMissingBar(self, df, start, end):
        """
        Show the missingno bar chart for a positional range of columns.

        input:
            df: dataframe
            start: position of the first column to include
            end: position at which to stop (exclusive, as in ``iloc``)

        """
        msno.bar(df.iloc[:, start:end], figsize=(20, 14))
        # ``plt.show`` takes only the keyword ``block``; passing the Axes
        # positionally, as the original did, is rejected by current matplotlib.
        plt.show()
Exemplo n.º 27
0
def missing_bar(data: pd.DataFrame) -> str:
    """Generate missing values bar plot.

    Args:
      data: Pandas DataFrame to generate missing values bar plot from.

    Returns:
      The value produced by ``plot_360_n0sc0pe`` for the finished plot
      (annotated as a string).
    """
    primary_color = hex_to_rgb(config["style"]["primary_color"].get(str))
    font_size = get_font_size(data)
    missingno.bar(data, figsize=(10, 5), color=primary_color,
                  fontsize=font_size)

    # Turn the grid off on every axes of the current figure.
    for axes in plt.gcf().get_axes():
        axes.grid(False)

    plt.subplots_adjust(left=0.1, right=0.9, top=0.8, bottom=0.3)
    return plot_360_n0sc0pe(plt)
    def missing_value_plotting(self):
        '''
        Draw the missing-value plots of ``self.dataframe``.

        Four missingno plots are drawn in turn: bar chart, matrix, heatmap
        and dendrogram. The first three receive a French title.

        Parameters
        ----------
        None.

        Returns
        -------
        None.

        '''

        print("Plotting Missing Values...")

        # Bar chart (the original titled this plot "Matrice ...").
        plt.figure()
        msno.bar(self.dataframe)
        plt.title("Diagramme à barres des valeurs manquantes des données\n",
                  fontsize=18)

        # Matrix. The sparkline at right summarizes the general shape of the
        # data completeness and points out the rows with the maximum and
        # minimum nullity in the dataset. (Originally titled "Diagramme à
        # barres ...".)
        plt.figure()
        msno.matrix(self.dataframe)
        plt.title("Matrice des valeurs manquantes des données\n", fontsize=18)

        # Heatmap of nullity correlation:
        #   near -1: if one variable appears, the other is very likely missing
        #   near  0: no dependence between the missingness of the two variables
        #   near  1: if one variable appears, the other is very likely present
        # (Originally also titled "Diagramme à barres ...".)
        plt.figure()
        msno.heatmap(self.dataframe)
        plt.title("Carte de chaleur des valeurs manquantes des données\n",
                  fontsize=18)

        plt.figure()
        msno.dendrogram(self.dataframe)
Exemplo n.º 29
0
 def na_plots(self, df, name):
     """Draw and show the missingno matrix, bar chart and heatmap of ``df``.

     Each plot is passed to ``self._plot_show`` under the section
     ``dataset_explore.<self.name>``. A ``ValueError`` raised while
     producing the bar chart is reported through ``self._debug``.
     """
     # Nullity matrix of the data frame.
     msno.matrix(df)
     n_columns = len(df.columns)
     self._plot_show("Missing value dataFrame plot",
                     f'dataset_explore.{self.name}',
                     count_vars_x=n_columns)

     # Bar plot of the per-column counts.
     try:
         msno.bar(df)
         self._plot_show("Missing value by column",
                         f'dataset_explore.{self.name}',
                         count_vars_x=n_columns)
     except ValueError as error:
         self._debug(f"Exception when invoking missingno.bar: {error}")

     # Heatmap: correlation of missing values.
     msno.heatmap(df)
     self._plot_show("Nullity correlation",
                     f'dataset_explore.{self.name}',
                     count_vars_x=n_columns,
                     count_vars_y=n_columns)
Exemplo n.º 30
0
def missing_values(df: pd.DataFrame):
    '''Report columns with missing values and draw the missingno bar chart.

    Parameters:
    -----------
    (1) df --> given DataFrame;
    -----------
    Prints one line per column that contains NaN, giving its NaN count.
    Returns the result of ``missingno.bar(df)``.
    '''
    null_mask = df.isnull()
    incomplete_columns = df.columns[null_mask.any(axis=0)]
    for column in incomplete_columns:
        n_missing = df[column].isnull().sum()
        print(f'Column "{column}" has {n_missing} missing values.')
    chart = missingno.bar(df)
    return chart
Exemplo n.º 31
0
 def test_log_bar(self):
     """Draw the bar chart of ``self.simple_df`` with ``log=True``; return the current figure."""
     msno.bar(self.simple_df, log=True)
     figure = plt.gcf()
     return figure
Exemplo n.º 32
0
 def test_simple_bar(self):
     """Draw the default bar chart of ``self.simple_df``; return the current figure."""
     msno.bar(self.simple_df)
     figure = plt.gcf()
     return figure