Python dendrogram示例，missingno.dendrogram Python示例

示例#1

0

显示文件

    def missing_stats(self):
        """Print and plot an overview of the missing values in ``self.all``.

        Emits the frame summary, a seaborn heatmap of the null mask, missingno
        nullity-correlation heatmaps (whole frame, basement columns, garage
        columns) and a missingno dendrogram. When at least one feature column
        has missing values, a bar chart and count/percentage tables follow.
        """
        # Basic Stats
        self.all.info()

        # Heatmap of the raw null mask
        sns.heatmap(self.all.isnull(), cbar=False)

        # Feature columns with at least one missing value. 'SalePrice' is
        # excluded inside the filter: the previous list.remove('SalePrice')
        # raised ValueError whenever 'SalePrice' had no missing values.
        col_missing = [
            name for name in self.all.columns
            if name != 'SalePrice' and self.all[name].isnull().any()
        ]
        print(col_missing)

        msno.heatmap(self.all)
        plt.figure()
        msno.heatmap(self.all[['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath',
                               'BsmtHalfBath', 'BsmtUnfSF', 'TotalBsmtSF']])
        plt.figure()
        # 'GarageFinish' used to be listed twice, which selected the same
        # column twice and duplicated it in the plot.
        msno.heatmap(self.all[['GarageCond', 'GarageFinish', 'GarageQual',
                               'GarageType', 'GarageYrBlt']])
        plt.figure()
        msno.dendrogram(self.all)
        plt.figure()

        # Bar chart
        if col_missing:
            missing_counts = self.all[col_missing].isnull().sum()
            plt.figure(figsize=(12, 6))
            missing_counts.plot.bar(color='b')

            # Table: absolute counts, then percentage of all rows
            print(pd.DataFrame(missing_counts))
            print(missing_counts * 100 / self.all[col_missing].shape[0])

示例#2

0

显示文件

文件： analytica.py 项目： suchy1713/ML-Learning

def missing_vals_vis(df, figsize=(8, 4)):
    """Draw the missingno matrix, dendrogram and bar chart of ``df``.

    Each chart is drawn on the axes of a newly created figure of size
    ``figsize``. Nothing is returned.
    """
    for draw in (mn.matrix, mn.dendrogram, mn.bar):
        _, axes = plt.subplots(figsize=figsize)
        draw(df, ax=axes)

示例#3

0

显示文件

    def get_missings(self, missing_tag = None):
        '''
        Sometimes missing values are denoted with a number or string, 
        enter the missing tag to replace them with NAs
        '''
        if missing_tag is not None:
            self.df.replace(missing_tag, np.nan, inplace = True)
        
        # check if there are any null values
        if self.df.isnull().sum().sum() == 0:
            print('''There is no missing value, please check if the missings have been encoded with non-NAN value.
Use argument missing_tag for encoded missing values''')
        else:
            # missing heatmap display the missing values position in the dataset
            missing_heatmap = plt.figure(1)
            msno.matrix(self.df)
            plt.title('Missing Values shown in White',fontsize=25)

            # correlation plot: how strongly the presence or absence of one variable affects the presence of another
            correlation_plot = plt.figure(2)
            msno.heatmap(self.df,cbar= False)
            plt.title('Missing Values Correlation',fontsize=25)

            # The dendrogram uses a hierarchical clustering algorithm 
            # to bin variables against one another by their missing values correlation 
            missing_dendogram = plt.figure(3)
            msno.dendrogram(self.df)
            plt.title('Missing Values Dendrogram',fontsize=25)

示例#4

0

显示文件

文件： visualizacao.py 项目： Argentin03/aceleracao-Codenation-DataScience

    def missings_viz(self,
                     df,
                     visualizar=True,
                     escolhido_tipo=None,
                     df_missings=False):
        '''
        Visualize missing values and optionally return a per-column summary.

        : param df: pd.DataFrame to inspect
        : param visualizar: bool, whether to draw a missingno plot
        : param escolhido_tipo: int selecting the plot (1 = bar, 2 = matrix,
            3 = heatmap, 4 = dendrogram); when None the choice is read from
            standard input. Any other value draws nothing.
        : param df_missings: bool, whether to return the exploration DataFrame
        : return: the sorted result of ``dfExploracao(df)`` when
            ``df_missings`` is true, otherwise None
        '''

        if visualizar:
            # for those who use a dark theme in the IDE
            from matplotlib.pyplot import style
            style.use('classic')

            # keep only the columns that contain missing values
            cols_miss = df.columns[df.isnull().any()]

            # identity check: ``== None`` would invoke an overloaded __eq__
            if escolhido_tipo is None:
                print('Tipo de visualizacao: ', '\n', 'total de missings - 1',
                      '\n', 'ordem de aparição - 2', '\n', 'correlação - 3',
                      '\n', 'dendograma - 4')
                escolhido_tipo = int(input())

            print('Visualização missings')
            # totals
            if escolhido_tipo == 1:
                from missingno import bar
                bar(df[cols_miss])
            # order of appearance
            elif escolhido_tipo == 2:
                from missingno import matrix
                matrix(df[cols_miss])
            # correlation
            elif escolhido_tipo == 3:
                from missingno import heatmap
                heatmap(df[cols_miss])
            # dendrogram
            elif escolhido_tipo == 4:
                from missingno import dendrogram
                dendrogram(df[cols_miss])

        if df_missings:
            from funcoesProprias import dfExploracao

            print('Cálculo do percentual de missings num DataFrame')
            explora = dfExploracao(df)
            explora = explora.sort_values(['tipos', 'na_perct', 'quantUnicos'])
            return explora

示例#5

0

显示文件

文件： df_explore.py 项目： AstraZeneca-NGS/LogMl

 def dendogram_na(self):
     """ Plot the missing-value dendrogram of self.df, or skip when nothing is missing """
     has_missing = self.df.isna().sum().sum() > 0
     if not has_missing:
         message = f"Dendogram of missing values {self.name}: No missing values, skipping"
         self._debug(message)
         return
     msno.dendrogram(self.df)
     # the number of columns is handed to the plot helper as count_vars_x
     self._plot_show(
         f"Dendogram missing values",
         f'dataset_explore.{self.name}',
         count_vars_x=len(self.df.columns),
     )

示例#6

0

显示文件

def missing_dendrogram(data: pd.DataFrame) -> str:
    """Render the missing-values dendrogram of ``data`` as an encoded string.

    Args:
      data: Pandas DataFrame whose missing values are plotted.

    Returns:
      The value produced by ``plot_360_n0sc0pe`` for the pyplot state after
      drawing, i.e. the dendrogram plot encoded as a string.

    """
    # labels are drawn at twice the size suggested by get_font_size
    label_size = get_font_size(data) * 2.0
    missingno.dendrogram(data, fontsize=label_size)
    margins = {"left": 0.1, "right": 0.9, "top": 0.7, "bottom": 0.2}
    plt.subplots_adjust(**margins)
    return plot_360_n0sc0pe(plt)

示例#7

0

显示文件

def view_missingvalue(df):
    """Show missingno bar and dendrogram plots for ``df``.

    The input is wrapped in a ``pd.DataFrame``; a log-scaled bar chart and a
    dendrogram are then drawn and shown one after the other. The DataFrame is
    returned.
    """
    frame = pd.DataFrame(df)  # , columns=['date', 'time', 'category', 'si', 'dong', 'value'])

    # =========== with seaborn
    import seaborn as sns

    # ax = sns.heatmap(frame.isnull(), cbar=False)
    # plt.title('sns.heatmap')
    # plt.show()

    # =========== with missingno
    import missingno as msno

    # 1) matrix: summarises at most 50 labelled columns
    # ax = msno.matrix(frame)
    # plt.title('msno.matrix')
    # plt.show()

    # 2) bar chart: per-column totals (log=True or False)
    msno.bar(frame, log=True)
    plt.title('msno.bar')
    plt.show()

    # 3) heatmap: shows only the columns with missing values; useful for
    #    spotting correlations
    # ax = msno.heatmap(frame)
    # plt.title('msno.heatmap')
    # plt.show()

    # 4) dendrogram: useful for seeing how the columns with missing values
    #    relate to each other
    msno.dendrogram(frame)
    plt.title('msno.dendrogram')
    plt.show()

    return frame

示例#8

0

显示文件

文件： plots.py 项目： SebastianRehfeldt/missing-value-aware-feature-selection

def plot_nan_dendogram(data):
    """
        Draw the missingno dendrogram of ``data.X`` and return its result.

        Cluster leaves which linked together at a distance of zero
        fully predict one another's presence
    """
    features = data.X
    return missingno.dendrogram(features)

示例#9

0

显示文件

文件： utils.py 项目： shabarka/autoimpute

def plot_nullility_dendogram(data, **kwargs):
    """Plot the nullility dendogram of missing data within a DataFrame.

    Args:
        data (pd.DataFrame): DataFrame to plot.
        **kwargs: Keyword arguments for plot. Passed to missingno.dendrogram.
            A ``figsize`` given here takes precedence over the default.

    Returns:
        matplotlib.axes._subplots.AxesSubplot: nullility dendogram plot, i.e.
        whatever ``msno.dendrogram`` returns.

    Raises:
        TypeError: if data is not a DataFrame. Error raised through decorator.
        ValueError: dataset fully observed. Raised through helper method.
    """
    _fully_complete(data)
    defaults = _default_plot_args(**kwargs)
    # Use the default size only when the caller did not pass one; supplying
    # figsize both explicitly and through **kwargs raised a TypeError.
    kwargs.setdefault("figsize", defaults["figure.figsize"])
    # The plot used to be dropped although the docstring promises it.
    return msno.dendrogram(data, **kwargs)

示例#10

0

显示文件

文件： basic_mlp.py 项目： githmy/vnpymy

def miss_value():
    """Draw the four missingno charts for the data stored in ``model.csv``.

    Reads ``model.csv`` from the working directory and draws, in order, the
    nullity matrix, the bar chart, the nullity-correlation heatmap and the
    dendrogram. Nothing is returned.
    """
    # pip install missingno
    import missingno as msno
    import pandas as pd

    data = pd.read_csv("model.csv")
    # Nullity matrix: data-dense display of where values are missing.
    # NOTE(review): `inline` appears to have been dropped from newer missingno
    # releases -- confirm against the installed version.
    msno.matrix(data, labels=True, inline=False, sort='descending')
    # Bar chart
    msno.bar(data)
    # Heatmap: how strongly the presence or absence of one variable affects
    # the presence of another.
    # A correlation of 1 means that whenever X5 is missing, X1.1 is missing
    # too. A correlation of -1 means that where X7 is missing X8 is present,
    # and where X7 is present X8 is missing.
    msno.heatmap(data)
    # Dendrogram: a hierarchical clustering algorithm bins the variables
    # against one another by their nullity correlation (measured as binary
    # distance), splitting on whichever combination minimises the distance of
    # the remaining clusters. The more monotone a set of variables is, the
    # closer their total distance and their average distance (y axis) are to
    # zero.
    msno.dendrogram(data)

示例#11

0

显示文件

def missing_dendrogram(df):
    """Save the missing-value dendrogram of ``df`` as a PNG file.

    The image is written to ``datascience/missing_dendrogram.png``. The call
    stays best effort: a failure while drawing or saving is logged instead of
    being raised. Every figure opened during the call is closed again, also
    when an error occurs.
    """
    import logging

    figures_before = set(plt.get_fignums())
    try:
        plt.figure()
        # NOTE(review): dendrogram() may open a figure of its own instead of
        # drawing on the one created above; savefig() targets whichever
        # figure is current afterwards.
        dendrogram(df)
        plt.savefig('datascience/' + 'missing_dendrogram.png')
    except Exception:
        # A bare ``except: pass`` hid every failure, including
        # KeyboardInterrupt; keep going but leave a trace.
        logging.getLogger(__name__).exception(
            "Could not save the missing-value dendrogram")
    finally:
        # Close everything this call opened, not just the first figure.
        for number in set(plt.get_fignums()) - figures_before:
            plt.close(number)

示例#12

0

显示文件

def missing_viz(file_path):
    """
    Plot the nullity matrix and dendrogram of the data stored at file_path.

    *Reads the data with dfr.read_file_path and drops, in place, the rows
    whose co2_emissions_tonne value is missing.

    *Plots a matrix that allows for a quick inspection of nullity distribution
    and a dendrogram to more accurately correlate variable completion. To
    interpret the dendrogram:
        -read it from a top-down perspective
        -cluster leaves linked together at a distance of zero fully predict
        one another's presence
        -cluster leaves which split close to zero, but not at it, predict one
        another very well, but still imperfectly
        -the height of the cluster leaf tells you, in absolute terms, how
        often the records are "mismatched" or incorrectly filled, that is,
        how many values you would have to fill in or drop

    NOTE(review): no per-column sum of missing values is computed here,
    although an earlier description of this function mentioned one.
    """
    required_columns = ['co2_emissions_tonne']
    frame = dfr.read_file_path(file_path)
    frame.dropna(subset=required_columns, inplace=True)
    for draw in (msno.matrix, msno.dendrogram):
        draw(frame)
示例#13

0

显示文件

文件： visualization.py 项目： serkhanekarim/marketing-analysis

    def missing_value_plotting(self):
        '''
        Display plots describing the missing values of ``self.dataframe``.

        Parameters
        ----------
        None.

        Returns
        -------
        None. Four figures are created (bar chart, matrix, nullity
        correlation heatmap and dendrogram); this method does not write any
        of them to disk.

        '''

        print("Plotting Missing Values...")

        # The bar chart and the matrix used to carry each other's title.
        plt.figure()
        msno.bar(self.dataframe)
        plt.title("Diagramme à barres des valeurs manquantes des données\n",
                  fontsize=18)

        # The sparkline at right summarizes the general shape of the data
        # completeness and points out the rows with the maximum and minimum
        # nullity in the dataset.
        plt.figure()
        msno.matrix(self.dataframe)
        plt.title("Matrice des valeurs manquantes des données\n", fontsize=18)

        # A value near -1 means if one variable appears then the other
        # variable is very likely to be missing.
        # A value near 0 means there is no dependence between the occurrence
        # of missing values of two variables.
        # A value near 1 means if one variable appears then the other variable
        # is very likely to be present.
        plt.figure()
        msno.heatmap(self.dataframe)
        # This plot previously reused the bar-chart title.
        plt.title("Carte de corrélation des valeurs manquantes des données\n",
                  fontsize=18)

        plt.figure()
        msno.dendrogram(self.dataframe)

示例#14

0

显示文件

文件： MyFuncs.py 项目： anthropose/super-functions

def corrMissing(df, method):
    """
    Plot how the missingness of the columns of a DataFrame is related.

    Assess how strongly the presence or absence of one variable affects the
    presence or absence of another. For heatmaps, nullity correlations range
    from -1 (if one variable appears and the other does not) to 0 (variables
    appearing or not have no effect on one another) to 1 (if one variable
    appears the other also appears). Entries marked <1 or >-1 point to records
    in the dataset which may be erroneous. For dendrograms, variables are
    binned against one another by their nullity correlation (measured in terms
    of binary distance). Read the graph from top-down. Cluster leaves which
    are linked together at a distance of zero fully predict one another's
    presence - whether negatively or positively. Cluster leaves which split
    close to zero, but not at it, predict one another well, but not perfectly.
    These examples may indicate erroneous data, especially if those particular
    columns actually are or ought to match each other perfectly in nullity.
    See missingno documentation for more information.

    Parameters
    ----------
    df : pandas.DataFrame (or a subclass of it)
        Data to inspect. Anything else only prints a message.
    method : str
        "heatmap" or "dendrogram". A non-string only prints a message; any
        other string is ignored silently.

    Returns
    -------
    None
    """
    # isinstance instead of ``type(x) is T`` so DataFrame / str subclasses
    # are accepted as well.
    if not isinstance(df, pd.DataFrame):
        print("Not a Pandas dataframe")
        return
    if not isinstance(method, str):
        print("Method type requires string (i.e., heatmap, dendrogram)")
        return

    if method == "heatmap":
        msno.heatmap(df,
                     labels=True,
                     fontsize=8,
                     cmap="copper",
                     figsize=(10, 10))
    elif method == "dendrogram":
        msno.dendrogram(df,
                        orientation="right",
                        fontsize=8,
                        figsize=(10, 10))
    else:
        # unknown method names draw nothing, as before
        return

    plt.title("Missing Values Correlations")
    plt.show()

示例#15

0

显示文件

文件： plot.py 项目： LuisFelipeDutra/pandas-profiling

def missing_dendrogram(df):
    """Plot a missingno dendrogram

    Parameters
    ----------
    df: DataFrame
        The dataframe.

    Returns
    -------
    str
        The resulting image encoded as a string: a ``data:image/png;base64,``
        URI whose payload is additionally passed through ``quote``.
    """
    plot = msno.dendrogram(df)
    try:
        with BytesIO() as imgdata:
            # The URI prefix below promises PNG, so request it explicitly
            # rather than relying on the configured savefig default.
            plot.figure.savefig(imgdata, format='png')
            encoded = base64.b64encode(imgdata.getvalue())
    finally:
        # Release the figure even when rendering fails.
        plt.close(plot.figure)
    return 'data:image/png;base64,' + quote(encoded)

示例#16

0

显示文件

文件： viz_tests.py 项目： johnostrowski/missingno

 def test_method_dendrogram(self):
     """Draw the dendrogram of simple_df with method='single'; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame, method='single')
     figure = plt.gcf()
     return figure

示例#17

0

显示文件

文件： viz_tests.py 项目： johnostrowski/missingno

 def test_orientation_dendrogram(self):
     """Draw the dendrogram of simple_df with orientation='right'; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame, orientation='right')
     figure = plt.gcf()
     return figure

示例#18

0

显示文件

文件： viz_tests.py 项目： johnostrowski/missingno

 def test_simple_dendrogram(self):
     """Draw the dendrogram of simple_df with default options; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame)
     figure = plt.gcf()
     return figure

示例#19

0

显示文件

def explore_missing(dataframe, target=''):
    """
    Explore missing data with missingno: matrix, heatmap and dendrogram.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        dataframe with missing data. When ``target`` is given, an
        ``incomplete`` column is added to this very object: 0 for rows whose
        share of null cells is below 35 %, 1 otherwise.
    target : string, optional
        column name, target identifies some column which is used for
        classification analysis. The mean of the target column is reported
        separately for the more complete and the less complete rows.

    Output
    -------
    Images of the matrix (on a sample of at most 500 rows), the heatmap and
    the dendrogram are saved as PNG files.
    The missing-value ranking and, when ``target`` is given, the two target
    means are printed and appended to a text file.
    File locations come from the module-level names ``sourcefilepath`` and
    ``textfilename``.
    """

    total = dataframe.isnull().sum().sort_values(ascending=False)
    percent = (dataframe.isnull().sum() / dataframe.isnull().count() *
               100).sort_values(ascending=False)
    missing_data = pd.concat([total, percent],
                             axis=1,
                             keys=['Total', 'Percent'])
    # report at most the 40 columns at the top of the ranking
    missing_rank = missing_data.head(min(40, len(total)))
    print('missing rank', missing_rank)

    ratio_summary = None
    if target != '':
        # NOTE: this adds the 'incomplete' column to the caller's frame, and
        # the column is therefore part of the plots below.
        dataframe['incomplete'] = 1
        dataframe.loc[dataframe.isnull().sum(axis=1) /
                      dataframe.isnull().count(axis=1) * 100 < 35,
                      'incomplete'] = 0
        mean_c = np.mean(dataframe.loc[dataframe['incomplete'] == 0,
                                       target].values)
        mean_i = np.mean(dataframe.loc[dataframe['incomplete'] == 1,
                                       target].values)
        ratio_summary = (
            'default ratio for more complete: {:.2} \ndefault ratio for less complete: {:.2}'
            .format(mean_c, mean_i))
        print(ratio_summary)

    # NOTE(review): `inline` appears to have been dropped from newer missingno
    # releases -- confirm against the installed version.
    sample_size = min(dataframe.shape[0], 500)
    msno.matrix(dataframe.sample(sample_size),
                inline=False,
                sparkline=True,
                figsize=(20, 10),
                sort=None)
    plt.title('msno.matrix')
    # tight_layout was only referenced, never called, so it had no effect
    plt.tight_layout()
    plt.savefig(sourcefilepath + 'msno.matrix.png')

    # grow the heatmap with the number of columns
    scale = dataframe.shape[1] / 30 + 1
    fig_size = (20 * scale, 10 * scale)
    msno.heatmap(dataframe, fontsize=16, figsize=fig_size)
    plt.title('msno.heatmap')
    plt.tight_layout()
    plt.savefig(sourcefilepath + 'msno.heatmap.png')

    msno.dendrogram(dataframe,
                    inline=False,
                    fontsize=16,
                    figsize=(40, 20),
                    orientation='top')
    plt.title('msno.dendrogram')
    plt.tight_layout()
    plt.savefig(sourcefilepath + 'msno.dendrogram.png')

    with open(sourcefilepath + textfilename, "a+") as text_file:
        print('missing rank', missing_rank, file=text_file)
        if ratio_summary is not None:
            print(ratio_summary, file=text_file)

示例#20

0

显示文件

# NOTE(review): `n` and `df` are defined before this fragment; `n` is
# presumably the per-column count of missing values -- confirm.
n.plot.bar()

# Sort the values and draw them with barh so that the columns with the most
# missing values end up at the top.
# sort_values() without arguments sorts in ascending order.
n.sort_values().plot.barh(figsize=(7, 8))

# ### 6.2 Visualising missing values with missingno

import missingno as msno

msno.matrix(df)

# Show it as a heatmap. The closer the correlation is to 1, the stronger the
# positive correlation.
msno.heatmap(df)

msno.dendrogram(df)

# ### 7 Removing the columns that are not used

# #### 7.1 Removing the columns with too many missing values

# Use sort_values to take the 9 entries with the most missing values and store
# them in not_use.
# Extract only the index of not_use and store it in not_use_col.
not_use = n.sort_values(ascending=False).head(9)
not_use_col = not_use.index
not_use_col

# shape before and after dropping those columns
print(df.shape)
df = df.drop(not_use_col, axis=1)
print(df.shape)

示例#21

0

显示文件

文件： EDA.py 项目： Cubi123/Proyecto-Limabank

# Bar chart of the data, sorted in descending order, drawn with the seaborn
# "darkgrid" style and "paper" context.
with sns.axes_style("darkgrid"), sns.plotting_context("paper"):
    g = msno.bar(data,sort="descending",color="gray",labels=True)
    # rotate the column labels and move them inside the axes (negative pad)
    g.set_xticklabels(g.get_xticklabels(), rotation=90, horizontalalignment='center')
    plt.xticks(rotation=90, horizontalalignment='center', )
    plt.tick_params(axis="x",direction="in", pad=-60)
    plt.show()
    plt.close()

# %% codecell
# correlation of the missing values
g= msno.heatmap(data)
plt.show()

# %% codecell
# dendrogram of the missing-value correlations
msno.dendrogram(data)
plt.show()
plt.close()

# %% markdown
# # Correlation matrix
# %% codecell
# annotated heatmap of data.corr() on a fixed -1..1 colour scale
fig, (ax) = plt.subplots(1,1,figsize=(14,14))
sns.heatmap(data.corr(),
            ax = ax,
            vmin = -1, vmax = 1,
            cmap ="coolwarm",
            annot = True,
            fmt = ".1f",
            linewidths=.05,
            )

示例#22

0

显示文件

文件： house_mod.py 项目： sid573/Technothlon-Dat-A-vengers

def dendrogram(df, credits):
    """Charge 400 credits and draw the missingno dendrogram of ``df``.

    The frame is passed through ``df.sample`` with a sample size equal to its
    row count before plotting.

    Returns a tuple ``(result of msno.dendrogram, remaining credits)``.
    """
    price = 400
    remaining = credits - price
    resampled = df.sample(df.shape[0])
    return msno.dendrogram(resampled), remaining

示例#23

0

显示文件

 def test_orientation_dendrogram(self):
     """Draw the dendrogram of simple_df with orientation='right'; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame, orientation='right')
     figure = plt.gcf()
     return figure

示例#24

0

显示文件

 def test_simple_dendrogram(self):
     """Draw the dendrogram of simple_df with default options; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame)
     figure = plt.gcf()
     return figure

示例#25

0

显示文件

import missingno as msno

#%% Data
# NOTE(review): `missingno_data`, `np` and `pd` are not imported in this
# fragment; presumably they are imported earlier in the file -- confirm.
collisions = missingno_data.nyc_collision_factors()
# turn the literal string "nan" into a real NaN
collisions = collisions.replace("nan", np.nan)

#%%  Matrix
msno.matrix(collisions.sample(250))
#msno.matrix(busines_change)

#%% Time series
# random 50 x 20 boolean pattern; False cells become None (i.e. missing)
null_pattern = (np.random.random(1000).reshape((50, 20)) > 0.5).astype(bool)
null_pattern = pd.DataFrame(null_pattern).replace({False: None})
# index the 50 rows by month, January 2011 through February 2015
msno.matrix(null_pattern.set_index(
    pd.period_range('1/1/2011', '2/1/2015', freq='M')),
            freq='BQ')

#%% Bar Chart
msno.bar(collisions.sample(1000))

#%%  Heatmap
msno.heatmap(collisions)

#%% Dendrogram
msno.dendrogram(collisions)

#%%  Geoplot
# NOTE(review): geoplot may not be available in recent missingno releases --
# confirm against the installed version.
msno.geoplot(collisions, x='LONGITUDE', y='LATITUDE')

#%%

示例#26

0

显示文件

 def test_method_dendrogram(self):
     """Draw the dendrogram of simple_df with method='single'; return the current figure."""
     frame = self.simple_df
     msno.dendrogram(frame, method='single')
     figure = plt.gcf()
     return figure

示例#27

0

显示文件

文件： missing_no_graphs.py 项目： rcsjunior1987/Artificial-Intelligence

 def _print_dendrogram(self):
     """Draw the missingno dendrogram of this object and return the result.

     NOTE(review): ``self`` is handed straight to msno.dendrogram, so the
     enclosing class is presumably DataFrame-like -- confirm.
     """
     options = {"figsize": (10, 5), "method": "centroid", "fontsize": 10}
     return msno.dendrogram(self, **options)

示例#28

0

显示文件

文件： data_visulization.py 项目： SyHeee/capstonePro-Automatic_Data_Exploration-HYC

#missing data patterns in the dataset. Also, the sparkline on the right gives
#you a summary of the general shape of the data completeness and an indicator
#of the rows with maximum and minimum nullity.
# NOTE(review): `missingdata_df` is used as a column selector on merged_df;
# presumably it holds the names of the columns with missing data -- confirm.
msno.matrix(merged_df[missingdata_df])

### The missingno bar chart
#is a visualization of the data nullity. The y-axis is log transformed
#(log=True) to better visualize features with a very large number of missing
#values.
msno.bar(merged_df[missingdata_df], color="blue", log=True, figsize=(30, 18))

### The correlation heatmap
#describes the degree of nullity relationship between the different features.
#The range of this nullity correlation is from -1 to 1 (-1 ≤ R ≤ 1).
#Features with no missing value are excluded in the heatmap.
#If the nullity correlation is very close to zero (-0.05 < R < 0.05), no value
#will be displayed. A perfect positive nullity correlation (R=1) indicates
#that the first feature and the second feature both have corresponding missing
#values, while a perfect negative nullity correlation (R=-1) means that one of
#the features is missing whenever the other is not.
msno.heatmap(merged_df[missingdata_df], figsize=(20, 20))

### More fully correlate variable completion
#The dendrogram reveals trends deeper than the pairwise
#ones visible in the correlation heatmap.

msno.dendrogram(merged_df[missingdata_df], orientation='left')

### Quadtree nullity distribution
# (disabled)
#msno.geoplot(merged_df[missingdata_df], x='longitude', y='latitude')#, by='regionidzip',histogram=True)

示例#29

0

显示文件

文件： data_preprocessing.py 项目： hayano1/deep-learning

# Identify missing data (basic)
pd.isnull(dataset)

# Identify correlation between numerical variables
dataset.corr()  # Pearson correlation
dataset.corr('kendall')  # Kendall Tau correlation
dataset.corr('spearman')  # Spearman Rank correlation

# Identify missing data
# pip install missingno
import missingno as msno  # Provides a library of data missingness functions
#%matplotlib inline
msno.matrix(dataset)
msno.bar(dataset)
msno.heatmap(dataset)
msno.dendrogram(dataset)

# Separate dependent and independent variables (ensure dependent variable is the final column in the dataset)
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Handle missing data (If necessary)
# sklearn.preprocessing.Imputer is gone from current scikit-learn releases;
# SimpleImputer is its replacement (it imputes column-wise and treats NaN as
# missing by default). Fall back to the legacy class on old installations.
try:
    from sklearn.impute import SimpleImputer  # Imputes numerical variables
except ImportError:
    from sklearn.preprocessing import Imputer  # Legacy scikit-learn

    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    imputer = SimpleImputer(strategy='mean')
# columns 1 and 2 of x are replaced by their mean-imputed version
imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Encode categorical variables (If necessary)
from sklearn.preprocessing import LabelEncoder  # Encodes categorical variables
from sklearn.preprocessing import OneHotEncoder  # Converts categorical variables to dummy variables

示例#30

0

显示文件

文件： imputation.py 项目： mirzask/summer19

# 4. Matrix factorization

import numpy as np
import pandas as pd

# Load the Titanic data set straight from GitHub (requires network access).
titanic = pd.read_csv(
    "https://raw.githubusercontent.com/Geoyi/Cleaning-Titanic-Data/master/titanic_original.csv"
)

# Visualize missingness of a sample of 200 observations
# (sample() is called once per plot, so each plot gets its own sample)

import missingno as msno

msno.matrix(titanic.sample(200))
msno.bar(titanic.sample(200))
msno.dendrogram(titanic.sample(200))

# Heatmap shows how strongly the presence or absence of one variable affects the presence of another
msno.heatmap(titanic.sample(200))

# Create a dataset using only the numeric values
# (keeps every column whose dtype is not `object`)

titanic_numerics = titanic.loc[:, titanic.dtypes != object]

# Do we have any 'y' values that are NA?
titanic_numerics['survived'].isna().sum()

# Drop observations with missing 'y' values

titanic_numerics = titanic_numerics.loc[
    ~np.isnan(titanic_numerics['survived']), :]

示例#31

0

显示文件

文件： using-matplotlib-numpy-and-pandas-in-titanic.py 项目： nischalshrestha/automatic_wat_discovery

# NOTE(review): `full`, `train_df`, `pd` and `SURV` are defined before this
# fragment.
full[(full['Cabin'].str.contains('B2', na=False))]  # rows whose Cabin contains 'B2' (missing cabins count as no match)

# In[ ]:

full.isnull().sum()  # Count the missing values per column

# In[ ]:

# Plot the missing values
import missingno as msno

msno.matrix(full)

# In[ ]:

msno.dendrogram(full)

# In[ ]:

train_df.groupby(['Pclass', 'Sex'])['Survived'].sum()  # sum of Survived per (Pclass, Sex) group

# In[ ]:

# First character of the cabin string, plus its categorical code.
# astype(str) also turns a missing cabin into a string before slicing.
full['CabinType'] = full['Cabin'].astype(str).str[0]
full['_CabinType'] = pd.Categorical(full.CabinType).codes

# Same with the first two characters of the cabin string.
full['CabinType2'] = full['Cabin'].astype(str).str[0:2]
full['_CabinType2'] = pd.Categorical(full.CabinType2).codes

# Survival count / sum / mean per (Pclass, CabinType) over the first SURV rows
full[:SURV].groupby(['Pclass',
                     'CabinType'])['Survived'].agg(['count', 'sum', 'mean'])

示例#32

0

显示文件

文件： house.py 项目： sunanda629/Final_MLProject

    def missing_stats(self):
        """Print, plot and save an overview of the missing values in ``self.all``.

        Saves five PNG files below ``Figures/Missingness/``: the seaborn null
        mask of all feature columns, the missingno nullity-correlation
        heatmaps (whole frame, basement columns, garage columns) and the
        missingno dendrogram. When at least one feature column has missing
        values, a null-mask heatmap of those columns, a bar chart and
        count/percentage tables follow; these are not saved.
        """

        def save(axes, filename):
            # Write the figure that owns ``axes`` below Figures/Missingness/.
            axes.get_figure().savefig('Figures/Missingness/' + filename,
                                      transparent=True,
                                      dpi=400,
                                      bbox_inches='tight',
                                      format='png')

        # Basic Stats
        self.all.info()

        # Heatmap of the null mask of every feature column
        save(sns.heatmap(self.all.drop(['SalePrice'], axis=1).isnull(),
                         cbar=False),
             'missingness.png')

        # Feature columns with at least one missing value. 'SalePrice' is
        # excluded inside the filter: the previous list.remove('SalePrice')
        # raised ValueError whenever 'SalePrice' had no missing values.
        col_missing = [
            name for name in self.all.columns
            if name != 'SalePrice' and self.all[name].isnull().any()
        ]
        print(col_missing)

        if col_missing:
            # Plot the null mask (not the raw values, which need not be
            # numeric) on its own figure so the saved one is not overdrawn.
            plt.figure()
            sns.heatmap(self.all[col_missing].isnull(), cbar=False)

        save(msno.heatmap(self.all), 'NullityCorrelation.png')
        save(msno.heatmap(self.all[[
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath',
            'BsmtUnfSF', 'TotalBsmtSF'
        ]]), 'HeatMapBasement.png')
        # 'GarageFinish' used to be listed twice, which selected the same
        # column twice and duplicated it in the plot.
        save(msno.heatmap(self.all[[
            'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType',
            'GarageYrBlt'
        ]]), 'HeatMapGarage.png')
        save(msno.dendrogram(self.all), 'Dendrogram.png')

        # Bar chart
        if col_missing:
            # IF WE USE THIS WE SHOULD NORMALIZE IT TO PERCENTAGE?
            missing_counts = self.all[col_missing].isnull().sum()
            plt.figure(figsize=(12, 6))
            missing_counts.plot.bar(color='b')
            # Table: absolute counts, then percentage of all rows
            print(pd.DataFrame(missing_counts))
            print(missing_counts * 100 / self.all[col_missing].shape[0])

示例#33

0

显示文件

文件： titanic.py 项目： nischalshrestha/automatic_wat_discovery

# In[ ]:

import missingno as msn

# In[ ]:

# NOTE(review): `data` is defined before this fragment.
# Nullity matrix of the data
msn.matrix(data)

# In[ ]:

# Nullity-correlation heatmap
msn.heatmap(data)

# In[ ]:

# Dendrogram of the columns by missingness
msn.dendrogram(data)

# In[ ]:

# Summary of the frame
data.info()

# In[ ]:

# Number of missing values per column
data.isnull().sum()

# In[ ]:

# Drop the 'Cabin' column in place
data.drop('Cabin', axis=1, inplace=True)

# In[ ]: