Exemplo n.º 1
0
# In[ ]:


# Work on a separate frame in which every -1 is replaced by a real NaN so
# that missing-value tooling can detect it. DataFrame.replace returns a new
# object, so `train` itself is left untouched. (`np.NaN` was removed in
# NumPy 2.0; `np.nan` works on every version.)
train_copy = train.replace(-1, np.nan)


# Next, we can use the "Missingno" package, created by resident Kaggler [Aleksey Bilogur](https://www.kaggle.com/residentmario). It is a most useful and convenient tool for visualising missing values in a dataset, so check it out.

# In[ ]:


import missingno as msno
# Nullity or missing values by columns
msno.matrix(df=train_copy.iloc[:, 2:39], figsize=(20, 14), color=(0.42, 0.1, 0.05))


# As we can see, the missing values become much more apparent when we visualise them: the empty white bands (missing data) superposed on the vertical dark red bands (non-missing data) reflect the nullity of the data in that particular column. In this instance, we can observe that 7 of the 59 total features actually contain null values (although, as rightly pointed out by Justin Nafe in the comments section, there are really a grand total of 13 columns with missing values). This is because the missingno matrix plot can only comfortably fit approximately 40-odd features in one plot, after which some columns may be excluded, and hence the remaining 5 null columns have been excluded. To visualize all nulls, try changing the figsize argument as well as tweaking how we slice the dataframe.
# 
# For the 7 null columns that we are able to observe, they are hence listed here as follows:
# 
# **ps_ind_05_cat | ps_reg_03 | ps_car_03_cat | ps_car_05_cat | ps_car_07_cat | ps_car_09_cat | ps_car_14**
# 
# Most of the missing values occur in the columns suffixed with _cat. One should really take further note of the columns ps_reg_03, ps_car_03_cat and ps_car_05_cat. Evinced from the ratio of white to dark bands, it is very apparent that a big majority of values are missing from these 3 columns, and therefore a blanket replacement of -1 for the nulls might not be a very good strategy.

# **Target variable inspection**
# 
# Another standard check normally conducted on the data is with regards to our target variable, where in this case, the column is conveniently titled "target". The target value also comes by the moniker of class/label/correct answer and is used in supervised learning models along with the corresponding data that is given (in our case all our train data except the id column) to learn the function that best maps the data to our target in the hope that this learned function can generalize and predict well with new unseen data.

# In[ ]:
print(draw)
draw.plot(kind='bar', color=['r', 'b'], label='Survived')
plt.legend(['0', '1'])
print(
    "The family size has a considerable impact on our outcome whether family")

# Treat empty strings as missing (a real NaN, not the literal text "NAN",
# which fillna would not touch) and fill the gaps with 'S'. Assigning the
# result back avoids in-place modification through attribute access, which
# is not guaranteed to write through to `train`.
train['Embarked'] = train['Embarked'].replace("", np.nan).fillna('S')
train['Embarked'].isnull().sum()  # check

train['Age'].hist(bins=10)

# NOTE: a single random integer in [20, 30] is drawn once and used for every
# missing age, so all imputed rows receive the same value.
train['Age'] = train['Age'].replace("", np.nan).fillna(
    np.random.randint(20, 31))
train['Age'].isnull().sum()
msno.matrix(train)

# The title is the capitalised word followed by a dot that comes after a
# space in the name. Names without such a word yield NaN instead of raising.
train['Title'] = train['Name'].str.extract(r' ([A-Z][a-z]+)\.', expand=False)
sns.countplot(x='Title', data=train)
plt.xticks(rotation=45)

# Merge spelling variants, then group the remaining listed titles.
train['Title'] = train['Title'].replace({
    'Mlle': 'Miss',
    'Mme': 'Mrs',
    'Ms': 'Miss'
})
train['Title'] = train['Title'].replace([
    'Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt',
    'Countess', 'Jonkheer'
], 'Special')
Exemplo n.º 3
0
def visualize(df_train, df_labels):
    """Print summary tables and draw exploratory plots for the training data.

    Args:
        df_train: DataFrame of features. Besides the generic summaries, the
            plots read the columns ``construction_year``, ``longitude``,
            ``latitude``, ``population``, ``gps_height``, ``num_private``,
            ``amount_tsh``, ``permit``, ``public_meeting``, ``source_class``
            and the categorical columns handed to ``plot_tables`` and
            ``plot_proportions``.
        df_labels: DataFrame holding the ``status_group`` label column. It is
            concatenated column-wise with ``df_train`` (assumes both share
            the same index -- TODO confirm with the caller).

    Returns:
        None. All output is printed or shown through matplotlib.
    """
    print(df_train.head().T)
    print(df_train.info())
    msno.matrix(df_train)

    # Numerical features
    print(df_train.describe())

    # Categorical (object dtype) features: cardinality and share of missings
    cat_df = pd.DataFrame(columns=["Feature", "Cardinality", "% Missings"])
    total_cardinality = 0
    row = 0
    for col in df_train.columns:
        # The builtin `object` replaces the removed `np.object` alias.
        if df_train[col].dtype == object:
            n_rows = len(df_train[col])
            cardinality = len(df_train[col].unique())
            total_cardinality += cardinality
            # count() ignores missing entries, so the difference is the
            # number of missing values.
            pct_of_missing_values = float(
                (n_rows - df_train[col].count()) / n_rows)
            cat_df.loc[row, "Feature"] = col
            cat_df.loc[row, "Cardinality"] = cardinality
            cat_df.loc[row, "% Missings"] = pct_of_missing_values * 100
            row += 1

    print("Total cardinality of categorical features:", total_cardinality)
    print(cat_df)

    # Visualizations
    data_viz = pd.concat([df_train, df_labels['status_group']], axis=1)
    has_year = data_viz['construction_year'] > 0

    # Label distribution
    plt.figure(figsize=(14, 7))
    sns.countplot(x='status_group', data=data_viz, palette="Greens_d")
    plt.show()

    # Construction year distribution (rows with year 0 are left out)
    plt.figure(figsize=(14, 7))
    sns.distplot(data_viz['construction_year'][has_year])
    plt.show()

    # Water pump geographical distribution with population-proportional
    # circles and a colour bar for the construction year. Rows with year 0,
    # non-positive longitude or non-negative latitude are left out. A single
    # combined mask is used, and the marker sizes are taken from the filtered
    # frame so that they line up with the plotted points.
    geo_data = data_viz[(data_viz['longitude'] > 0)
                        & (data_viz['latitude'] < 0)
                        & has_year]
    geo_data.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
                  s=geo_data["population"] / 10, label="population",
                  figsize=(14, 10), c="construction_year",
                  cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
    plt.legend()
    plt.show()

    # Correlation heatmap of the numerical features. Non-numeric columns are
    # excluded explicitly instead of relying on corr() to drop them.
    cor = data_viz.select_dtypes(include=[np.number, 'bool']).corr()
    plt.figure(figsize=(14, 13))
    sns.heatmap(cor, square=True, annot=True, cbar=False)
    plt.show()

    # Boxplot of label distribution by pump construction year
    plt.figure(figsize=(14, 7))
    sns.boxplot(x='status_group',
                y="construction_year",
                data=data_viz[has_year])
    plt.show()

    # The same relation shown with violin plots
    plt.subplots(figsize=(14, 12))
    sns.violinplot(x='status_group', y="construction_year",
                   data=data_viz[has_year], split=True)
    plt.show()

    # Mosaic plots of three categorical features per label
    mosaics = (
        ('permit', "Permit distribution per label"),
        ('public_meeting', "Public meeting distribution"),
        ('source_class', "Source class distribution per label"),
    )
    for column, title in mosaics:
        _, axes = plt.subplots(nrows=1, ncols=1, figsize=(16, 8))
        mosaic(data_viz, ['status_group', column], axes, title=title)
        plt.show()

    # Bar charts of some relevant categorical features per label
    variables = ['quantity', 'payment', 'source_type', 'waterpoint_type']
    label = 'status_group'
    plot_tables(data_viz, label, variables)
    plt.show()

    # Value distribution for some important features with low cardinality
    variables = ['basin', 'extraction_type_class', 'management',
                 'management_group', 'water_quality', 'source',
                 'source_class']
    plot_proportions(data_viz, variables)

    # Height distribution per label (only rows with a positive gps_height)
    plt.figure(figsize=(14, 10))
    positive_height = data_viz['gps_height'] > 0
    for status, colour in (('functional', 'g'),
                           ('non functional', 'r'),
                           ('functional needs repair', 'y')):
        heights = data_viz.loc[
            positive_height & (data_viz['status_group'] == status),
            'gps_height']
        sns.kdeplot(heights, shade=True, color=colour, label=status)
    plt.show()

    # Pair plot of selected numerical features, coloured by label value
    sns.set(style="ticks")
    sns.pairplot(
        data_viz[['population', 'num_private', 'amount_tsh', 'status_group']],
        hue="status_group", diag_kind="kde")
    plt.show()
Exemplo n.º 4
0

# Survey schema (one row per question)
question = pd.read_csv('C:\\Users\\Lenovo\\Downloads\\data\\schema.csv')
print(question.shape)

print(question.tail(10))

# MultipleChoiceQuestions

mcq = pd.read_csv('C:\\Users\\Lenovo\\Downloads\\data\\multipleChoiceResponses.csv',
                  encoding="ISO-8859-1", low_memory=False)

print(mcq.shape)

print(mcq.head(10))

# nan data visualization - missingno
import missingno as msno
# Draw first, then show: plt.show() takes no artist argument, its only
# parameter is `block`, so the Axes must not be passed to it.
msno.matrix(mcq, figsize=(12, 5))
plt.show()


# SurveyStatics
# 1. Gender

print(sns.countplot(y='GenderSelect', data=mcq))
# sns.countplot(y="column", data=<DataFrame to use>)

# 2. Country
con_df = pd.DataFrame(mcq['Country'].value_counts())
print(con_df)
Exemplo n.º 5
0
import numpy as np
import pandas as pd
import missingno as msno
import seaborn as sn
import matplotlib.pyplot as plt

# Reading the data
data = pd.read_csv('winequality-white.csv', sep=';')

# Missing data detection
msno.matrix(data, figsize=(10, 3))

# Distribution
fig, axes = plt.subplots(nrows=2, ncols=1)
fig.set_size_inches(10, 20)
sn.boxplot(data=data, orient="v", ax=axes[0])
# `orient` only accepts a vertical/horizontal indicator; "pH" is not a valid
# value, so the vertical orientation used for the first plot is applied.
sn.boxplot(data=data, y="quality", orient="v", ax=axes[1])

# Correlation analysis
corrMatt = data.corr()
# Hide the strict upper triangle and keep the diagonal and lower triangle.
# An explicit boolean mask is used rather than a copy of the correlation
# values, which would leave any cell whose correlation is exactly 0 visible.
mask = np.triu(np.ones(corrMatt.shape, dtype=bool), k=1)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
sn.heatmap(corrMatt, mask=mask, vmax=.8, square=True, annot=True)
Exemplo n.º 6
0
 def test_freq_matrix(self):
     """Draw the nullity matrix of ``self.freq_df`` with ``freq='BQ'``.

     Returns the current matplotlib figure.
     """
     frame = self.freq_df
     msno.matrix(frame, freq='BQ')
     figure = plt.gcf()
     return figure
Exemplo n.º 7
0
import seaborn as sns
from statistics import mode 
import matplotlib.pyplot as plt
import numpy as np
from datetime import date


trab = pd.read_excel("C:\\Users\\eduar\\Downloads\\GroupDatasets\\dataset.xlsx")

#----------------------------------------------------------------------------------------
#STEP 1
#----------------------------------------------------------------------------------------

# draw a chart showing the missing values
import missingno as msno
msno.matrix(trab, figsize=(12, 5))


# fill missing values in the numeric columns with the column mean;
# numeric_only=True keeps mean() from failing on non-numeric columns
trab = trab.fillna(trab.mean(numeric_only=True))


#----------------------------------------------------------------------------------------
#STEP 2
#----------------------------------------------------------------------------------------

# Creating new variables

#  1 - total amount spent by each customer
spend_columns = ['MntAcessories', 'MntClothing', 'MntBags', 'MntAthletic',
                 'MntShoes']
# skipna=False keeps the `+` semantics: a NaN in any part gives a NaN total
trab['MntTotal'] = trab[spend_columns].sum(axis=1, skipna=False)
# Enforce the valid maximum of the rating column
# Rows whose rating exceeds 5.0
above_max = airbnb['rating'] > 5.0
airbnb[above_max]
airbnb[above_max]['rating']
# Remove those rows in place
rows_to_drop = airbnb[above_max].index
airbnb.drop(rows_to_drop, inplace=True)
# airbnb['rating'] = airbnb[airbnb['rating'] > 5.0].replace(5)
# Plot the rating distribution again
sns.distplot(airbnb['rating'], bins=20)
plt.show()
# Largest remaining rating
airbnb['rating'].max()

# Dealing with missing data
# Nullity matrix in the original row order
msno.matrix(airbnb)
plt.show()
# Nullity matrix with rows sorted by rating
sorted_by_rating = airbnb.sort_values(by='rating')
msno.matrix(sorted_by_rating)
plt.show()
# Per-column completeness bars
msno.bar(airbnb)
plt.show()
# Summary statistics of the rows where rating is missing ...
rating_missing = airbnb['rating'].isna()
airbnb[rating_missing].describe()
# ... and of the rows where rating is present
airbnb[~rating_missing].describe()

# Impute missing data
airbnb = airbnb.fillna({
    'reviews_per_month': 0,
Exemplo n.º 9
0
        st_time = time.time()
        while np.where(masks == 0)[0].shape[0] < miss_size:
            coordi_x = np.random.randint(0, masks.shape[1])
            coordi_y = np.random.randint(0, masks.shape[0])
            burst_len = np.random.randint(options.burst_min, options.burst_max)
            judge_res = (masks[coordi_y:coordi_y + burst_len, coordi_x] == [1])
            if judge_res.all() == True:  #如果都是1,即都没有缺失或人工缺失
                data_noisy[coordi_y:coordi_y + burst_len, coordi_x] = np.nan
                masks[coordi_y:coordi_y + burst_len, coordi_x] = 0
        np.save('./coalmill-mask/mask_{}.npy'.format(name_list[j]), masks)
        print('Save mask success.')
        en_time = time.time()
        print('Successful masking, time cosumed: {:.2f}s'.format(en_time -
                                                                 st_time))

    msno.matrix(pd.DataFrame(data_noisy[:5000, :]), labels=False)
    plt.savefig('visual/matrix_{}.pdf'.format(name_list[j]),
                dpi=300,
                bbox_inches='tight')
    msno.matrix(pd.DataFrame(data_ground[:5000, :]), labels=False)
    plt.savefig('visual/matrix_{}_origin.pdf'.format(name_list[j]),
                dpi=300,
                bbox_inches='tight')
    # 储存最终结果
    list_final = []
    data_noisy = scaler.transform(data_noisy)
    data_ground = scaler.transform(data_ground)
    #### 减少数据量
    small_or_not = 'medium'  # ? normal, medium, small
    if small_or_not == 'small':
        # ! 为了测试专用,加快加载速度
Exemplo n.º 10
0
# In[ ]:

# First 20 rows of DimDf (only displayed when run as a notebook cell)
DimDf.head(20)

# In[ ]:

# Column dtypes of DimDf
print(DimDf.dtypes)

# In[ ]:

# NOTE(review): pivot_ui is presumably the interactive pivot-table widget
# from pivottablejs -- confirm against the notebook's imports.
pivot_ui(DimDf)

# In[ ]:

# Render figures inline in the notebook, then draw the nullity matrix of Df
get_ipython().run_line_magic('matplotlib', 'inline')
msno.matrix(Df)

# In[288]:

# Pairwise relationships between the columns of Df
sns.pairplot(Df)

# In[ ]:

# Same pair plot, coloured by the "Day" column
sns.pairplot(Df, hue="Day")

# In[ ]:

# Pair plot of DimDf, coloured by the "DeviceName" column
sns.pairplot(DimDf, hue="DeviceName")

# # Feature selection methods ( based on importance for ML)
Exemplo n.º 11
0
#Missing data part
print("Number of missing values per feature")
# Missing entries per column (in column order) divided by numSamples.
# Counting with the vectorised isna().sum() avoids iterating every cell in
# Python, which is what the builtin sum() over a Series does.
missingValueShare = (features.isna().sum() / numSamples).tolist()

#Print missing value graph
vis.paintBarChartForMissingValues(features.columns, missingValueShare)

# In[30]:

#Visualize missing data with missingno
import missingno as msno
get_ipython().run_line_magic('matplotlib', 'inline')
msno.matrix(features)

# In[31]:

# The nullity correlation heatmap is only drawn when something is missing
if features.isnull().values.any():
    msno.heatmap(features)

# #### View Prepared Binary Features
#
# We need some more plots for the binary data types.

# In[32]:

#vis.plotBinaryValues(df_dig, df_dig.columns) #0:-1
#plt.savefig(image_save_directory + "/BinaryFeatures.png", dpi=70)
Exemplo n.º 12
0
# Running this cell (click run or press Shift+Enter) lists the files in the input directory

import os
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.
# Three columns holding 1..20; column2 and column3 have NaN at the listed positions
_values = list(range(1, 21))
_missing_in_column2 = {5, 9, 11, 17, 19}
_missing_in_column3 = {5, 14, 19}
dictionary = {
    "column1": _values,
    "column2": [np.nan if v in _missing_in_column2 else v for v in _values],
    "column3": [np.nan if v in _missing_in_column3 else v for v in _values],
}
# Build the data frame from the dictionary
data_missingno = pd.DataFrame(dictionary)

# missingno nullity matrix
import missingno as msno
msno.matrix(data_missingno)
plt.show()

# missingno bar plot
msno.bar(data_missingno)
plt.show()
# Load the iris data and drop the identifier column
data = pd.read_csv('../input/Iris.csv').drop(['Id'], axis=1)
# Parallel-coordinates plot, one line per row, coloured by species
plt.figure(figsize=(15, 10))
species_colours = plt.get_cmap("Set1")
parallel_coordinates(data, 'Species', colormap=species_colours)
plt.title("Iris data class visualization according to features (setosa, versicolor, virginica)")
plt.xlabel("Features of data set")
plt.ylabel("cm")
plt.savefig('graph.png')
# Columns shown in the quick preview below
columns = [
    "Location",
    "name",
    "Date",
    "Result",
    "Belligerents.allies",
    "Belligerents.axis",
    "Casualties and losses.allies",
    "Casualties and losses.axis",
]

# %%
battles[columns].head(3)

# %%
msno.matrix(battles, labels=True, sparkline=False)

# %%
# Rows in which both Date and Location are missing
mask = battles[["Date", "Location"]].isnull().all(axis=1)

# %%
print(battles.loc[mask, ["name", "url"]].to_string())

# %%
# Keep only rows that have both a Date and a Location
battles = battles.dropna(subset=["Date", "Location"])

# %%
# Two groups of digits and dots separated by "; ", following "/ ".
# Inside a character class "|" is a literal pipe, not an alternation, so it
# is left out of the class.
pattern = r"/ ([\d.]+); ([\d.]+)"

# %%
battles.head(10).Location.str.extract(pattern)
Exemplo n.º 14
0
import pandas as pd
import matplotlib.pyplot as plt  #plot data
import seaborn as sns  #plot data
import missingno as ms  #plot missing data
"""2. Data Cleaning"""

# Candidate data sources; only url2 is read below (url and url1 are unused
# in the visible part of the script).
url = 'https://raw.githubusercontent.com/nachi-hebbar/Forest-Fire-Prediction-Website/master/Forest_fire.csv'
url1 = 'https://raw.githubusercontent.com/hiyabose/Depression/master/depressed.csv'
url2 = 'https://raw.githubusercontent.com/hiyabose/Depression/master/newsurvey.csv'
df = pd.read_csv(url2)

# Quick look at the first rows, column dtypes and non-null counts
df.head()

df.info()

# Nullity matrix of the whole frame
ms.matrix(df)

# Column-wise maxima, summary statistics and (rows, columns)
df.max()

df.describe()

df.shape

# NOTE(review): the x column name " Risk" starts with a space -- presumably
# it matches the CSV header exactly; confirm against the data.
sns.swarmplot(y="Age", x=" Risk", data=df)
plt.show()
"""Here also we can see that the majority are depressed in their mid life."""

from sklearn.linear_model import LogisticRegression

from sklearn import svm
Exemplo n.º 15
0
 def test_no_sparkline_matrix(self):
     """Draw the nullity matrix of ``self.simple_df`` without the sparkline.

     Returns the current matplotlib figure.
     """
     frame = self.simple_df
     msno.matrix(frame, sparkline=False)
     figure = plt.gcf()
     return figure
Exemplo n.º 16
0
def _export_plot(filename, placeholder_fig):
    """Save the current figure as ``Plots/<filename>``, close it and open the file.

    ``placeholder_fig`` is the figure created at the top of ``plot_us``. It
    is closed as well, so plot types that draw on a figure of their own
    (jointplot, missingno) do not leave it open.
    """
    target = os.path.join('Plots', filename)
    plt.savefig(target)
    plt.close(plt.gcf())
    plt.close(placeholder_fig)  # no-op when it was the current figure
    # os.startfile is only available on Windows.
    os.startfile(target)


def plot_us():
    """Draw the plot selected in the GUI, save it as a PDF and open it.

    The plot type is read from ``type_combo``, the variables from ``var_x``
    and ``var_y``, the grouping column from ``var_by`` (the string 'None'
    means no grouping) and the palette from ``combo_palette``. All plot
    types except the missing-data ones use ``data`` with incomplete rows
    dropped. Output goes to the ``Plots`` directory. An unrecognised plot
    type draws nothing.

    Returns:
        None.
    """
    fig, ax = plt.subplots(1, 1)
    by = var_by.get()
    if by == 'None':
        by = None

    data_dropped_na = data.dropna()
    plot_type = type_combo.get()

    if plot_type == 'Histogram':
        g = sns.distplot(data_dropped_na[var_x.get()], rug=True,
                         rug_kws={'color': '#777777', 'alpha': 0.2},
                         hist_kws={'edgecolor': 'black', 'color': '#6899e8',
                                   'label': 'розподіл'},
                         kde_kws={'color': 'black', 'alpha': 0.2,
                                  'label': 'ядрова оцінка густини'})
        sns.despine(left=True, bottom=True)  # remove the axes entirely
        g.set_xlabel(var_x.get(), color='black', fontsize=15, alpha=0.5)
        g.set_ylabel('Густина', color='black', fontsize=15, alpha=0.5)
        plt.legend(loc='upper right')
        filename = 'hist.pdf'

    elif plot_type == 'Scatter plot':
        # x/y are passed by keyword: current seaborn rejects positional use.
        a = sns.jointplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                          kind='reg', color='#5394d6',
                          annot_kws={'fontsize': 14, 'loc': [-0.1, 0.85]},
                          marginal_kws={'rug': True, 'bins': 25,
                                        'hist_kws': {'edgecolor': 'black'}},
                          joint_kws={'scatter_kws': {'alpha': 0.7}})
        plt.setp(a.ax_marg_x.patches, linewidth=1.0, color='#a9c8e8')
        plt.setp(a.ax_marg_y.patches, linewidth=1.0, color='#a9c8e8')
        a.ax_joint.set_xlabel(var_x.get(), fontsize=15, alpha=0.7)
        a.ax_joint.set_ylabel(var_y.get(), fontsize=15, alpha=0.7)
        filename = 'scatter.pdf'

    elif plot_type == 'Bar plot':
        ax = sns.barplot(x=var_x.get(), y=var_y.get(), hue=by,
                         data=data_dropped_na, palette=combo_palette.get(),
                         errcolor='0.4', errwidth=1.1)
        ax.set_ylabel('Середнє значення ' + var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc=[0.8, 0.9])
        sns.despine()
        filename = 'barplot.pdf'

    elif plot_type == 'Count bar':
        ax = sns.countplot(x=var_x.get(), hue=by, data=data_dropped_na,
                           palette=combo_palette.get())
        ax.set_ylabel('Кількість', color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc=[0.8, 0.9])
        sns.despine()
        filename = 'countbar.pdf'

    elif plot_type == 'Boxplot':
        ax = sns.boxplot(x=var_x.get(), y=var_y.get(), data=data_dropped_na,
                         hue=by, width=0.4, palette=combo_palette.get())
        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc='upper right')
        sns.despine()
        filename = 'Boxplot.pdf'

    elif plot_type == 'Violin plot':
        ax = sns.violinplot(x=var_x.get(), y=var_y.get(),
                            data=data_dropped_na, hue=by, scale='count',
                            split=True, palette=combo_palette.get())
        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        plt.legend(loc='upper right')
        sns.despine()
        filename = 'violin.pdf'

    elif plot_type == 'Beeswarm plot':
        ax = sns.swarmplot(x=var_x.get(), y=var_y.get(),
                           data=data_dropped_na, hue=by, alpha=0.7,
                           palette=combo_palette.get())

        # Overlay a short horizontal line at the mean of each category.
        mean_width = .5
        for tick, text in zip(ax.get_xticks(), ax.get_xticklabels()):
            sample_name = text.get_text()
            in_category = data_dropped_na[var_x.get()] == sample_name
            mean_val = data_dropped_na[in_category][var_y.get()].mean()
            ax.plot([tick - mean_width / 2, tick + mean_width / 2],
                    [mean_val, mean_val], lw=2, color='#777777')

        ax.set_ylabel(var_y.get(), color='#666666')
        ax.set_xlabel(var_x.get(), color='#666666')
        sns.despine()
        filename = 'beeswarm.pdf'

    elif plot_type in ('Missing data with matrix', 'Missing data with bars'):
        # Wide frames get a larger canvas; frames with 500 or more rows are
        # reduced to a random sample of 500 rows.
        figsize = (30, 27) if len(data.columns) > 10 else (25, 10)
        shown = data if len(data) < 500 else data.sample(500)
        if plot_type == 'Missing data with matrix':
            ax = missingno.matrix(shown, inline=False, figsize=figsize)
            filename = 'missing matrix.pdf'
        else:
            ax = missingno.bar(shown, inline=False, figsize=figsize)
            filename = 'missing bars.pdf'

    elif plot_type == 'Missing data correlations':
        ax = missingno.heatmap(data, inline=False, figsize=(25, 25))
        filename = 'missing correlations.pdf'

    else:
        # Unknown plot type: nothing to draw, release the placeholder figure.
        plt.close(fig)
        return

    _export_plot(filename, fig)
Exemplo n.º 17
0
 def test_color_matrix(self):
     """Draw the nullity matrix of ``self.simple_df`` in a custom RGB colour.

     Returns the current matplotlib figure.
     """
     rgb = (70 / 255, 130 / 255, 180 / 255)
     msno.matrix(self.simple_df, color=rgb)
     figure = plt.gcf()
     return figure
Exemplo n.º 18
0
"""
Created on Sat May  9 19:49:30 2020

@author: Surraj
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Load both CSV files from the working directory; note that the training
# data is bound to the name `df`.
test = pd.read_csv('test.csv')
df = pd.read_csv('train.csv')

# Nullity matrices of the training and the test frame
msno.matrix(df, figsize=(12, 5))
msno.matrix(test, figsize=(12, 5))


def null(df):
    null_value = df.isnull().sum()
    per_null = 100 * df.isnull().sum() / len(df)

    unique = pd.DataFrame(columns=['unique'])
    for i in df.columns:
        nunique = df[i].nunique()
        unique.loc[i] = [nunique]

    miss_val = pd.concat([null_value, per_null, unique], 1)

    miss_val_rename = miss_val.rename(columns={
Exemplo n.º 19
0

# Drop a column
# NOTE: drop() returns a new frame; the result is not assigned here, so `df`
# keeps the column (the value is only displayed when run as a notebook cell).
df.drop("PassengerId", axis = 1)


# Drop columns where there are greater than 10 missing values
# (thresh is the minimum number of non-null values a column needs to be
# kept; like above, the result is not assigned)
df.dropna(axis = 1, thresh = len(df)-10)


# Compute number of missing values in a column
df['Cabin'].isnull().sum()



# Return all rows where 'Cabin' has a value (i.e. non-null)
df[df['Cabin'].notnull()]










# Use missingno to draw the nullity matrix, then show the figure

msno.matrix(df); plt.show()
# # Nullity analysis

# In[ ]:

import missingno as msno

# In[ ]:

# Completeness bars for a random sample of 890 rows
msno.bar(dftrain.sample(890))

# # Now we look at the nullity correlation between Age, Cabin and Embarked

# In[ ]:

# Nullity matrix of the whole frame
msno.matrix(dftrain)

# In[ ]:

# Pairwise nullity correlation between columns
msno.heatmap(dftrain)

# In[ ]:

# Hierarchical clustering of columns by nullity
msno.dendrogram(dftrain)

# # Our finding is that Cabin and Age values tend to be null together, whereas for Embarked it is the reverse

# # from this we are concluding a fact that only 38.8 % people survived , and even most young people died in this disaster about age of 30

# # Now a pie chart percentage of Categories of people travelling survived
        {col: 'max'
         for col in data_cols}))


players = get_subgroup(data, player_index, player_cols)

# Relationship between players and referees

dyad_index = ['refNum', 'playerShort']
dyad_cols = [
    'games',
    'victories',
    'ties',
    'defeats',
    'goals',
    'yellowCards',
    'yellowReds',
    'redCards',
]
dyads = get_subgroup(data, dyad_index, dyad_cols)

# 3. Handling missing values

msno.matrix(players.sample(1000), labels=True)  # nullity matrix
msno.bar(players.sample(1000))  # completeness bars
msno.heatmap(players.sample(1000))  # nullity correlation heatmap
msno.dendrogram(players.sample(1000))  # nullity dendrogram

# Keep only the players for whom both rater columns are present. The
# previous expression `['rater1'].notnull()` called a method on a plain list
# and raised AttributeError.
# NOTE(review): dropping rows is assumed to be the intent -- confirm.
players = players.dropna(subset=['rater1', 'rater2'])
Exemplo n.º 22
0
def main():
    """Render the Streamlit starter app.

    Shows the introductory text, a random sample of the dataset chosen in
    the sidebar, and three buttons: a data profiling report, missing-value
    plots, and a run of the ``e2e_pipeline`` Prefect flow whose results are
    displayed on success.

    Streamlit API reference:
    https://docs.streamlit.io/en/stable/api.html
    """

    # Configures the default settings
    st.set_page_config(page_title='datathon-starter',
                       page_icon='🛠️',
                       layout='wide')

    # Page title and header
    st.title('🛠️📊')
    st.title('Starter code for data applications')
    st.subheader('MIT License')
    st.markdown("""
        ---
        🙌 Build your own data app

        Modify pre-existing code and implement empty functions:\n
        1. Data tasks are found in `server/tasks.py`
        2. Data workflows are found in `server/pipeline.py`
        3. The Streamlit app's UI code is found in `app.py`
        ---
        🚀 Try a quick example

        From the sidebar *(click on > if closed)*:\n
        1. Select a dataset
        2. Select all categorical variables in the multiselect widget
        3. Select an endogenous variable in the chosen dataset

        From the main UI below:\n
        4. Press the "Run workflow" button
        ---
        """)

    # Example app
    params = sidebar()  # Display sidebar in Streamlit app
    # Remove `data` and the dataset `item` code from the parameters
    data = params.pop('data')
    item = params.pop('item')
    title = DATASET_TITLES[item]
    st.subheader(f'{title}')
    st.text('A random sample of 5 rows:')
    # Never request more rows than exist: sample() raises otherwise.
    # (assumes `data` is a pandas DataFrame -- it is passed to pd.notna and
    # missingno below)
    st.table(data.sample(min(5, len(data))))

    # Column container for buttons. `beta_columns` was the pre-release name
    # of `columns` and is gone from current Streamlit; prefer the stable
    # name and fall back for old installations.
    make_columns = getattr(st, 'columns', None) or st.beta_columns
    col1, col2, col3 = make_columns(3)
    # Data profiling
    if col1.button('🔬 Data profiling report'):
        profile_report = ProfileReport(data, explorative=True)
        st_profile_report(profile_report)
    # Missing value analysis
    if col2.button('🔎 Missing value plots'):
        # Check if there are any missing values
        if pd.notna(data).all().all():
            st.warning('No missing values in dataset')
        else:
            for draw in (msno.matrix, msno.heatmap, msno.dendrogram):
                st.pyplot(draw(data).get_figure())
    # Run data workflow
    if col3.button('✨ Run workflow!'):
        st.write('---')
        # Stop execution until a valid endogenous variable is selected
        if not params.get('endog'):
            st.warning('Please select an endogenous variable')
            st.stop()
        flow_name = 'e2e_pipeline'
        project_name = 'datathon-starter'
        task_refs = ['wrangle_na']
        flow_params = {
            'url': params.get('url'),
            'sep': params.get('sep'),
            'strategy': params.get('na_strategy')
        }
        results, state_msg = create_prefect_flow_run(flow_name, project_name,
                                                     task_refs, flow_params)
        # Check if all tasks were successfully executed
        if 'fail' in state_msg:
            # List of each state's (name, state message) in the workflow
            st.warning(state_msg)
            # The link needs an explicit scheme, otherwise the browser
            # treats "localhost:8080" as a relative or unknown-scheme URL.
            st.info('Please view the Flow logs on the Prefect Server\'s'
                    ' [UI](http://localhost:8080).')
        # If all tasks were successfully executed
        else:
            # Unpack results
            preprocessed_data, conf_int_chart = results
            # Success!
            st.balloons()
            st.success(state_msg)
            # Retrieve results from prefect flow run
            st.subheader('Pre-processed Data')
            st.dataframe(preprocessed_data)
            st.subheader('Regression Results')
            st.text('Dot and whisker plot of coefficients'
                    ' and their confidence intervals:')
            # Plot regression coefficient's confidence intervals
            st.altair_chart(conf_int_chart, use_container_width=True)
#Préparation des données

#Chargement des packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import missingno as msno

# Load the data
dataset = pd.read_csv("hcvdat0.csv")

# Visualise the missing data
msno.matrix(dataset)

# Column 1 becomes y, columns 2.. become x
y = dataset.iloc[:, 1].values
x = dataset.iloc[:, 2:].values

# Handle the missing data: mean imputation on x, from its third column up to
# (but excluding) the last one
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, 2:-1] = imputer.fit_transform(x[:, 2:-1])

# Concatenate both arrays to produce a CSV file with the completed data
z = np.column_stack((y, x))

#Généré fichier csv
entetes = [
    u'Category', u'Age', u'Sex', u'ALB', u'ALP', u'ALT', u'AST', u'BIL',
    u'CHE', u'CHOL', u'CREA', u'GGT', u'PROT'
Exemplo n.º 24
0
# > Values of -1 indicate that the feature was missing from the observation.
#
# So, we need to find null value by finding '-1' value.

# ## 2.2 Find Null data
# We need to find some features containing null data.<br>
# reference: [Anisotropic's work](https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial)

# In[ ]:

import missingno as msno

# Replace the -1 marker by a real NaN so that the null checks below can see
# it. replace() returns a new frame, so `train` / `test` stay unchanged.
# (`np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.)
train_null = train.replace(-1, np.nan)

msno.matrix(df=train_null, figsize=(20, 14), color=(0.8, 0.5, 0.2))

# In[ ]:

test_null = test.replace(-1, np.nan)

msno.matrix(df=test_null, figsize=(20, 14), color=(0.8, 0.5, 0.2))

# In[ ]:

# Extract columns with null data
train_null = train_null.loc[:, train_null.isnull().any()]
test_null = test_null.loc[:, test_null.isnull().any()]

print(train_null.columns)
"""## 1.2 ) Reading the data from a CSV file"""

df = pd.read_csv(r'WA_Fn-UseC_-HR-Employee-Attrition.csv')

df.head()

df.shape

df.columns
"""## 1.3 ) Missing Values Treatment"""

df.info()  # no null or Nan values.

df.isnull().sum()

msno.matrix(df)  # just to visualize.
"""## 1.4 ) The Features and the 'Target'"""

df.columns

df.head()
"""## 1.5 ) Univariate Analysis"""

df.describe()
"""Let us first analyze the various numeric features. To do this we can actually plot a boxplot showing all the numeric features."""

# `factorplot` and its `size` argument were renamed to `catplot` and
# `height`; the old names are no longer available in current seaborn.
sns.catplot(data=df, kind='box', height=10, aspect=3)
"""Note that all the features have pretty different scales and so plotting a boxplot is not a good idea. Instead what we can do is plot histograms of various continuously distributed features.
 

> We can also plot a kdeplot showing the distribution of the feature. Below I have plotted a kdeplot for the 'Age' feature. Similarly we plot for other numeric features also. Similarly we can also use a distplot from seaborn library.
# data['REGION_CODE'] = data['REGION_CODE'].astype(float)

# Values that cannot be parsed as numbers become NaN
data['REGION_CODE'] = pd.to_numeric(data['REGION_CODE'], errors='coerce')
# F:/ML_Project_April_2020/SD_Sales_Predict_ML_Projects/Sales_SD_Sample_ML_Projects/Month_Sales_JasonBrownie_Dataset.csv
# , header=0, index_col=['BILLING_DATE']
print('Data Shape')
print('\n-----------------')
# info() writes its summary itself; printing the attribute `data.info`
# without calling it only shows the bound method.
data.info()
print(data.head(10))
print('Shape:', data.shape)

print('\nAnalyzing missing Values in Dataset')
print('\n-------------------------------------')

# Visualize missing values as a matrix
msno.matrix(data)

# Visualize the number of missing values as a bar chart
msno.bar(data)

# Visualize the correlation between the number of missing values in different columns as a heatmap
msno.heatmap(data)

# fill missing values in numeric columns with the column mean;
# numeric_only=True keeps mean() from failing on non-numeric columns
data.fillna(data.mean(numeric_only=True), inplace=True)
# count the number of NaN values in each column
print('\nSummary on Null Values')
print('\n----------------------------')
print(data.isnull().sum())

data.head(20)
Exemplo n.º 27
0
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
import category_encoders as ce
import lightgbm as lgbm
import re

# Read the train / test CSV files from the local checkout.
train = pd.read_csv("C:/Users/10188/local_git/tabular-playground-series-apr-2021/train.csv")
test = pd.read_csv("C:/Users/10188/local_git/tabular-playground-series-apr-2021/test.csv")

# Quick look at the training frame: first rows, column info, summary statistics.
train.head()
train.info()
train.describe()

# Missing-value overview for both splits.
msno.matrix(train)
msno.bar(train)  # Age, Ticket, Fare, Cabin, Embarked have nulls
msno.bar(test)  # Age, Ticket, Fare, Cabin (to drop), Embarked have nulls

# Column groups used by the later preprocessing steps.
numeric_v = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
string_v = ["Survived", "Sex", "Embarked", "Ticket", "Cabin", "Name"]
delete_v = ["PassengerId"]

##################################################################
# - > Try using name (family name), ticket (leading characters) and
#     cabin (a, b, ...): Name is not meaningful, so use ticket and cabin.
# - > Try famsize and name frequency as categorical variables, and
#     model males and females separately.
##################################################################


# label encoding
Exemplo n.º 28
0
def missing_value_vis(df):
    """Draw the missingno nullity matrix for ``df`` and return the plot object.

    Args:
        df: Data to visualise; passed unchanged to ``mn.matrix``.

    Returns:
        Whatever ``mn.matrix`` returns for ``df``.
    """
    mv_vis = mn.matrix(df)
    # No ``del df`` here: deleting a parameter only unbinds the local name,
    # the caller keeps its own reference, so nothing would be released.
    gc.collect()
    return mv_vis
#
# atemp - "feels like" temperature in Celsius
#
# humidity - relative humidity
#
# windspeed - wind speed
#
# casual - number of non-registered user rentals initiated
#
# registered - number of registered user rentals initiated */

# In[90]:

import missingno as msno

# Per-column count of missing entries, then the nullity matrix.
df.isnull().sum()
msno.matrix(df)

# In[93]:

# Number of rows per season value.
df.season.value_counts()

# In[94]:

# Number of rows per weather value.
df.weather.value_counts()

# In[95]:

# Count plot of the season column; catplot is the current name of the
# removed sns.factorplot and accepts the same arguments here.
sns.catplot(x='season', data=df, kind='count')

# In[96]:
# In[ ]:


# TitanicSubmission.head()
# Column info followed by summary statistics of the training frame.
TitanicTrain.info()
TitanicTrain.describe()


# # 2 | Data Analysis and Visualisation
#
# **2.1 Missing values**

# In[ ]:


# Nullity matrix plus the per-column count of missing entries.
msno.matrix(TitanicTrain)
TitanicTrain.isna().sum()
# Optional alternative views:
# msno.bar(TitanicTrain)
# msno.heatmap(TitanicTrain)


# There appear to be many missing values for Age and Cabin, and only 2 for Embarked.
# These gaps are candidates for imputation later, which may improve the prediction model.
#

# **2.2 Individual features**

# In[ ]:


# Numeric column names, leaving out PassengerId, Age and Fare.
columns = (
    TitanicTrain.select_dtypes(include=[np.number])
    .drop(columns=['PassengerId', 'Age', 'Fare'])
    .columns.tolist()
)
Exemplo n.º 31
0
 def test_simple_matrix(self):
     """Plot the nullity matrix of ``self.simple_df`` and return ``plt.gcf()``."""
     msno.matrix(self.simple_df)
     current_fig = plt.gcf()
     return current_fig
## Closeness Centrality
#draw_graph(G, pos, nx.closeness_centrality(G, distance='weight'), 'Closeness Centrality')
draw_graph(G, pos, nx.closeness_centrality(G, distance=None, wf_improved=True),
           'closeness Centrality',
           './figures/closeness_credit_transaction.png')

# PART 2: IN THIS PART WE WILL BE WORKING WITH REAL-WORLD DATA. WE WILL VISUALIZE THE CREDIT MOVEMENTS
# OF MTN-BENIN FOR THE SOLE PURPOSE OF IDENTIFYING SUSPICIOUS LINKS.

import missingno as mn

# Read in the data we will need to build the network
df = pd.read_csv("./Data/CDRtestdata.csv")
df.head()  # We begin by taking a look at the first five rows of the data
mn.matrix(df)  # Visualize missing values in each column of the dataset

# One directed edge per row: first column -> second column, carrying the
# whole row as a dict.
Graph = nx.DiGraph()
for i, elrow in df.iterrows():
    # .iloc makes the positional access explicit; plain integer keys on a
    # string-labelled row rely on a deprecated pandas fallback.
    # NOTE(review): networkx 2.x treats attr_dict as an ordinary keyword, so
    # the row dict is stored under the single edge attribute 'attr_dict' --
    # confirm this is intended.
    Graph.add_edge(elrow.iloc[0], elrow.iloc[1], attr_dict=elrow.to_dict())

# Here our program assigns a color to each node based on its type
node_col = []
NodeSet = list(Graph.nodes())
for node in NodeSet:
    # A node is colored red when the first df row that lists it as
    # ers_from_partner_id has ers_sender_rs_type 'SC', or the first row that
    # lists it as ers_to_partner_id has ers_receiver_rs_type 'SC'.
    # NOTE(review): both list(...) conversions are rebuilt for every node.
    if (node in list(df.ers_from_partner_id) and df.ers_sender_rs_type[df.loc[
            df.ers_from_partner_id == node].index[0]] == 'SC') or (
                node in list(df.ers_to_partner_id) and df.ers_receiver_rs_type[
                    df.loc[df.ers_to_partner_id == node].index[0]] == 'SC'):
        node_col.append('red')
Exemplo n.º 33
0
 def test_width_ratios_matrix(self):
     """Plot ``self.simple_df`` with ``width_ratios=(30, 1)`` and return ``plt.gcf()``."""
     ratios = (30, 1)
     msno.matrix(self.simple_df, width_ratios=ratios)
     current_fig = plt.gcf()
     return current_fig
# Replace NaN values in 'tot_cur_bal' column with median
curbal_median = data_df['tot_cur_bal'].median()
data_df['tot_cur_bal'] = data_df['tot_cur_bal'].fillna(curbal_median)

# Replace NaN values in 'total_rev_hi_lim' column with median
revlimit_median = data_df['total_rev_hi_lim'].median()
data_df['total_rev_hi_lim'] = data_df['total_rev_hi_lim'].fillna(revlimit_median)

# Bad customer definition: 1 when loan_status is one of the listed statuses, else 0.
# NOTE(review): the flag is computed from data_df but written to df, and
# loan_status is dropped from df -- confirm df vs data_df is intentional.
df['BadLoan'] = np.where(np.isin(data_df['loan_status'],['Charged Off','Default','Late (31-120 days)', 
                                    'In Grace Period', 'Late (16-30 days)',
                                   'Does not meet the credit policy. Status:Charged Off']), 1, 0)
df.drop(['loan_status'], axis=1, inplace=True)

# Let's see if there are any missing values left
plt.figure(figsize=(16, 6))
msno.matrix(data_df, labels=True, color=(0.2, 0.15, 0.45))

# Correlation matrix for the new dataset. numeric_only=True restricts it to
# numeric columns; pandas >= 2.0 raises on non-numeric ones otherwise.
fig, ax = plt.subplots(figsize=(8, 8))
corr = data_df.corr(numeric_only=True)

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool replaces the np.bool alias, which NumPy 1.24 removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

sns.heatmap(corr, mask=mask, square=False, linewidths=.5, cbar_kws={"shrink": .5})

# Let's see how the interest rate varies by grade
data_df.boxplot(column='int_rate', by='grade', rot=90)

# Let's assign input and output values to the data
y = data_df['int_rate'].values
Exemplo n.º 35
0
 def test_fontsize_matrix(self):
     """Plot ``self.simple_df`` with ``fontsize=8`` and return ``plt.gcf()``."""
     label_size = 8
     msno.matrix(self.simple_df, fontsize=label_size)
     current_fig = plt.gcf()
     return current_fig
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV, LassoLarsIC
import os
#%%
# Keep the directory the script was started from.
cwd = os.getcwd()
cwd
# Switch to the folder that holds the data set.
os.chdir(r"C:\Users\yehadji\Documents\MCS\MCS 02\Arrhythmia Data Set")
#%%
# '?' entries are read as missing values; fields are separated by ';'.
df_original = pd.read_csv(
    r"C:\Users\yehadji\Documents\MCS\MCS 02\Arrhythmia Data Set\arrhythmia.csv",
    sep=";",
    na_values=['?'],
)

#%%
# Names of the columns that contain at least one missing value.
missingdata = df_original.columns[df_original.isna().any()].tolist()

# Nullity matrix
fig1 = msno.matrix(df_original[missingdata], figsize=(30, 20))
fig1.plot()
plt.savefig('missing1.jpg')
#
# Bar chart of the data nullity
fig2 = msno.bar(df_original[missingdata], figsize=(30, 20), color="blue", log=True)
fig2.plot()
plt.savefig('missing2.jpg')
#%
# Nullity correlation heatmap
fig3 = msno.heatmap(df_original[missingdata], figsize=(30, 20))
fig3.plot()
plt.savefig('missing3.jpg')


#%%
#%%
Exemplo n.º 37
0
 def test_large_matrix(self):
     """Plot the nullity matrix of ``self.large_df`` and return ``plt.gcf()``."""
     msno.matrix(self.large_df)
     current_fig = plt.gcf()
     return current_fig
# Download the processed CSV and parse it straight from memory.
s = requests.get(PROCESSED_DATA_URL).content
immigration_df = pd.read_csv(io.StringIO(s.decode('utf-8')))


# ## A general exploration of the immigration data

# In[21]:

# First rows of the frame.
immigration_df.head()


# In[31]:

# Build a pandas-profiling report for the whole frame.
immigration_report = pandas_profiling.ProfileReport(immigration_df)


# In[32]:

# Write the report to an HTML file.
immigration_report.to_file('immigration_data_exploration_report.html')


# In[30]:

# Nullity matrix of the frame.
msno.matrix(immigration_df)


# In[ ]: