예제 #1
0
def test_mosaic_very_complex():
    """Draw an L x L grid of pairwise mosaic plots (scatter-matrix layout).

    Diagonal cells only display the variable name. Each off-diagonal cell
    (i, j) re-keys the data so the two selected variables come first, folds
    every full key into its two-element prefix through ``_reduce_dict``, and
    plots the result; cells below the diagonal are drawn horizontally.
    """
    key_name = ["gender", "age", "health", "work"]
    key_base = (["male", "female"], ["old", "young"], ["healty", "ill"], ["work", "unemployed"])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {
        ("male", "old"): {"color": "r"},
        ("female",): {"color": "pink"},
    }
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i, j in product(range(L), repeat=2):
        cell = axes[i, j]
        if i == j:
            # diagonal: just a label, no ticks
            cell.text(0.5, 0.5, key_name[i], ha="center", va="center")
            cell.set_xticks([])
            cell.set_xticklabels([])
            cell.set_yticks([])
            cell.set_yticklabels([])
            continue

        # indices of the variables that are NOT shown in this cell
        others = set(range(L)).difference({i, j})
        low, high = min(i, j), max(i, j)
        temp_data = OrderedDict(
            ((k[low], k[high]) + tuple(k[r] for r in others), v)
            for k, v in data.items()
        )
        # snapshot the keys: the dict is modified while we walk over them
        for full_key in list(temp_data.keys()):
            pair = full_key[:2]
            temp_data[pair] = _reduce_dict(temp_data, pair)
            del temp_data[full_key]
        mosaic(temp_data, ax=cell, axes_label=False, properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle("old males should look bright red,  (plot 4 of 4)")
def testTransformDiscreteVar(df, newVarName, transformFunction, targetVar='Vote'):
    """Add a derived column to ``df`` and save its mosaic plot against the target.

    Parameters
    ----------
    df : DataFrame
        Modified in place: column ``newVarName`` is added or overwritten.
    newVarName : str
        Name of the derived column; also used in the output file name.
    transformFunction : callable
        Called as ``transformFunction(df)``; its result is stored in
        ``df[newVarName]``.
    targetVar : str, optional
        Column plotted against the derived one (default ``'Vote'``).

    The figure is written to ``Temp/<newVarName>by<targetVar>.png`` and then
    closed.
    """
    df[newVarName] = transformFunction(df)

    # mosaic() opens its own figure when it is not given an axes, so a bare
    # plt.figure() beforehand would stay empty and never be closed. Create
    # the axes explicitly and hand them to mosaic instead.
    fig, ax = plt.subplots()
    try:
        mosaic(df, [targetVar, newVarName], ax=ax)
        fig.savefig('Temp/' + newVarName + 'by' + targetVar + '.png')
    finally:
        # always release the figure, even if plotting or saving fails
        plt.close(fig)
예제 #3
0
def test_mosaic_simple():
    """Plot a mosaic of synthetic counts over four categorical variables.

    The data maps every combination of the four level lists to an increasing
    integer. Tiles are coloured by gender, and every combination containing
    ``"ill"`` gets its own colour plus a ``"+"`` hatch.
    """
    key_set = (["male", "female"], ["old", "adult", "young"], ["worker", "unemployed"], ["healty", "ill"])
    # every combination of levels is one category of the table
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))

    # base colours: males blue, females red
    props = {("male",): {"color": "b"}, ("female",): {"color": "r"}}
    # ill groups override the base colour and are hatched
    for key in keys:
        if "ill" not in key:
            continue
        shade = "BlueViolet" if "male" in key else "Crimson"
        props[key] = {"color": shade, "hatch": "+"}

    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle("syntetic data, 4 categories (plot 2 of 4)")
예제 #4
0
def test_mosaic_simple():
    """Plot (and then close) a mosaic of synthetic four-variable data.

    Every combination of the four level lists is assigned an increasing
    integer count. Gender sets the base tile colour; combinations containing
    ``'ill'`` are recoloured and hatched with ``'+'``.
    """
    key_set = (['male', 'female'], ['old', 'adult', 'young'],
               ['worker', 'unemployed'], ['healty', 'ill'])
    # the cartesian product of the levels is the complete set of categories
    keys = list(product(*key_set))
    counts = range(1, 1 + len(keys))
    data = OrderedDict(zip(keys, counts))

    # males and females in blue and red
    props = {('male',): {'color': 'b'}, ('female',): {'color': 'r'}}
    # colour of an ill group, looked up by "is this a male group?"
    ill_colour = {True: 'BlueViolet', False: 'Crimson'}
    props.update(
        (key, {'color': ill_colour['male' in key], 'hatch': '+'})
        for key in keys
        if 'ill' in key
    )

    # mosaic of the data, with given gaps and colors
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle('syntetic data, 4 categories (plot 2 of 4)')
    pylab.close('all')
예제 #5
0
def test_axes_labeling():
    """Draw the same random mosaic twice to check axis-label alignment.

    The left plot is horizontal with a single label rotation; the right one
    is vertical with one rotation per variable.
    """
    from numpy.random import rand

    key_set = (["male", "female"], ["old", "adult", "young"], ["worker", "unemployed"], ["yes", "no"])
    # all combinations of levels, each with a random weight
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))

    def lab(k):
        # tile label: first letter of every level in the key
        return "".join(s[0] for s in k)

    fig, both_axes = pylab.subplots(1, 2, figsize=(16, 8))
    ax1, ax2 = both_axes
    mosaic(data, ax=ax1, labelizer=lab, horizontal=True, label_rotation=45)
    mosaic(data, ax=ax2, labelizer=lab, horizontal=False, label_rotation=[0, 45, 90, 0])
    fig.suptitle("correct alignment of the axes labels")
예제 #6
0
def test_mosaic():
    """Draw four mosaic plots of the ``fair`` (extramarital affairs) dataset.

    The exogenous frame gets a ``cheated`` column (``endog > 0``), is sorted
    by marriage rating and religiosity, and has the numeric codes of those
    columns replaced by descriptive labels before plotting on a 2x2 grid.
    """
    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas['cheated'] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    # religious, educ, occupation, occupation_husb]
    # DataFrame.sort no longer exists in pandas; sort_values is its replacement
    datas = datas.sort_values(['rate_marriage', 'religious'])
    num_to_desc = {1: 'awful', 2: 'bad', 3: 'intermediate',
                      4: 'good', 5: 'wonderful'}
    datas['rate_marriage'] = datas['rate_marriage'].map(num_to_desc)
    num_to_faith = {1: 'non religious', 2: 'poorly religious', 3: 'religious',
                      4: 'very religious'}
    datas['religious'] = datas['religious'].map(num_to_faith)
    num_to_cheat = {False: 'faithful', True: 'cheated'}
    datas['cheated'] = datas['cheated'].map(num_to_cheat)
    # finished cleaning
    fig, ax = pylab.subplots(2, 2)
    mosaic(datas, ['rate_marriage', 'cheated'], ax=ax[0, 0],
                title='by marriage happiness')
    mosaic(datas, ['religious', 'cheated'], ax=ax[0, 1],
                title='by religiosity')
    # three-way plot: tile labels are suppressed, axes are labelled by hand
    mosaic(datas, ['rate_marriage', 'religious', 'cheated'], ax=ax[1, 0],
                title='by both', labelizer=lambda k:'')
    ax[1, 0].set_xlabel('marriage rating')
    ax[1, 0].set_ylabel('religion status')
    mosaic(datas, ['religious', 'rate_marriage'], ax=ax[1, 1],
                title='inter-dependence', axes_label=False)
    pylab.suptitle("extramarital affairs (plot 3 of 4)")
예제 #7
0
def test_mosaic():
    """Draw four mosaic plots of the ``fair`` (extramarital affairs) dataset.

    A ``cheated`` column (``endog > 0``) is added to the exogenous frame,
    the frame is sorted by marriage rating and religiosity, and the numeric
    codes of those columns are mapped to descriptive labels before plotting
    on a 2x2 grid.
    """
    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas["cheated"] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    # religious, educ, occupation, occupation_husb]
    # DataFrame.sort no longer exists in pandas; sort_values is its replacement
    datas = datas.sort_values(["rate_marriage", "religious"])
    num_to_desc = {1: "awful", 2: "bad", 3: "intermediate", 4: "good", 5: "wonderful"}
    datas["rate_marriage"] = datas["rate_marriage"].map(num_to_desc)
    num_to_faith = {1: "non religious", 2: "poorly religious", 3: "religious", 4: "very religious"}
    datas["religious"] = datas["religious"].map(num_to_faith)
    num_to_cheat = {False: "faithful", True: "cheated"}
    datas["cheated"] = datas["cheated"].map(num_to_cheat)
    # finished cleaning
    fig, ax = pylab.subplots(2, 2)
    mosaic(datas, ["rate_marriage", "cheated"], ax=ax[0, 0], title="by marriage happiness")
    mosaic(datas, ["religious", "cheated"], ax=ax[0, 1], title="by religiosity")
    # three-way plot: tile labels are suppressed, axes are labelled by hand
    mosaic(datas, ["rate_marriage", "religious", "cheated"], ax=ax[1, 0], title="by both", labelizer=lambda k: "")
    ax[1, 0].set_xlabel("marriage rating")
    ax[1, 0].set_ylabel("religion status")
    mosaic(datas, ["religious", "rate_marriage"], ax=ax[1, 1], title="inter-dependence", axes_label=False)
    pylab.suptitle("extramarital affairs (plot 3 of 4)")
예제 #8
0
def test_axes_labeling(close_figures):
    """Plot one random mosaic horizontally and vertically, side by side.

    Both plots label tiles with the initials of the key; the horizontal one
    uses a single label rotation, the vertical one a rotation per variable.
    """
    from numpy.random import rand

    levels = (['male', 'female'], ['old', 'adult', 'young'],
              ['worker', 'unemployed'], ['yes', 'no'])
    # every combination of levels is a category; weights are random
    combos = list(product(*levels))
    data = OrderedDict(zip(combos, rand(len(combos))))

    def initials(key):
        # first character of each level name, concatenated
        return ''.join(level[0] for level in key)

    fig, axes_pair = plt.subplots(1, 2, figsize=(16, 8))
    left_ax, right_ax = axes_pair
    mosaic(data, ax=left_ax, labelizer=initials, horizontal=True,
           label_rotation=45)
    mosaic(data, ax=right_ax, labelizer=initials, horizontal=False,
           label_rotation=[0, 45, 90, 0])
    fig.suptitle("correct alignment of the axes labels")
예제 #9
0
def test_mosaic_empty_cells(close_figures):
    """Regression check for GH#2286.

    Calls ``mosaic`` on data whose id1/id2 cross-tabulation has zero-count
    cells, once from the unstacked crosstab and once from the raw frame.
    """
    import pandas as pd

    # row label -> value, key order kept exactly as in the original report
    id2_by_row = {64: 'Angelica',
                  65: 'DXW_UID', 66: 'casuid01',
                  67: 'casuid01', 68: 'EC93_uid',
                  69: 'EC93_uid', 70: 'EC93_uid',
                  60: 'DXW_UID', 61: 'AtmosFox',
                  62: 'DXW_UID', 63: 'DXW_UID'}
    id1_by_row = {64: 'TGP',
                  65: 'Retention01', 66: 'default',
                  67: 'default', 68: 'Musa_EC_9_3',
                  69: 'Musa_EC_9_3', 70: 'Musa_EC_9_3',
                  60: 'default', 61: 'default',
                  62: 'default', 63: 'default'}
    mydata = pd.DataFrame({'id2': id2_by_row, 'id1': id1_by_row})

    ct = pd.crosstab(mydata.id1, mydata.id2)
    # both call styles must run and return a 2-tuple
    _fig, _rects = mosaic(ct.T.unstack())
    _fig, _rects = mosaic(mydata, ['id1', 'id2'])
예제 #10
0
def test_mosaic_very_complex():
    """Build a grid of pairwise mosaic plots, then close all figures.

    The grid has one row and column per variable. Diagonal cells carry only
    the variable name; each off-diagonal cell puts its two variables first
    in the key, folds every full key into its two-element prefix with
    ``_reduce_dict`` and plots the result (horizontally below the diagonal).
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (['male', 'female'], ['old', 'young'],
                ['healty', 'ill'], ['work', 'unemployed'])
    keys = list(product(*key_base))
    data = OrderedDict(zip(keys, range(1, 1 + len(keys))))
    props = {
        ('male', 'old'): {'color': 'r'},
        ('female',): {'color': 'pink'},
    }
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i in range(L):
        for j in range(L):
            target_ax = axes[i, j]
            if i == j:
                # diagonal cell: name only, ticks removed
                target_ax.text(0.5, 0.5, key_name[i],
                               ha='center', va='center')
                target_ax.set_xticks([])
                target_ax.set_xticklabels([])
                target_ax.set_yticks([])
                target_ax.set_yticklabels([])
                continue

            # variables not displayed in this cell
            hidden = set(range(L)).difference(set((i, j)))
            first, second = min(i, j), max(i, j)
            reordered = [((k[first], k[second]) + tuple(k[r] for r in hidden), v)
                         for k, v in iteritems(data)]
            temp_data = OrderedDict(reordered)

            # iterate over a copy of the keys because entries are replaced
            for long_key in list(iterkeys(temp_data)):
                short_key = long_key[:2]
                temp_data[short_key] = _reduce_dict(temp_data, short_key)
                del temp_data[long_key]
            mosaic(temp_data, ax=target_ax, axes_label=False,
                   properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red,  (plot 4 of 4)')
    pylab.close('all')
예제 #11
0
train.groupby('Survived')['Sex'].value_counts().unstack(level=1).plot.bar(stacked=True)


# ## Pclass
# There was a statistically significant difference in survival outcome between boarding classes 1 and 3 only.  2nd class passengers survived and died in roughly equal numbers, while 1st and 3rd class passengers met opposite fates.  Roughly 25% of 3rd class passengers survived.  Over 50% of 1st class passengers survived this tragedy.  Therefore, `Pclass` is worth including in the model.

# In[ ]:


contingency(train, c.Pclass, c.Survived)


# In[ ]:


fig, _ = mosaic(train, [c.Pclass, c.Survived], title="Pclass vs Survived | Titanic train dataset.", axes_label=True)
fig.axes[0].set_ylabel(c.Survived)
_ = fig.axes[0].set_xlabel(c.Pclass)


# According to the graph, most people who died were from class 3.
# - source: https://stackoverflow.com/questions/50319614/count-plot-with-stacked-bars-per-hue

# In[ ]:


train.groupby('Survived')['Pclass'].value_counts().unstack(level=1).plot.bar(stacked=True)
train['Pclass'].value_counts()


# ## Pclass versus Fare
예제 #12
0
from statsmodels.graphics.mosaicplot import mosaic

# Tile colour for each known tdpm group label; anything else falls back to
# 'red' in color() below.
color_map = dict([
    ('0 - 499', 'whitesmoke'),
    ('500 - 999', 'lightgray'),
    ('1000 - 1499', 'darkgray'),
    ('1500 - 1999', 'gray'),
])


def color(key):
    """Return the colour for a two-element tile key ``(_, tdpm_group)``."""
    _, tdpm = key
    try:
        return color_map[tdpm]
    except KeyError:
        # unknown group labels are highlighted in red
        return 'red'


def props(key):
    """Build the properties dict ``mosaic`` expects for a tile key."""
    return {'color': color(key)}

mosaic(data,  properties=props, labelizer = lambda key: '')


# %%

# data = np.zeros((len(stringency_labels), len(tdpm_labels)))

# for index, row in average_si_vs_dpm.iterrows():
#     si_label = row['stringency_group']
#     tdpm_label = row['tdpm_group']
#     si_label_idx = stringency_labels.index(si_label)
#     tdpm_label_idx = tdpm_labels.index(tdpm_label)
    
#     data[si_label_idx, tdpm_label_idx] = data[si_label_idx, tdpm_label_idx] + 1
    
# # normalize data row wise to sum to 1
예제 #13
0
                         size = 8,
                         position = position_stack(vjust = 0.5))
# customized colors
barStacked2c += scale_colour_manual(values = adHoc) 
# use this to avoid text on top of legend symbols
barStacked2c += guides(color=False) # want to omit?

barStacked2c


#%%


from statsmodels.graphics.mosaicplot import mosaic

ax,t=mosaic(PrecintDaytime.stack(),gap=0.01)



#%%


base4= ggplot(PrecDaytiDF,
              aes(x='daytime',y='pctCol',
                  fill='precint')) + theme_classic()

barStPct2 = base4 + scale_fill_brewer(type='Qualitative',
                                      palette = "Paired") 

barStPct2 += theme(axis_title_y = element_blank(),
                   axis_text_y  = element_blank(),
예제 #14
0
# Collect row positions where the first PCA component and the diagnosis
# label disagree: component > 0 with diagnosis 0, or component < 0 with
# diagnosis 1.
possible_outliers += np.where(
    (pca_dim[:, 0] > 0) & (d['diagnosis'].values.ravel() == 0))[0].tolist()
possible_outliers += np.where(
    (pca_dim[:, 0] < 0) & (d['diagnosis'].values.ravel() == 1))[0].tolist()
# Rows with component > 4 and diagnosis 0 already satisfy the first
# condition, so this adds them a second time.
possible_outliers += np.where(
    (pca_dim[:, 0] > 4) & (d['diagnosis'].values.ravel() == 0))[0].tolist()
# Only the printed view is de-duplicated; the list itself keeps repeats.
print(list(set(possible_outliers)))

# Pairs plot / SPLOM of every column from the fourth one onward
splom = sns.pairplot(d.iloc[:, 3:])  #, diag_kind="kde")
fig = splom.fig
fig.suptitle('Pairs Plot of Wisconsin Breast Cancer Data')
plt.show()

# Mosaic plot of two features
mosaic(d, ['clump_thickness', 'cell_size_uniformity'],
       title='Mosaic Plot of 2 Features from Wisconsin Breast Cancer Data')
plt.show()

# Parallel-coordinates plot of the potential outliers, coloured by diagnosis.
# NOTE(review): possible_outliers may contain repeated positions (see above),
# so some rows are drawn more than once — confirm this is acceptable.
fig = plt.figure()
fig.suptitle(
    'Parallel Coordinates Plot of Potential Outliers in Wisconsin Breast Cancer Data'
)
parallel_coordinates(d.iloc[possible_outliers, :],
                     class_column='diagnosis',
                     cols=d.columns[3:],
                     color=('#0158FE', '#FE0101'))
plt.show()

#-------------------------------------------------------------------------------------------------#
#----------------------------------------Robust Covariance----------------------------------------#
예제 #15
0
Following Variables need to investigated:
Pclass, Fare, AFare
Sex, Age_G, Age_E, Age_Er, Age, Titel
SibSp, Parch, NPerson, Alone, ParCh_B, SibSp_B, Family
Embarked

Against the Variable survived/Survived

Not used: PassengerId, Cabin, Name(indirect Titel)
###################################
"""
"""
Analysis of Pclass, Fare, AFare
"""
# Pclass vs Survived: mosaic plot plus a column-normalised contingency table
mosaic(df_2, ['Pclass', 'Survived'])
pd.crosstab(df_2['Survived'],
            df_2['Pclass'],
            normalize='columns',
            margins=True)
# ====> Pclass seems to have an influence.

# Fare and AFare by survival outcome
sns.boxplot(x="Survived", y="Fare", data=df_2)
sns.boxplot(x="Survived", y="AFare", data=df_2)
# ====> There seems to be a difference, but it is unclear whether this
#       variable is needed (maybe Pclass alone is enough).

# NOTE(review): the two calls below are identical; the first was presumably
# meant to plot y="Fare" (the section covers Pclass, Fare and AFare) — confirm.
sns.boxplot(x="Pclass", y="AFare", data=df_2)
sns.boxplot(x="Pclass", y="AFare", data=df_2)
# ====> The information in AFare seems to be contained in Pclass.

# It might be enough to include only Pclass (otherwise AFare).
예제 #16
0
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 11:07:36 2017

@author: 28414
"""

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# Load 'Sheet1' from both workbooks; data2 is not referenced in the visible
# part of this script.
data1 = pd.read_excel('totalNPdata.xlsx', sheet_name='Sheet1')
data2 = pd.read_excel('totalPDSdat.xlsx', sheet_name='Sheet1')
# Keep only the first row of data1 (row slice 0:1).
n1 = data1[0:1]

# NOTE(review): statsmodels documents dict input to mosaic() as a contingency
# table mapping category tuples to non-negative numbers. Here the keys are
# plain labels and the values are one-row Series taken from columns 'b', 'c'
# and 'd' — confirm this produces the intended plot.
d = {'age': n1['b'], 'weight': n1['c'], 'hight': n1['d']}
mosaic(d)
plt.show()
예제 #17
0
def ParrarelCorrelationPlotChart():
    """Show a mosaic plot of 'size' against 'length' for the loaded dataset."""
    mosaic(LoadDataset(), ['size', 'length'])
    plt.show()
mean_top_depose * 100  # 0.17% for cluster 1, 0.4% for cluster 2, 9% for cluster 3 and 4% for cluster 4
del mean_top_enligne, mean_top_depose

# Load the qualitative-variables table used for the mosaic plots
# (semicolon-delimited; IDPART_CALCULE is read as object).
base_quali = pd.read_table(
    'C:/Users/Richard/Documents/GitHub/Segmentation-multicanale2/Données/v2/base_variables_quali.csv',
    delimiter=";",
    dtype={"IDPART_CALCULE": object})
# Append the top_enligne, top_depose and cluster columns of clustered_data
# column-wise (rows are aligned on the index by pd.concat).
base_quali2 = pd.concat([
    base_quali, clustered_data['top_enligne'], clustered_data['top_depose'],
    clustered_data['cluster']
],
                        axis=1)

# Qualitative variables of interest:
mosaic(base_quali2, ['cluster', 'libpcs2'])
mosaic(base_quali2, ['cluster', 'lncsg2'])
mosaic(base_quali2, ['cluster', 'type_famille'])

# libpcs2 share per cluster: each crosstab row is divided by its own sum
ctab = pd.crosstab(base_quali2['cluster'],
                   base_quali2['libpcs2']).apply(lambda x: x / x.sum(), axis=1)

# Stacked bar chart of those proportions, legend placed outside the axes
ct = ctab.plot(
    kind='bar',
    stacked=True,
    title='Categories socio-professionnelles en proportion par classe')
lgd = ct.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ct.set_ylabel('Proportion')
ct.set_xlabel('Classe')
ct.set_xticklabels(ctab.index, rotation=0)
예제 #19
0
# Chi-squared test on the slope x ca contingency table
table = pd.crosstab(data.slope, data.ca)
chi2, p, dof, expected = chi2_contingency(table.values)
print('chi2 = ' , chi2 , ' p = ' , p , ' dof = ' , dof ,' expected=' , expected )

# Same test on the slope x thal contingency table
table = pd.crosstab(data.slope, data.thal)
chi2, p, dof, expected = chi2_contingency(table.values)
print('chi2 = ' , chi2 , ' p = ' , p , ' dof = ' , dof ,' expected=' , expected )

##-------- Relationship plots between categorical variables ----------------------------##

# exang vs cp (jittered strip plot)
sns.stripplot(x='exang', y='cp', data=data, jitter=0.2)
plt.show()

# slope vs exang (mosaic plot)
mosaic(data,['slope', 'exang'] , axes_label=True ,title = "Categorial slope and exang : " )
plt.xlabel('slope')
plt.show()

###------------------- Replacing missing values -------------------------------###

# ca: the value 4 is replaced with 0
# NOTE(review): presumably 4 encodes "missing" in this dataset — confirm.
data.loc[data['ca'] == 4,'ca'] = 0

# thal: the value 0 is replaced with 2
# NOTE(review): presumably 0 encodes "missing" in this dataset — confirm.
data.loc[data['thal'] == 0,'thal'] = 2

##-------- Relationship plots between y and the variables ------------------------------##

# exang vs y (mosaic plot)
mosaic(data,['exang', 'y'], gap=0.01, axes_label=True)
print(train.isnull().sum())
print(test.isnull().sum())
print(combined.isnull().sum())


# In[ ]:


train.info()


# In[ ]:


# Mosaic of survival by sex and passenger class on its own wide figure
fig, ax = plt.subplots(figsize=(12, 4))
mosaic(train, ["Survived", 'Sex', 'Pclass'], axes_label=False, ax=ax)

# 2x3 grid of bar plots of Survived against each feature.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form works on older releases too.
plt.figure(figsize=[12, 8])
plt.subplot(231)
sns.barplot(x='Sex', y='Survived', data=train)
plt.subplot(232)
sns.barplot(x='Pclass', y='Survived', data=train)
plt.subplot(233)
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=train)
plt.subplot(234)
sns.barplot(x='Parch', y='Survived', data=train)
plt.subplot(235)
sns.barplot(x='SibSp', y='Survived', data=train)
plt.subplot(236)
sns.barplot(x='Embarked', y='Survived', data=train)
예제 #21
0
def plot_classification_categorical(X,
                                    target_col,
                                    types=None,
                                    kind='count',
                                    hue_order=None):
    """Exploration plots for categorical features in classification.

    Creates plots of categorical variable distributions for each target class.
    Relevant features are identified via mutual information.

    For high cardinality categorical variables (variables with many categories)
    only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    kind : {'count', 'proportion', 'mosaic'}, default='count'
        Plot drawn per feature: absolute counts per class ('count'),
        stacked horizontal bars of class proportions ('proportion'), or a
        mosaic plot ('mosaic', which emits a UserWarning).
    hue_order : list, optional
        Passed to ``sns.countplot`` as ``hue_order``; only used when
        ``kind == 'count'``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values.

    Notes
    -----
    Returns ``None``; nothing is plotted when X has no categorical feature
    other than the target.
    """
    types = _check_X_target_col(X, target_col, types, task="classification")

    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)

    if features.shape[1] == 0:
        return

    features = features.astype('category')

    show_top = _get_n_top(features, "categorical")

    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    # Every encoded column is discrete. The previous mask was sized from X,
    # which also contains the target and non-categorical columns, so its
    # length did not match the encoded feature matrix.
    f = mutual_info_classif(ordinal_encoded,
                            target,
                            discrete_features=True)
    # indices of the show_top highest-scoring features, best first
    top_k = np.argsort(f)[-show_top:][::-1]
    # large number of categories -> taller plot
    row_height = 3 if features.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    # FIXME mosaic doesn't like constraint layout?
    plt.suptitle("Categorical Features vs Target", y=1.02)
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        col = features.columns[col_ind]
        X_new = _prune_category_make_X(X, col, target_col)
        if kind == 'proportion':
            # Sort the categories by the proportion of one class; the first
            # target value is used as that class name. It is taken by
            # position so a non-default index on X does not break the lookup.
            df = (X_new.groupby(col)[target_col].value_counts(
                normalize=True).unstack().sort_values(by=target.iloc[0]))
            df.plot(kind='barh', stacked=True, ax=ax, legend=i == 0)
            ax.set_title(col)
            ax.set_ylabel(None)
        elif kind == 'mosaic':
            warn("Mosaic plots are buggy right now, come back later.",
                 UserWarning)
            # This seems pretty broken, abandoning for now
            mosaic(X_new, [col, target_col], horizontal=False, ax=ax)
        elif kind == 'count':
            # absolute counts
            # FIXME show f value
            # FIXME shorten titles?
            sns.countplot(y=col,
                          data=X_new,
                          ax=ax,
                          hue=target_col,
                          hue_order=hue_order)
            if i > 0:
                # keep the legend on the first subplot only
                ax.legend(())
        else:
            raise ValueError("Unknown plot kind {}".format(kind))
        _short_tick_names(ax)

    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
    def plot_bivariate_x_categorical_y_categorical(self,
                                                   df,
                                                   x_name,
                                                   target_name,
                                                   filename_prefix=''):
        '''
            Plot bivariate analysis : y = f(x) where both x and y are categorical.
            This function generates two graphs, a mekko (mosaic) chart and a
            stacked bar chart, and saves each one as a PNG file under
            ``self.output_directory``.

            Only the first two entries of ``df[target_name].value_counts()``
            are drawn in the stacked bar chart.

            Parameters
            ----------
            df: dataframe
                Dataframe containing x_name and target_name at least.
            x_name: string
                Name of column that is on x axis.
            target_name: string
                Name of column containing target to predict.
            filename_prefix: string
                Prefix added to filename.

            Returns
            -------
            None
        '''
        df_to_plot = self._build_dataset_for_x_cat_y_cat(
            df=df, x_name=x_name, target_name=target_name)

        # Plot 1: Mekko graph ------------------------------------------------------------------------------------------
        # Tile label: integer percentage on the first line, raw count in
        # parentheses on the second.
        df_to_plot['label'] = df_to_plot['count_percent'].apply(int).apply(str) + ' %' + '\n' \
                                + '(' + df_to_plot['count'].apply(str) + ')'

        # Tiles whose second key element equals the second entry of the
        # target's value_counts() get self.color, all others
        # self.color_secondary.
        # NOTE(review): value_counts() is re-evaluated on every call of this
        # lambda and many more times below; it could be computed once.
        props = lambda index: {'color': self.color if index[1] == str(df[target_name].value_counts().index[1]) \
                                                    else self.color_secondary,
                               'alpha': 0.7}
        # Looks the tile label up by the tile key.
        # presumably df_to_plot is indexed by (x value, target value) — verify
        # against _build_dataset_for_x_cat_y_cat.
        labels = lambda k: df_to_plot.loc[k, 'label']
        plot = mosaic(
            data=df_to_plot['count'],
            gap=0.02,
            title='Distribution de {target} en fonction de {var}'.format(
                target=target_name, var=x_name),
            properties=props,
            labelizer=labels)

        plt.savefig(self.output_directory + filename_prefix +
                    'bivariate_mekko_' + target_name + '_' + x_name + '.png')
        plt.close()
        # --------------------------------------------------------------------------------------------------------------

        # Plot 2: Stacked bar chart ------------------------------------------------------------------------------------
        # One row per x value, one column per target value, sorted by the
        # row total in descending order.
        df_to_plot.reset_index(inplace=True)
        df_to_plot2 = df_to_plot.pivot(index=x_name,
                                       columns=target_name,
                                       values='count')
        df_to_plot2['total'] = df_to_plot2.sum(axis=1)
        df_to_plot2.sort_values(by='total', ascending=False, inplace=True)

        fig, ax = plt.subplots()

        bar_width = 0.75
        bar_position = [i + 1 for i in range(df_to_plot2.shape[0])]
        # NOTE(review): shifting ticks (and the totals further down) by half a
        # bar width assumes bars are left-aligned at bar_position; if ax.bar
        # centres bars on x, these end up offset by half a bar — confirm
        # against the matplotlib version in use.
        tick_position = [i + (bar_width / 2) for i in bar_position]

        # Bottom segment: first value_counts() entry of the target
        ax.bar(bar_position,
               df_to_plot2[str(df[target_name].value_counts().index[0])],
               width=bar_width,
               label=str(df[target_name].value_counts().index[0]),
               alpha=0.7,
               color=self.color_secondary,
               edgecolor='#7F7F7F')

        # Top segment: second value_counts() entry, stacked on the first
        ax.bar(bar_position,
               df_to_plot2[str(df[target_name].value_counts().index[1])],
               width=bar_width,
               bottom=df_to_plot2[str(
                   df[target_name].value_counts().index[0])],
               label=str(df[target_name].value_counts().index[1]),
               alpha=0.7,
               color=self.color,
               edgecolor='#7F7F7F')

        # Build one text label per rectangle: all bottom segments first, then
        # all top segments, in the same order the two ax.bar calls were made.
        rects = ax.patches
        labels_numbers = df_to_plot2[str(df[target_name].value_counts().index[0])].tolist() + \
                            df_to_plot2[str(df[target_name].value_counts().index[1])].tolist()

        labels_percent = (df_to_plot2[str(df[target_name].value_counts().index[0])]/df_to_plot2['total']).tolist() + \
                    (df_to_plot2[str(df[target_name].value_counts().index[1])]/df_to_plot2['total']).tolist()

        # `labels` is rebound here (it held the mosaic labelizer above)
        labels = [
            '{}\n({:.0%})'.format(number, percent)
            for number, percent in zip(labels_numbers, labels_percent)
        ]

        # Write each label at the centre of its rectangle
        for rect, label in zip(rects, labels):
            coord = rect.get_xy()
            height = rect.get_height()
            ax.text(x=coord[0] + bar_width / 2,
                    y=coord[1] + height / 2,
                    s=label,
                    size=9,
                    va='center',
                    ha='center',
                    color='w')

        # Write the row total at the height of the total for each bar
        for index in range(len(df_to_plot2.index)):
            ax.text(x=bar_position[index] + bar_width / 2,
                    y=df_to_plot2['total'].values[index],
                    s=df_to_plot2['total'].values[index],
                    size=10,
                    ha='center',
                    va='bottom')

        plt.xticks(tick_position, df_to_plot2.index.values)
        plt.yticks([])
        plt.xlim(
            [min(tick_position) - bar_width,
             max(tick_position) + bar_width])

        plt.title('Distribution de {target} en fonction de {var}'.format(
            target=target_name, var=x_name),
                  y=1.08)

        # Reverse the legend entries so they read top-to-bottom like the stack
        # (`labels` is rebound once more, now to the legend labels)
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles[::-1],
                  labels[::-1],
                  loc='upper right',
                  frameon=False)

        plt.savefig('{}{}bivariate_stacked_{}_{}.png'.format(
            self.output_directory, filename_prefix, target_name, x_name))
        plt.close(fig)
        # --------------------------------------------------------------------------------------------------------------
        # NOTE(review): this trailing `pass` has no effect.
        pass
# Discretise Family_Size element-wise with conv_discrete
train_df['Family_Size_D'] = train_df['Family_Size'].apply(conv_discrete)

# train_df.head() - uncomment to view 


# In[ ]:


##train_df.loc[:,['Survived','Family_Size_D']]


# In[ ]:


# Visualize multivariate categorical data in a rigorous and informative way.
mosaicplt.mosaic(train_df,index=['Survived','Family_Size_D'], gap=0.02,title='Family size by survival', statistic = True) 


# ```a contingency table (also known as a cross tabulation or crosstab) is a type of table in a matrix format that displays the (multivariate) frequency distribution of the variables```

# The mosaic plot shows that we preserve our rule that there’s a survival penalty among singletons and large families, but a benefit for passengers in small families.

# # Missing data
# 
# Assumption: It is assumed that the type of Missingness here is Missing At Random(MAR)

# In[ ]:


#Create a new function:
def count_missing(x):
예제 #24
0
# Frequency of each colour / cut, as a table and as a count plot.
# The Series is passed as x= explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, which would change what is plotted.
df['color'].value_counts()
sns.countplot(x=df['color'])
df['cut'].value_counts()
sns.countplot(x=df['cut'])
# Price against carat, as a line and as a scatter
plt.plot(df.carat, df.price)
plt.scatter(df.carat, df.price)
# Bar charts of the value counts of each column
df['cut'].value_counts().plot(kind='bar')
df['clarity'].value_counts().plot(kind='bar')
df['color'].value_counts().plot(kind='bar')
df['cut'].value_counts().plot(kind='bar')
df['carat'].value_counts().plot(kind='bar')
plt.scatter(df.carat, df.price)

from statsmodels.graphics.mosaicplot import mosaic
plt.rcParams['font.size'] = 16.0
mosaic(df, ['cut', 'color'])
mosaic(df, ['cut', 'color', 'clarity'])
# Pie chart of hard-coded counts, one slice per cut label, drawn clockwise
# (presumably the df['cut'] value counts — verify against the data).
labels = ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']
values = [21551, 13791, 12082, 4906, 1610]
colors = ['b', 'g', 'r', 'c', 'm']
plt.pie(values, labels=labels, colors=colors, shadow=True, counterclock=False)

df.corr(method='pearson')        # By default corr() is pearson
df.corr(method='spearman')
df.corr(method='kendall')

# Execute everything from `def` through `plt.show()` together, then call correlation_matrix(df)
def correlation_matrix(df):
    from matplotlib import pyplot as plt
    from matplotlib import cm as cm
    fig = plt.figure()
예제 #25
0
plt.subplots_adjust(top=0.9)
# g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Histogram of Age by Y and marital')
plt.show()

#
#%%
sns.boxplot(x='marital', y='age', hue='y', data=bank_data, palette='coolwarm',fliersize=0.2)
ax = plt.gca()
ax.set_title('Boxplot of Age by Y and marital')
ax.legend(loc = 2)
ax.get_ylim()

#
#%%
# Mosaic of job by y; the title is set afterwards on the current axes
mosaic(bank_data, ['job','y'], gap=0.001, label_rotation=30)
ax = plt.gca()
ax.set_title('Mosaic plot of job by y')


#
#%%
# Mosaic of housing by loan.
# NOTE(review): the title is passed to mosaic() and then set again with the
# same text on the current axes — one of the two looks redundant.
mosaic(bank_data, ['housing','loan'], gap=0.001, title='Mosaic plot of housing(x) and loan(y)')
ax = plt.gca()
ax.set_title('Mosaic plot of housing(x) and loan(y)')


#
#%%
sns.countplot(x='education', data=bank_data, hue='y')
ax = plt.gca()
def save_image(b):
    """Save the plot matching the current widget selection as a PNG file.

    All inputs are read from module-level state rather than from arguments:
    ``df`` is the data, ``headers_x`` / ``headers_y`` hold the chosen column
    names (the value ``'Select'`` means nothing was chosen),
    ``colour_headers`` holds an optional hue column, and ``pointSize`` /
    ``pointAlpha`` configure the scatter markers.

    Output file, written to the current working directory:

    * X only, categorical: ``xCategoricalNoColour.png`` or
      ``xCategoricalColour.png``
    * X only, numeric: ``xNumericNoColour.png`` or ``xNumericColour.png``
    * numeric vs numeric: ``NumericVsNumericNoColour.png`` or
      ``NumericVsNumericColour.png``
    * numeric X vs categorical Y: ``NumericVsCategorical.png``
    * categorical X vs numeric Y: ``CategoricalVsNumeric.png``
    * categorical vs categorical: ``categoricalVsCategorical.png``

    Nothing is saved when X is not selected.

    Parameters
    ----------
    b : object
        Not used by this function.
        NOTE(review): presumably the widget passed by a click callback --
        confirm against the caller.

    Returns
    -------
    None
    """
    # Function-scope import: the dtype helpers replace the ``np.float`` /
    # ``np.int`` / ``np.object`` aliases, which no longer exist in
    # NumPy >= 1.24 and made the original comparisons raise AttributeError.
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    data = df

    def column_kind(column):
        """Return 'is_numeric' for numeric non-boolean columns, else 'is_string'."""
        dtype = data[column].dtype
        # Every dtype gets a classification, so columns such as int32,
        # float32, bool or category no longer leave the kind undefined.
        if is_numeric_dtype(dtype) and not is_bool_dtype(dtype):
            return 'is_numeric'
        return 'is_string'

    # Applied for every selection state, including the ones that draw nothing.
    sns.set_context("notebook", font_scale=1.1)

    if headers_x.value == 'Select':
        # No X column (with or without Y): there is nothing to plot.
        return

    x = headers_x.value
    x_type = column_kind(x)
    # None is the seaborn default for ``hue``, i.e. "no colour grouping".
    hue = None if colour_headers.value == 'Select' else colour_headers.value

    # X selected but not Y
    # ========================================================================
    if headers_y.value == 'Select':
        if x_type == 'is_string':
            g = sns.countplot(x=x, hue=hue, data=data)
            loc, labels = plt.xticks()
            g.set_xticklabels(labels, rotation=90)
            if hue is None:
                g.figure.savefig("xCategoricalNoColour.png")
            else:
                g.figure.savefig("xCategoricalColour.png")
        elif hue is None:
            g = sns.distplot(data[x])
            g.figure.savefig("xNumericNoColour.png")
        else:
            # One distribution per hue level, overlaid on a shared grid.
            g = sns.FacetGrid(data, hue=hue)
            g = g.map(sns.distplot, x)
            g.savefig("xNumericColour.png")
        plt.close()
        return

    # Both X and Y selected
    # ========================================================================
    y = headers_y.value
    y_type = column_kind(y)
    sns.set_style("ticks")

    if x_type == 'is_numeric' and y_type == 'is_numeric':
        # Scatter plot without a regression line.
        g = sns.lmplot(
            x=x,  # Horizontal axis
            y=y,  # Vertical axis
            data=data,  # Data source
            fit_reg=False,  # Don't fit a regression line
            hue=hue,  # Colour grouping (None = single colour)
            scatter_kws={
                "marker": "D",  # Marker style
                "s": pointSize.value,  # Marker size
                "alpha": pointAlpha.value
            },
            legend=True)
        if hue is None:
            g.savefig("NumericVsNumericNoColour.png")
        else:
            g.savefig("NumericVsNumericColour.png")
    elif x_type == 'is_numeric':
        # Numeric X vs categorical Y
        g = sns.violinplot(x=x, y=y, data=data)
        g.figure.savefig("NumericVsCategorical.png")
    elif y_type == 'is_numeric':
        # Categorical X vs numeric Y
        g = sns.boxplot(x=x, y=y, data=data)
        g.figure.savefig("CategoricalVsNumeric.png")
    else:
        # Categorical vs categorical; mosaic() draws on a figure of its own,
        # which is then the current figure for plt.savefig().
        mosaic(data, [x, y])
        plt.savefig('categoricalVsCategorical.png')
    plt.close()
#        letter = ""
#    else:
    if "essential" in key:
        letter = "e"
    elif "recommended" in key:
        letter = "r"
    elif "desired" in key:
        letter = "d"
    return letter

# Make the figure
# `colorCode`, `letterCode`, `df1`, `mosaic` and `plt` are defined outside
# this excerpt.
#props = lambda key: colorCode(key)
# Per-tile drawing properties: delegate to colorCode for each tile key.
props = lambda k: colorCode(k)
#fig, rects =  mosaic(data, ['WG','rec'], title='Mosaic Plot _ no freqs')

# First pass: drawn only to obtain `recs`, which the labelizer below indexes
# by tile key.
fig, recs = mosaic(df1, ['WG','rec'], title='Recommendation for new datasets', \
       properties = props, gap=0.015)

# Tile label: the letter from letterCode, or an empty string when element 1
# of the tile's entry in `recs` equals 1.
labels = lambda k: letterCode(k) if recs[k][1] !=1 else ""
# Second pass: the figure that is saved, drawn on an explicitly sized axes.
fig, ax  = plt.subplots(figsize=(7.5, 3.5))
mosaic(df1, ['WG','rec'], title='a. Recommendation for new datasets', \
       properties = props, gap=0.015, ax=ax,labelizer=labels)

# Rotate the tick labels on both axes; x labels are also right-aligned.
for tick in ax.get_xticklabels():
    tick.set_rotation(30)
    tick.set_horizontalalignment('right')
for tick in ax.get_yticklabels():
    tick.set_rotation(30)

plt.savefig('./Figures/new_datasets_rec.png',\
            dpi =300, bbox_inches='tight', pad_inches=0.25)
    df_pi.pi_pro_schl_feats_comp > np.mean(df_pi.pi_pro_schl_feats_comp)].index
# Mark the rows indexed by i1 as 'High' school involvement (i1 comes from
# the statement that ends on the previous line).
df_pi.loc[i1, 'schl_comp'] = 'High'
# Rows whose home-involvement composite is above its mean get 'High' too.
i2 = df_pi.loc[
    df_pi.pi_pro_hm_feats_comp > np.mean(df_pi.pi_pro_hm_feats_comp)].index
df_pi.loc[i2, 'hm_comp'] = 'High'
# Identify high school - low home involvement, and
# low school - high home involvement groups
i1 = df_pi.loc[(df_pi.schl_comp == 'High') & (df_pi.hm_comp == 'Low')].index
i2 = df_pi.loc[(df_pi.schl_comp == 'Low') & (df_pi.hm_comp == 'High')].index
df_pi.loc[i1, 'schl_hm_comp'] = 'More involved at school'
df_pi.loc[i2, 'schl_hm_comp'] = 'More involved at home'
# Plot of the contingency table for the type of parental involvement vs.
# student high-low performing students
# Tile colour: 'dodgerblue' when the key contains 'More involved at home',
# otherwise 'orange'.
props = lambda key: {
    'color': 'dodgerblue' if 'More involved at home' in key else 'orange'
}
# Tile label: each key comparison is a boolean multiplied by a hard-coded
# percentage, so the sum is the number for the matching key ("0%" for any
# other key). The percentages are constants, not computed from df_pi.
labelizer = lambda k: f"{(k == ('More involved at school', 'A or B'))*90 + (k == ('More involved at school', 'C or lower'))*10 + (k == ('More involved at home', 'A or B'))*84 + (k == ('More involved at home', 'C or lower'))*16}%"

mosaic(
    df_pi[['schl_hm_comp', 'grades_comp']],
    index=['schl_hm_comp', 'grades_comp'],
    title=
    'Relationship Between Student Grades and \nType of Parental Involvement',
    properties=props,
    gap=0.025,
    labelizer=labelizer)
# NOTE(review): `ax1` is not defined in this excerpt and mosaic() above is
# called without ax= -- confirm it refers to the axes mosaic() drew on.
ax1.set_xticklabels(['More involved at school\nLess Involved at home', ''])
plt.show()
# Persist the DataFrame, including the columns added above, as CSV.
df_pi.to_csv('~/FIS-Projects/Module-3/FIS-Mod3-Project/data/df_pi.csv',
             sep=',')
예제 #29
0
############################################################################

# trestbps graph
# Bar chart with one bar per blood-pressure group; the three heights are
# variables computed outside this excerpt.
plt.bar(
    x=["low blood pressure", "proper", "high blood pressure"],
    height=[low_blood_pressure, proper_blood_pressure, high_blood_pressure])
plt.xlabel('trestbps')
plt.ylabel('amount')
plt.show()

############################################################################

# Scatter of `gender` against `ca` (both defined outside this excerpt).
# NOTE(review): exactly two colours are passed; verify this is compatible
# with the number of points being plotted.
plt.scatter(x=gender, y=ca, color=['r', 'b'])

# Mosaic plots for four pairs of trainDF columns, each shown on its own.
mosaic(trainDF, ['fbs', 'thal'])
plt.show()
mosaic(trainDF, ['fbs', 'ca'])
plt.show()
mosaic(trainDF, ['restecg', 'slope'])
plt.show()
mosaic(trainDF, ['cp', 'restecg'])
plt.show()

############################################################################

# Pairwise scatter plots; there is no plt.show() or new figure between the
# calls in this excerpt.
plt.scatter(trestbps, chol)
plt.scatter(trestbps, thalach)
plt.scatter(trestbps, oldpeak)
plt.scatter(chol, trestbps)
plt.scatter(chol, thalach)
# Garage area by price class
# (`df_housing_copy`, `pal_col` and `per_NA_per_col` are defined outside this
# excerpt; the plot titles and labels below are runtime strings and stay in
# French.)
sns.set_style("whitegrid")
sns.boxplot(y="GarageArea",x="Class_prix", data = df_housing_copy,order=["Classe0", "Classe1", "Classe2", "Classe3"], palette = pal_col)
plt.title("Superficie du garage en fonction des classes de prix")
plt.xlabel("Classe")
plt.ylabel("Superficie du garage")

# Number of rooms (excluding bathrooms) by price class
sns.set_style("whitegrid")
sns.boxplot(y="TotRmsAbvGrd",x="Class_prix", data = df_housing_copy,order=["Classe0", "Classe1", "Classe2", "Classe3"], palette = pal_col)
plt.title("Nombre de chambre (Sans salles de bains) en fonction des classes de prix")
plt.xlabel("Classe")
plt.ylabel("Nombre de chambre (Sans salles de bains)")

# General zoning classification by price class
mosaic(df_housing_copy,["Class_prix","MSZoning"],gap=0.3)
# =============================================================================
# Missing-data imputation (first method: mode/median)
# =============================================================================

# Convert the table of NA percentages into a DataFrame with the columns
# "Variable" and "pourcentage", sorted by percentage.
df_per_NA_per_col = per_NA_per_col.reset_index().rename(columns={"index": "Variable", 0: "pourcentage"}).sort_values(by = 'pourcentage')
# Split the variables at the 50% threshold (exactly 50% goes to the
# "inf50" group).
df_per_NA_per_col_sup50 = df_per_NA_per_col.loc[df_per_NA_per_col.pourcentage > 50]
df_per_NA_per_col_inf50 = df_per_NA_per_col.loc[df_per_NA_per_col.pourcentage <= 50]

# Drop the variables with more than 50% NAs (modifies df_housing_copy in
# place)
df_housing_copy.drop(columns= df_per_NA_per_col_sup50.Variable, inplace = True)

# Data containing only the qualitative variables: every column whose dtype
# is neither float nor integer.
var_qualitative = df_housing_copy.select_dtypes(exclude=['float', 'integer'])
예제 #31
0
def test_data_conversion():
    """Draw mosaic() for each accepted input container on a 4x4 grid.

    Row 0 uses one-dimensional data (dict, Series, list, array); rows 1 and 2
    use two-dimensional data in natural and inverted key order; row 3 uses a
    DataFrame split by one or both of its columns. All figures are closed at
    the end.

    Author's note carried over from the original: elements are not reordered,
    so the dictionary panels may look odd, with the c and b keys swapped.
    """
    import pandas

    _, grid = pylab.subplots(4, 4)

    # Row 0: one-dimensional inputs.
    flat_dict = {'ax': 1, 'bx': 2, 'cx': 3}
    mosaic(flat_dict, ax=grid[0, 0], title='basic dict', axes_label=False)
    flat_series = pandas.Series(flat_dict)
    mosaic(flat_series, ax=grid[0, 1], title='basic series', axes_label=False)
    flat_list = [1, 2, 3]
    mosaic(flat_list, ax=grid[0, 2], title='basic list', axes_label=False)
    flat_array = np.asarray(flat_list)
    mosaic(flat_array, ax=grid[0, 3], title='basic array', axes_label=False)

    # Rows 1 and 2: two-dimensional inputs, then the same data with the key
    # order inverted through index=[1, 0].
    pair_dict = {
        ('ax', 'cx'): 1,
        ('bx', 'cx'): 2,
        ('ax', 'dx'): 3,
        ('bx', 'dx'): 4,
    }
    mosaic(pair_dict, ax=grid[1, 0], title='compound dict', axes_label=False)
    mosaic(pair_dict, ax=grid[2, 0], title='inverted keys dict',
           index=[1, 0], axes_label=False)
    pair_series = pandas.Series(pair_dict)
    mosaic(pair_series, ax=grid[1, 1], title='compound series',
           axes_label=False)
    mosaic(pair_series, ax=grid[2, 1], title='inverted keys series',
           index=[1, 0])
    pair_list = [[1, 2], [3, 4]]
    mosaic(pair_list, ax=grid[1, 2], title='compound list', axes_label=False)
    mosaic(pair_list, ax=grid[2, 2], title='inverted keys list', index=[1, 0])
    pair_array = np.array([[1, 2], [3, 4]])
    mosaic(pair_array, ax=grid[1, 3], title='compound array',
           axes_label=False)
    mosaic(pair_array, ax=grid[2, 3], title='inverted keys array',
           index=[1, 0], axes_label=False)

    # Row 3: DataFrame input, split by each column and by both orders.
    frame = pandas.DataFrame({
        'gender': ['male', 'male', 'male', 'female', 'female', 'female'],
        'pet': ['cat', 'dog', 'dog', 'cat', 'dog', 'cat'],
    })
    frame_panels = [
        (['gender'], 'dataframe by key 1'),
        (['pet'], 'dataframe by key 2'),
        (['gender', 'pet'], 'both keys'),
        (['pet', 'gender'], 'keys inverted'),
    ]
    for column, (split_by, caption) in enumerate(frame_panels):
        mosaic(frame, split_by, ax=grid[3, column], title=caption,
               axes_label=False)

    pylab.suptitle('testing data conversion (plot 1 of 4)')
    # The figure is never shown; everything is closed right away.
    pylab.close('all')
예제 #32
0
def test_data_conversion():
    """Plot mosaic() for dict, Series, list, array and DataFrame inputs.

    Sixteen panels are drawn on a 4x4 grid of axes and a figure-level title
    is set; the figure is neither shown nor closed here.

    Author's note carried over from the original: elements are not reordered,
    so the dictionary panels may look odd, with the c and b keys swapped.
    """
    import pandas

    fig, axes = pylab.subplots(4, 4)

    def draw(source, row, col, title, **options):
        # Forward only the options the caller spelled out, so mosaic() keeps
        # its own defaults for everything else.
        mosaic(source, ax=axes[row, col], title=title, **options)

    # First row: one-dimensional data.
    simple = {"ax": 1, "bx": 2, "cx": 3}
    draw(simple, 0, 0, "basic dict", axes_label=False)
    simple = pandas.Series(simple)
    draw(simple, 0, 1, "basic series", axes_label=False)
    simple = [1, 2, 3]
    draw(simple, 0, 2, "basic list", axes_label=False)
    simple = np.asarray(simple)
    draw(simple, 0, 3, "basic array", axes_label=False)

    # Second and third rows: two-dimensional data, natural then inverted keys.
    nested = {("ax", "cx"): 1, ("bx", "cx"): 2, ("ax", "dx"): 3, ("bx", "dx"): 4}
    draw(nested, 1, 0, "compound dict", axes_label=False)
    draw(nested, 2, 0, "inverted keys dict", index=[1, 0], axes_label=False)
    nested = pandas.Series(nested)
    draw(nested, 1, 1, "compound series", axes_label=False)
    draw(nested, 2, 1, "inverted keys series", index=[1, 0])
    nested = [[1, 2], [3, 4]]
    draw(nested, 1, 2, "compound list", axes_label=False)
    draw(nested, 2, 2, "inverted keys list", index=[1, 0])
    nested = np.array([[1, 2], [3, 4]])
    draw(nested, 1, 3, "compound array", axes_label=False)
    draw(nested, 2, 3, "inverted keys array", index=[1, 0], axes_label=False)

    # Fourth row: DataFrame split by one column, then by both in each order.
    owners = pandas.DataFrame({
        "gender": ["male", "male", "male", "female", "female", "female"],
        "pet": ["cat", "dog", "dog", "cat", "dog", "cat"],
    })
    draw(owners, 3, 0, "dataframe by key 1", index=["gender"], axes_label=False)
    draw(owners, 3, 1, "dataframe by key 2", index=["pet"], axes_label=False)
    draw(owners, 3, 2, "both keys", index=["gender", "pet"], axes_label=False)
    draw(owners, 3, 3, "keys inverted", index=["pet", "gender"], axes_label=False)

    pylab.suptitle("testing data conversion (plot 1 of 4)")
예제 #33
0
def test_mosaic_plot(cat_target, cat_feature, data):
    """Draw a mosaic plot of ``data`` split by the feature, then the target.

    ``cat_feature`` is the first split and ``cat_target`` the second. The
    value returned by mosaic() is discarded and nothing is returned.
    """
    from statsmodels.graphics.mosaicplot import mosaic as draw_mosaic

    split_order = [cat_feature, cat_target]
    draw_mosaic(data, split_order)
예제 #34
0
def test_data_conversion():
    """Exercise mosaic() with every supported kind of input data.

    A 4x4 grid of axes is filled panel by panel: one-dimensional containers
    in the first row, two-dimensional containers (natural and inverted key
    order) in the next two rows, and a DataFrame in the last row. The figure
    is neither shown nor closed here.

    Author's note carried over from the original: elements are not reordered,
    so the dictionary panels may look odd, with the c and b keys swapped.
    """
    import pandas

    _figure, panels = pylab.subplots(4, 4)
    # Most panels hide their axis labels; the few that do not simply omit
    # this option, as in the original calls.
    quiet = {'axes_label': False}
    swapped = [1, 0]

    # One-dimensional containers.
    counts = {'ax': 1, 'bx': 2, 'cx': 3}
    mosaic(counts, ax=panels[0, 0], title='basic dict', **quiet)
    counts_series = pandas.Series(counts)
    mosaic(counts_series, ax=panels[0, 1], title='basic series', **quiet)
    counts_list = [1, 2, 3]
    mosaic(counts_list, ax=panels[0, 2], title='basic list', **quiet)
    counts_array = np.asarray(counts_list)
    mosaic(counts_array, ax=panels[0, 3], title='basic array', **quiet)

    # Two-dimensional containers, each drawn in natural and inverted order.
    table = {('ax', 'cx'): 1, ('bx', 'cx'): 2, ('ax', 'dx'): 3, ('bx', 'dx'): 4}
    mosaic(table, ax=panels[1, 0], title='compound dict', **quiet)
    mosaic(table, ax=panels[2, 0], title='inverted keys dict', index=swapped,
           **quiet)
    table_series = pandas.Series(table)
    mosaic(table_series, ax=panels[1, 1], title='compound series', **quiet)
    mosaic(table_series, ax=panels[2, 1], title='inverted keys series',
           index=swapped)
    table_list = [[1, 2], [3, 4]]
    mosaic(table_list, ax=panels[1, 2], title='compound list', **quiet)
    mosaic(table_list, ax=panels[2, 2], title='inverted keys list',
           index=swapped)
    table_array = np.array([[1, 2], [3, 4]])
    mosaic(table_array, ax=panels[1, 3], title='compound array', **quiet)
    mosaic(table_array, ax=panels[2, 3], title='inverted keys array',
           index=swapped, **quiet)

    # DataFrame input: split by a single column, then by both in each order.
    genders = ['male', 'male', 'male', 'female', 'female', 'female']
    pets = ['cat', 'dog', 'dog', 'cat', 'dog', 'cat']
    survey = pandas.DataFrame({'gender': genders, 'pet': pets})
    mosaic(survey, ['gender'], ax=panels[3, 0], title='dataframe by key 1',
           **quiet)
    mosaic(survey, ['pet'], ax=panels[3, 1], title='dataframe by key 2',
           **quiet)
    mosaic(survey, ['gender', 'pet'], ax=panels[3, 2], title='both keys',
           **quiet)
    mosaic(survey, ['pet', 'gender'], ax=panels[3, 3], title='keys inverted',
           **quiet)

    pylab.suptitle('testing data conversion (plot 1 of 4)')
예제 #35
0
def reports(results_input, results_output):
    """Generate reports for EMSE paper."""
    now = pandas.Timestamp(2017, 9, 30, 12)
    df = pandas.read_csv(
        path_join(results_input, "results_with_coverage.csv"),
        parse_dates=[0, 10]
    )
    df_googleplay = pandas.read_csv(
        path_join(results_input, "googleplay.csv"),
        index_col='package'
    )
    df = df.join(df_googleplay, on="app_id")
    df_sonar = pandas.read_csv("results_sonar.csv", index_col='package')
    df_sonar.fillna(0, inplace=True)
    df_sonar = df_sonar.add_prefix('sonar_')
    df = df.join(df_sonar, on="app_id")

    #Feature engineering
    df['tests'] = df[unit_test_frameworks+ui_automation_frameworks+cloud_test_services].any(axis=1)
    df['no_tests'] = ~df['tests']
    df['unit_tests'] = df[unit_test_frameworks].apply(any, axis=1)
    df['ui_tests'] = df[ui_automation_frameworks].apply(any, axis=1)
    df["cloud_tests"] = df[cloud_test_services].apply(any, axis=1)
    df["ci/cd"] = df[ci_services].apply(any, axis=1)
    df['age'] = (now - df['created_at'])
    df['age_numeric'] = (now - df['created_at']).astype('<m8[Y]').astype('int')
    df['time_since_last_update'] = (now - df['last_updated'])
    df['time_since_last_update_numeric'] = df['time_since_last_update'].astype('<m8[Y]').astype('int')
    df_old = df[df['age_numeric']>=2]
    df["downloads"] = df["downloads"].astype("category", categories=downloads_scale, ordered=True)
    df['sonar_issues_ratio'] = df['sonar_issues'].divide(df['sonar_files_processed'])
    df['sonar_blocker_issues_ratio'] = df['sonar_blocker_issues'].divide(df['sonar_files_processed'])
    df['sonar_critical_issues_ratio'] = df['sonar_critical_issues'].divide(df['sonar_files_processed'])
    df['sonar_major_issues_ratio'] = df['sonar_major_issues'].divide(df['sonar_files_processed'])
    df['sonar_minor_issues_ratio'] = df['sonar_minor_issues'].divide(df['sonar_files_processed'])
    df_with_google_data = df[~df["rating_count"].isnull()]
    df_with_tests = df[df['tests']]
    df_without_tests = df[~df['tests']]
    df.to_csv("results_merged.csv")


    # from android_test_inspector.corr_analysis import correlation_matrix
    # correlation_matrix(df, output_file=path_join(results_output, "corr_matrix.pdf"))

    colors_dict = {
        'any': 'C0',
        'unit_test_frameworks': 'C1',
        'ui_automation_frameworks': 'C2',
        'cloud_test_services': 'C3',
        'ci_services': 'C4',
    }

    marker_dict = {
        'any': 'o',
        'unit_test_frameworks': 'v',
        'ui_automation_frameworks': '*',
        'cloud_test_services': 'H',
        'ci_services': 's',
    }

    linestyle_dict = {
        'any': '-',
        'unit_test_frameworks': ':',
        'ui_automation_frameworks': '--',
        'cloud_test_services': '-.',
    }

    # --- Number of projects by year --- #
    figure, ax = plt.subplots(figsize=(4, 2.5))
    df.groupby('age_numeric')['age_numeric'].count().plot.bar(
        color='black',
        width=0.25,
        ax=ax,
    )
    ax.tick_params(direction='out', top='off')
    ax.set_xlabel("Age")
    ax.set_ylabel("Number of apps")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.grid(linestyle='dotted')
    figure.tight_layout()
    figure.savefig(path_join(results_output, "app_age_count.pdf"))

    # --- Number of projects by framework --- #
    columns = (
        ['tests']
        + ['unit_tests'] + unit_test_frameworks
        + ['ui_tests'] + ui_automation_frameworks
        + ['cloud_tests'] + cloud_test_services
        # + ['ci/cd'] + ci_services
    )
    colors =  (
        [colors_dict['any']] +
        [colors_dict['unit_test_frameworks']] * (len(unit_test_frameworks) + 1)
        + [colors_dict['ui_automation_frameworks']] * (len(ui_automation_frameworks) + 1)
        + [colors_dict['cloud_test_services']] * (len(cloud_test_services) + 1)
        + [colors_dict['ci_services']] * (len(ci_services) + 1)
    )

    highlights = [
        'tests',
        'unit_tests',
        'ui_tests',
        'cloud_tests',
        'ci/cd',
    ]
    sums = df[columns].sum()
    labels = (label in highlights and "• All "+label or label for label in columns)
    labels = [label.title().replace("_", " ") for label in labels]
    heights = sums.values
    figure, ax = plt.subplots(1, 1)
    ax.bar(
        range(len(labels)),
        heights,
        0.5,
        color=colors,
        edgecolor = 'k',
        linewidth= [column in highlights and 0.9 or 0.0 for column in columns]
    )
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_xticks(range(len(labels)))
    ax.tick_params(direction='out', top='off')
    # ax.set_title("Number of projects by test framework")
    ax.set_ylabel("Number of projects (out of {})".format(len(df.index)))
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.grid(linestyle='dotted')

    # ax2 = ax.twinx()
    # ax2.grid(False)
    # ax2.set_ylim(ax.get_ylim())
    # ax2.set_yticklabels(["{:.0%}".format(tick/len(df)) for tick in ax2.get_yticks()])
    # ax2.spines['right'].set_visible(False)
    # ax2.spines['top'].set_visible(False)
    # ax2.spines['left'].set_visible(False)
    # ax2.set_ylabel("Percentage of projects")

    def draw_range(ax, xmin, xmax, label):
        y=400
        ax.annotate('', xy=(xmin, y), xytext=(xmax, y), xycoords='data', textcoords='data',
                    arrowprops={'arrowstyle': '|-|', 'color':'black', 'linewidth': 0.5})
        xcenter = xmin + (xmax-xmin)/2
        ytext = y + ( ax.get_ylim()[1] - ax.get_ylim()[0] ) / 22
        ax.annotate(label, xy=(xcenter,ytext), ha='center', va='center', fontsize=9)

    draw_range(ax, 0.5, 5.5, "Unit testing")
    draw_range(ax, 5.5, 14.5, "GUI testing")
    draw_range(ax, 14.5, 21.5, "Cloud testing")
    # draw_range(ax, 21.5, 26.5, "CI/CD")

    figure.tight_layout()
    figure.savefig(path_join(results_output, "framework_count.pdf"))
    # --------------------------------------- #

    # --- Percentage of Android tests over the age of the apps --- #
    def tests_in_projects_by_time_of_creation(df_projects, frameworks, label=None,
                                              title=None,
                                              zorder=1, color=None,
                                              verbose=False, **kwargs):
        portions = []
        n_projects_with_tests_history = []
        total_projects_history = []
        age_max = df_projects['age_numeric'].max()+1
        for age in range(age_max):
            n_projects_with_tests = df_projects[df_projects['age_numeric']==age][frameworks].apply(any, axis=1).sum()
            n_projects_with_tests_history.append(n_projects_with_tests)
            total_projects = len(df_projects[df_projects['age_numeric']==age].index)
            total_projects_history.append(total_projects)
            if total_projects == 0:
                portion = 0
            else:
                portion = n_projects_with_tests/total_projects
            portions.append(portion)
            if verbose:
                print("Age {}:".format(age))
                print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion))

        plt.plot(range(age_max), portions, label=label, zorder=zorder, **kwargs)
        plt.scatter(range(age_max), portions, total_projects_history, marker='o', linewidth='1', zorder=zorder)
        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.set_xticks(range(age_max))
        ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()])
        ax.set_ylabel("Percentage of projects")
        ax.yaxis.grid(linestyle='dotted', color='gray')
        if label:
            legend = ax.legend(loc='upper center', shadow=False)
        if title:
            ax.set_title(title)

    figure, ax = plt.subplots(1,1)
    tests_in_projects_by_time_of_creation(df, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], zorder=2, linestyle=linestyle_dict['any'])
    tests_in_projects_by_time_of_creation(df, unit_test_frameworks, label="Unit testing", color=colors_dict['unit_test_frameworks'], zorder=3, linestyle=linestyle_dict['unit_test_frameworks'])
    tests_in_projects_by_time_of_creation(df, ui_automation_frameworks, label="GUI testing", color=colors_dict['ui_automation_frameworks'], zorder=4, linestyle=linestyle_dict['ui_automation_frameworks'])
    tests_in_projects_by_time_of_creation(df, cloud_test_services, label="Cloud testing", color=colors_dict['cloud_test_services'], zorder=5, linestyle=linestyle_dict['cloud_test_services'])

    ax.set_xlabel("Years since first commit")
    ax.axvspan(0,2, color='darkgreen', alpha=0.1)
    figure.tight_layout()
    figure.savefig(path_join(results_output, "tests_by_age.pdf"))
    ax.invert_xaxis()
    figure.savefig(path_join(results_output, "tests_by_age_i.pdf"))
    # ------------------------------------------------------------ #

    # --- Percentage of Android tests over the age of the apps (cumulated) --- #
    def tests_in_projects_by_time_of_creation_cumm(df_projects, frameworks,
                                                   title=None, verbose=False, **kwargs):
        project_with_test_per_age = []
        total_projects_per_age = []
        n_projects_with_tests_history = []
        total_projects_history = []
        age_max = df_projects['age_numeric'].max()+1
        for age in range(age_max)[::-1]:
            n_projects_with_tests = df_projects[df_projects['age_numeric']==age][frameworks].apply(any, axis=1).sum()
            n_projects_with_tests_history.append(n_projects_with_tests)
            total_projects = len(df_projects[df_projects['age_numeric']==age].index)
            total_projects_history.append(total_projects)
            project_with_test_per_age.append(n_projects_with_tests)
            total_projects_per_age.append(total_projects)
            if verbose:
                print("Age {}:".format(age))
                print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion))
        project_with_test_per_age_cum = [sum(project_with_test_per_age[:index+1]) for index in range(len(project_with_test_per_age))]
        total_projects_per_age_cum = [sum(total_projects_per_age[:index+1]) for index in range(len(total_projects_per_age))]
        portions = []
        for with_tests, total in zip(project_with_test_per_age_cum, total_projects_per_age_cum):
            if total > 0:
                portions.append(with_tests/len(df_projects))
            else:
                portions.append(0)
        plt.plot(range(age_max)[::-1], portions, **kwargs)
        # plt.scatter(range(age_max)[::-1], portions, total_projects_history, marker='o', linewidth=1, zorder=kwargs.get('zorder'))
        plt.scatter(range(age_max)[::-1], portions, marker='.', linewidth=1, zorder=kwargs.get('zorder'))
        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.set_xticks(range(age_max))
        ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()])
        ax.set_ylabel("Percentage of projects")
        ax.yaxis.grid(linestyle='dotted', color='gray')
        ax.legend(loc='upper center', shadow=False)
        if title:
            ax.set_title(title)

    figure, ax = plt.subplots(1,1)
    tests_in_projects_by_time_of_creation_cumm(
        df,
        unit_test_frameworks+ui_automation_frameworks+cloud_test_services,
        label="Any", color=colors_dict['any'], zorder=2,
        linestyle=linestyle_dict['any'],
    )
    tests_in_projects_by_time_of_creation_cumm(
        df,
        unit_test_frameworks,
        label="Unit testing", color=colors_dict['unit_test_frameworks'], zorder=3,
        linestyle=linestyle_dict['unit_test_frameworks'],
    )
    tests_in_projects_by_time_of_creation_cumm(
        df,
        ui_automation_frameworks,
        label="GUI testing", color=colors_dict['ui_automation_frameworks'], zorder=4,
        linestyle=linestyle_dict['ui_automation_frameworks'],
    )
    tests_in_projects_by_time_of_creation_cumm(
        df,
        cloud_test_services,
        label="Cloud testing", color=colors_dict['cloud_test_services'], zorder=5,
        linestyle=linestyle_dict['cloud_test_services'],
    )
    ax.set_xlabel("Year")
    ax.axvspan(0,2, color='darkgreen', alpha=0.1)
    figure.tight_layout()
    figure.savefig(path_join(results_output, "tests_by_age_cumm.pdf"))
    ax.invert_xaxis()
    figure.savefig(path_join(results_output, "tests_by_age_cumm_i.pdf"))
    # ------------------------------------------------------------ #



    # --- Percentage of 2+years apps with tests grouped by time since last update --- #
    def tests_in_projects_by_time_of_update(df_projects, frameworks, label=None,
                                              title=None,
                                              verbose=False, zorder=None, color=None, **kwargs):
        """Plot, per year since the last update, the share of projects using tests.

        For every integer from 0 up to the maximum of the
        ``time_since_last_update_numeric`` column, computes the fraction of the
        projects in that bucket for which any of the ``frameworks`` columns is
        truthy. The fractions are drawn on the current matplotlib axes as a line
        plus a scatter whose marker sizes are the bucket sizes.

        Args:
            df_projects: DataFrame with a ``time_since_last_update_numeric``
                column and one column per entry of ``frameworks``.
            frameworks: list of column names flagging the use of a framework.
            label: legend label of the line; a legend is drawn only when set.
            title: optional title for the axes.
            verbose: when true, print the counts of every bucket.
            zorder: z-order applied to both the line and the markers.
            color: colour applied to both the line and the markers.
            **kwargs: forwarded to ``plt.plot`` (e.g. ``linestyle``).
        """
        update_age = df_projects['time_since_last_update_numeric']
        age_max = update_age.max()+1
        ages = range(age_max)
        portions = []
        total_projects_history = []
        for age in ages:
            # Select the bucket once and reuse it for both counts.
            df_age = df_projects[update_age == age]
            n_projects_with_tests = df_age[frameworks].apply(any, axis=1).sum()
            total_projects = len(df_age.index)
            total_projects_history.append(total_projects)
            # An empty bucket counts as 0% rather than dividing by zero.
            portion = n_projects_with_tests/total_projects if total_projects else 0
            portions.append(portion)
            if verbose:
                print("Age {}:".format(age))
                print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion))

        # ``color`` used to be swallowed by the signature and never reached the
        # plot; forward it so the line and its markers use the requested colour.
        plt.plot(ages, portions, label=label, zorder=zorder, color=color, **kwargs)
        plt.scatter(ages, portions, total_projects_history, marker='o', linewidth=1, zorder=zorder, color=color)
        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['bottom'].set_visible(True)
        ax.set_xticks(ages)
        ax.set_yticklabels(["{:.0%}".format(tick) for tick in ax.get_yticks()])
        ax.set_ylabel("Percentage of projects")
        ax.yaxis.grid(linestyle='dotted', color='gray')

        if label:
            ax.legend(loc='upper center', shadow=False)
        if title:
            plt.title(title)

    figure, ax = plt.subplots(1,1)
    tests_in_projects_by_time_of_update(df_old, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], linestyle=linestyle_dict['any'], zorder=1)
    tests_in_projects_by_time_of_update(df_old, unit_test_frameworks, label="Unit testing", color=colors_dict['unit_test_frameworks'], linestyle=linestyle_dict['unit_test_frameworks'], zorder=2)
    tests_in_projects_by_time_of_update(df_old, ui_automation_frameworks, label="GUI testing", color=colors_dict['ui_automation_frameworks'], linestyle=linestyle_dict['ui_automation_frameworks'], zorder=3)
    tests_in_projects_by_time_of_update(df_old, cloud_test_services, label="Cloud testing", color=colors_dict['cloud_test_services'], linestyle=linestyle_dict['cloud_test_services'], zorder=4)
    ax.set_xlabel("Years since last update")
    figure.tight_layout()
    figure.savefig(path_join(results_output, "mature_tests_by_update.pdf"))
    ax.invert_xaxis()
    figure.savefig(path_join(results_output, "mature_tests_by_update_i.pdf"))

    # ------------------------------------------------------------------------------- #

    # --- Descriptive stats for popularity metrics --- #
    dictionary = {
        "count": "$N$",
        "mean": "$\\bar{x}$",
        "std": "$s$",
        "min": "$min$",
        "max": "$max$",
        "rating_value": "Rating"
    }
    metrics = ['stars','forks', 'contributors', 'commits', 'rating_value', 'rating_count']


    def outliers_modified_z_score(ys):
        """Flag outliers using the modified z-score (median and MAD based).

        Returns a boolean array that is True where
        ``|0.6745 * (y - median) / MAD|`` exceeds 3.5.
        """
        cutoff = 3.5
        values = np.asarray(ys)

        center = np.median(values)
        # Median absolute deviation around the sample median.
        mad = np.median(np.abs(values - center))
        scores = 0.6745 * (values - center) / mad
        return np.abs(scores) > cutoff

    def outliers_z_score(ys):
        """Return a boolean mask that is True for values within 3 standard deviations.

        Despite the name, the mask marks the values to KEEP: callers index with
        it directly to drop the outliers.
        """
        # The absolute value must wrap the z-scores, not the comparison:
        # ``np.abs(zscore(ys) < 3)`` never rejected extreme low values.
        return np.abs(zscore(ys)) < 3

    def remove_outliers_df(df, metric):
        """Return the rows of ``df`` selected by ``outliers_z_score`` for ``metric``.

        Rows where ``metric`` is missing are dropped before the mask is computed.
        """
        complete_rows = df.dropna(subset=[metric])
        keep_mask = outliers_z_score(complete_rows[metric])
        return complete_rows[keep_mask]


    def remove_outliers(series):
        """Return the non-null values of ``series`` selected by ``outliers_z_score``."""
        present = series[series.notnull()]
        keep_mask = outliers_z_score(present)
        return present[keep_mask]

    def _descriptive_stats(series, ):
        return (
            series.count(),
            series.mean(),
            series.std(),
            series.min(),
            series.quantile(0.25),
            series.median(),
            series.quantile(0.75),
            series.max(),
            shapiro(series)[1] < 0.01 and "$p < 0.01$",
        )

    stats = []
    for metric in metrics:
        metric_title = metric.title().replace("_", " ")
        df_tmp = remove_outliers_df(df, metric)
        df_tmp_tests = df_tmp[df_tmp['tests']]
        stats.append((
            f"\\multirow{{2}}{{*}}{{{metric_title}}}",
            '$W$',
            *_descriptive_stats(df_tmp_tests[metric])
        ))
        df_tmp_wo_tests = df_tmp[~df_tmp['tests']]
        stats.append((
            "",
            '$WO$',
            *_descriptive_stats(df_tmp_wo_tests[metric])
        ))
    old_escape_rules = T.LATEX_ESCAPE_RULES
    T.LATEX_ESCAPE_RULES = {'%': '\\%'}
    table = tabulate(
        stats,
        headers=['', 'Tests', '$N$', '$\\bar{x}$', '$s$', '$min$', '$25%$', '$Md$', '$75%$', '$max$', '$X \sim N$'],
        # showindex=issues_column,
        tablefmt='latex',
        floatfmt=".1f",
    )
    T.LATEX_ESCAPE_RULES = old_escape_rules
    with open(path_join(results_output, "popularity_metrics_stats_2.tex"), 'w') as f:
        f.write(table)

    stats = pandas.concat([remove_outliers(df[metric]).describe() for metric in metrics], axis=1)
    stats = stats.applymap((lambda x: "${:.1f}$".format(float(x)))).astype(str)
    stats[['stars','forks', 'contributors', 'commits', 'rating_count']] = stats[['stars','forks', 'contributors', 'commits', 'rating_count']].applymap((lambda x: "${:.0f}$".format(float(x[1:-1])))).astype(str)
    stats.loc['count']= stats.loc['count'].map((lambda x: "${:.0f}$".format(float(x[1:-1])))).astype(str)

    old_escape_rules = T.LATEX_ESCAPE_RULES
    T.LATEX_ESCAPE_RULES = {'%': '\\%'}
    with open(path_join(results_output, "popularity_metrics_stats.tex"), 'w') as f:
        f.write(tabulate(
            stats,
            headers=[dictionary.get(column, column.title().replace("_", " ")) for column in stats.columns],
            showindex=[dictionary.get(name, name) for name in stats.index],
            tablefmt='latex',
            floatfmt=".1f"
        ))
    T.LATEX_ESCAPE_RULES = old_escape_rules
    ###box plots instead
    figure, axes = plt.subplots(2, 3)
    for index, ax, metric in zip(range(len(metrics)), [ax for subaxes in axes for ax in subaxes], metrics):
        values = remove_outliers(df[metric])
        metric_title = metric.title().replace("_", " ")
        ax.boxplot(values, whis=[5,95], showmeans=True, meanline=True,showfliers=True)
        ax.set_xticklabels([metric_title])
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['bottom'].set_visible(True)
        ax.yaxis.grid(linestyle='dotted', color='gray')
        if index != 4:
            ax.set_yscale('log')
            ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
        figure.tight_layout()
    figure.savefig(path_join(results_output, f"popularity_metrics_boxplot.pdf"))

    # -------------------------------------------------- #

    # --- Histogram for downloads --- #
    downloads_distribution = df_with_google_data.groupby('downloads')['downloads'].count()
    heights = df_with_google_data.groupby('downloads')['downloads'].count().values


    figure, ax = plt.subplots(1,1)
    labels = [
        str(human_format(int(cat.split(' - ')[0].replace(',',''))))
        + " – " +
        str(human_format(int(cat.split(' - ')[1].replace(',',''))))
        for cat in downloads_scale
    ]
    # ax.bar(
    #     range(len(labels)),
    #     heights,
    #     width=0.9,
    #     color=[column == '10,000 - 50,000' and 'C1' or 'C0' for column in downloads_scale],
    # )
    downloads_distribution.plot.bar(
        ax=ax,
        width=0.9,
        fontsize=14,
    )
    ax.set_xticklabels(labels, fontsize=14, rotation='vertical')
    ax.set_xlabel("Downloads", fontsize=15)
    ax.set_ylabel("Number of apps (out of {})".format(len(df.index)), fontsize=15)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(True)
    ax.yaxis.grid(linestyle='dotted', color='gray')

    # ax2 = ax.twinx()
    # ax2.grid(False)
    # ax2.set_ylim(ax.get_ylim())
    # ax2.set_yticklabels(["{:.0%}".format(tick/len(df_with_google_data)) for tick in ax2.get_yticks()], fontsize=14)
    # ax2.spines['right'].set_visible(False)
    # ax2.spines['top'].set_visible(False)
    # ax2.spines['left'].set_visible(False)
    # ax2.set_ylabel("Percentage of apps", fontsize=15)


    figure.tight_layout()
    figure.savefig(path_join(results_output, "downloads_hist.pdf"))
    # -------------------------------------------------- #

    # ---------- Hypothesis testing ------------- #
    popularity_metrics = [
        'stars',
        'forks',
        'contributors',
        'commits',
        'rating_value',
        'rating_count',
        # 'downloads'
    ]

    def cohen_d(y,x):
        """Return Cohen's d of two samples, based on the pooled standard deviation.

        Mind the argument order: the numerator is ``mean(x) - mean(y)``, i.e. the
        second argument minus the first.
        """
        size_x, size_y = len(x), len(y)
        pooled_dof = size_x + size_y - 2
        # Sample variances (ddof=1) weighted by their degrees of freedom.
        weighted_var_x = (size_x-1)*np.std(x, ddof=1) ** 2
        weighted_var_y = (size_y-1)*np.std(y, ddof=1) ** 2
        pooled_std = np.sqrt((weighted_var_x + weighted_var_y) / pooled_dof)
        return (np.mean(x) - np.mean(y)) / pooled_std

    def analyze_populations(a,b, continuous=True):
        """Compare two samples and return one row of the hypothesis-test table.

        Args:
            a: first sample.
            b: second sample; differences are computed as ``b - a``.
            continuous: when true, use the two-sample Kolmogorov-Smirnov test,
                otherwise the two-sided Mann-Whitney U test.

        Returns:
            dict mapping the table headers to values. Every entry is a
            LaTeX-formatted string except ``'$p$-value'``, which is the raw
            p-value of the selected test.
        """
        mean_difference = np.mean(b) - np.mean(a)
        median_difference = np.median(b) - np.median(a)
        improvement = mean_difference/np.mean(b)
        # Explicit branch instead of ``cond and x or y``: that idiom silently
        # fell back to the Mann-Whitney p-value when the KS p-value was 0.0.
        if continuous:
            statistic, p_value = ks_2samp(a,b)
        else:
            statistic, p_value = mannwhitneyu(a,b, alternative='two-sided')

        return {
            'Test': "${:,.0f}$".format(statistic),
            '$p$-value': p_value,
            '$\\Delta\\bar{x}$': "${:,.2f}$".format(mean_difference),
            '$\\Delta Md$': "${:,.2f}$".format(median_difference),
            'CL (%)': f"${cles(a,b):,.2%}$",
            'Cohen\'s $d$': f"${cohen_d(a,b):,.4f}$",
            '$d_r$': "${:.1%}$".format(improvement),
        }

    tests = []
    for metric in popularity_metrics:
        df_wo_outliers = remove_outliers_df(df, metric)
        tests.append(
            analyze_populations(
                df_wo_outliers[~df_wo_outliers['tests']][metric],
                df_wo_outliers[df_wo_outliers['tests']][metric],
                False
            )
        )

    # Apply multiple test correction ()
    pvalues = [test['$p$-value'] for test in tests]
    _,pvalues,*_ = multipletests(pvalues, alpha=0.05, method='fdr_bh')
    for test, pvalue in zip(tests, pvalues):
        test['$p$-value'] = "${:.4f}$".format(pvalue)


    old_escape_rules = T.LATEX_ESCAPE_RULES
    T.LATEX_ESCAPE_RULES = {'%': '\\%'}
    with open(path_join(results_output, "popularity_metrics_test.tex"), 'w') as f:
        f.write(tabulate(
            tests,
            headers='keys',
            showindex=[metric.title().replace("_"," ") for metric in popularity_metrics],
            tablefmt='latex',

        ))
    T.LATEX_ESCAPE_RULES = old_escape_rules
    # ------------------------------------------- #

    # ---------- Tests vs Rating with Rating count ------------- #
    x = range(0, 10000 , 100)
    y_with_tests = tuple(df_with_tests[df_with_tests['rating_count']>i]['rating_value'].mean() for i in x)
    y_without_tests = tuple(df_without_tests[df_without_tests['rating_count']>i]['rating_value'].mean() for i in x)

    figure, ax = plt.subplots()
    ax.scatter(x, y_with_tests, marker='o', color='C0', label="With tests", zorder=2)
    ax.plot(x, y_with_tests, alpha=0.5, color='C0', zorder=1)
    ax.scatter(x, y_without_tests, marker='2', color='r', label="Without tests", zorder=2)
    ax.plot(x, y_without_tests, alpha=0.5, color='r', zorder=1)
    ax.legend(loc='upper center')

    ax.set_ylabel("Rating")
    ax.set_xlabel("Rating count >")
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    figure.tight_layout()
    figure.savefig(path_join(results_output, "rating_with_lower_limit.pdf"))
    # --------------------------------------------------------- #

    # ------------------ CI/CD platforms hist --------------- #

    figure, ax = plt.subplots()
    namepedia={
        "circleci": "Circle CI",
        "travis": "Travis CI",
    }
    df[['ci/cd']+ci_services].sum().plot.bar(
        fontsize=15, edgecolor = 'k', color='black', width=0.25, linewidth = [1]+[0]*len(ci_services)
    )
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.grid(linestyle='dotted', color='gray')
    ax.set_ylabel("Number of apps (out of {})".format(len(df.index)), fontsize=15)
    ax.set_xticklabels(["All"]+[namepedia.get(key, key.title().replace('_', ' ')) for key in ci_services])

    # ax2 = ax.twinx()
    # ax2.grid(False)
    # ax2.set_ylim(ax.get_ylim())
    # ax2.set_yticklabels(["{:.0%}".format(tick/len(df)) for tick in ax2.get_yticks()], fontsize=15)
    # ax2.spines['right'].set_visible(False)
    # ax2.spines['top'].set_visible(False)
    # ax2.spines['left'].set_visible(False)
    # ax2.set_ylabel("Percentage of apps", fontsize=15)

    for p in ax.patches:
        ax.annotate("{:.0f}".format(p.get_height()), (p.get_x() +p.get_width()/2, p.get_height()+4), ha='center', fontsize=14)
    figure.tight_layout()
    figure.savefig(path_join(results_output, "ci_cd_hist.pdf"))
    # ------------------------------------------------------- #

    # ---------------- Mosaic CI/CD ---------------- #
    from statsmodels.graphics.mosaicplot import mosaic
    def properties(keys):
        """Pick the tile colour of a mosaic cell from its key labels.

        Green when every label is the string ``'True'``, yellow when only some
        are, and coral when none is.
        """
        is_true = [key == 'True' for key in keys]
        if not all(is_true):
            colour = 'lightgoldenrodyellow' if any(is_true) else 'lightcoral'
        else:
            colour = 'lightgreen'
        return {'color': colour}

    figure, ax  = plt.subplots(figsize=(4.5,3.5))
    labelizer = lambda k: {
        ('False','False'): 'A. No Tests and no CI/CD\n({:.1%})'.format(1 - df[["tests", "ci/cd"]].any(axis=1).sum()/len(df)),
        ('True','False'): 'B. With Tests but\nno CI/CD\n({:.1%})'.format(sum(df["tests"] & ~df["ci/cd"])/len(df)),
        ('False','True'): 'C. No Tests but with CI/CD\n({:.1%})'.format(sum(~df["tests"] & df["ci/cd"])/len(df)),
        ('True','True'): 'D. With Tests and\nwith CI/CD\n({:.1%})'.format(df[["tests", "ci/cd"]].all(axis=1).sum()/len(df)),
    }.get(k, k)

    mosaic(df, ["tests", "ci/cd"], properties= properties, labelizer=labelizer, ax=ax)
    ax.set_xticklabels(['No tests', 'With tests'])
    ax.set_yticklabels(['With CI/CD', 'No CI/CD'])
    # ax.spines['left'].linewidth = 1
    # ax.spines['top'].linewidth = 1
    # ax.spines['right'].linewidth = 1
    # ax.spines['bottom'].linewidth = 1
    ax.invert_yaxis()
    figure.tight_layout()
    figure.savefig(path_join(results_output, "ci_cd_mosaic.pdf"))

    obs = [
        [sum(~df["tests"] & df["ci/cd"]), sum(~df["tests"] & ~df["ci/cd"])], #No tests
        [sum(df["tests"] & df["ci/cd"]), sum(df["tests"] & ~df["ci/cd"])] #Tests
    ]
    chi,pvalue,dof,_ = chi2_contingency(obs)
    print("Relationship between Ci/CD and Automated testing:")
    print("Chi={}, dof={}, p={}".format(chi, dof, pvalue))
    # ------------------------------------------------------- #

    # ------------------ Sonar vs tests --------------- #
    features = [
        # 'sonar_issues_ratio',
        'sonar_blocker_issues_ratio',
        'sonar_critical_issues_ratio',
        'sonar_major_issues_ratio',
        'sonar_minor_issues_ratio'
    ]
    names = [
        # 'Any',
        'Blocker',
        'Critical',
        'Major',
        'Minor'
    ]
    options = {
        'sym':       '',
        'meanline':  True,
        'showmeans': True,
        'patch_artist': True,
    }

    figure, ax = plt.subplots(1,1)
    boxplot = ax.boxplot(
        [
            df_tmp[feature].dropna().values
            for feature in features
            for df_tmp in (df_with_tests, df_without_tests)
        ],
        labels=(
            'With Tests',
            'Without Tests'
        )*len(features),
        **options
    )

    colors = (
        'C0',
        'darkred'
    )*len(features)
    hatches = (
        '/',
        ''
    )*len(features)
    for patch, color, hatch in zip(boxplot['boxes'], colors, hatches):
        patch.set_edgecolor(color)
        patch.set_facecolor((1,1,1,0.8))
        patch.set_hatch(hatch)
        patch.set_alpha(0.9)
    for cap, whisker, color in zip(boxplot['caps'], boxplot['whiskers'], np.repeat(colors,2)):
        cap.set_color(color)
        whisker.set_color(color)

    # legend
    circ1 = mpatches.Patch(facecolor='white', edgecolor=colors[0], hatch=hatches[0], label='With Tests')
    circ2 = mpatches.Patch(facecolor='white', edgecolor=colors[1], hatch=hatches[1], label='Without Tests')
    ax.legend(handles=(circ1,circ2), facecolor='white')
    # -----

    ax.yaxis.grid(linestyle='dotted', color='gray')
    ax.set_xticklabels(names)
    xticks = np.arange(1.5, len(features)*2+0.5, 2)
    ax.set_xticks(xticks)
    ax.set_ylabel('Number of issues per file')
    ax.set_xlabel('Severity of issues')

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)


    mean_differences = [
        df_without_tests[feature].dropna().mean() -
        df_with_tests[feature].dropna().mean()
        for feature in features
    ]
    median_differences = [
        df_without_tests[feature].dropna().median() -
        df_with_tests[feature].dropna().median()
        for feature in features
    ]

    relative_differences = [
        int((df_without_tests[feature].dropna().median() - df_with_tests[feature].dropna().median()) / df_with_tests[feature].dropna().median()*100)
        for feature in features
    ]
    cles_values = [
        "{:.2%}".format(cles(
            df_with_tests[feature].dropna(),
            df_without_tests[feature].dropna()
        ))
        for feature in features
    ]
    cohensd_values = [
        cohen_d(
            df_with_tests[feature].dropna(),
            df_without_tests[feature].dropna()
        )
        for feature in features
    ]

    tester = ks_2samp
    tester = mannwhitneyu
    # tester = ttest_ind
    pvalues = [
        tester(
            df_without_tests[feature].dropna().values,
            df_with_tests[feature].dropna().values,
            # alternative="two-sided"
            # equal_var=False,
        ).pvalue
        for feature in features
    ]
    #multiple test correction ()
    _,pvalues,*_ = multipletests(pvalues, alpha=0.05, method='fdr_bh')

    # # Add info boxes to the boxplot
    # bbox_props_not_significant = dict(boxstyle="round,pad=0.3", fc=(1,1,1,0.8), ec='lightgray', lw=0.5)
    # bbox_props_significant = dict(boxstyle="round,pad=0.3", fc=(1,1,1,0.8), ec='black', lw=0.5)
    # for name, x, mean_difference, median_difference, pvalue in zip(names, xticks, mean_differences, median_differences, pvalues):
    #     if pvalue < 0.05:
    #         bbox_props = bbox_props_significant
    #     else:
    #         bbox_props = bbox_props_not_significant
    #     ax.annotate(
    #         (
    #             r"$\Delta\bar{{x}} = {:.2f}$".format(mean_difference)+"\n"+
    #             r"$\Delta Md = {:.2f}$".format(median_difference)+"\n"+
    #             r"$p = {:.4f}$".format(pvalue)
    #         ),
    #         (x,2.5),
    #         va='top', ha='center',
    #         fontsize=11,
    #         bbox=bbox_props
    #     )
    for patch,pvalue,color in zip(boxplot['boxes'], np.repeat(pvalues,2), colors):
        if pvalue < 0.05:
            # patch.set_facecolor((1.0,1.0,0.8,0.7))
            # patch.set_facecolor(color)
            # patch.set_hatch("\\")
            patch.set_linewidth(2)

    figure.tight_layout()
    figure.savefig(path_join(results_output, "sonar_vs_tests.pdf"))

    #SONAR ISSUEs SIGNIFICANCE RESULTS TABLE
    table_values = list(zip(names, mean_differences, median_differences, relative_differences, cles_values, cohensd_values, pvalues))
    old_escape_rules = T.LATEX_ESCAPE_RULES
    T.LATEX_ESCAPE_RULES = {'%': '\\%'}
    table = tabulate(
        table_values,
        headers=['Severity', r"$\Delta\bar{{x}}$", r"$\Delta Md$", r"$\frac{\Delta{}Md}{Md_W}$(%)",'CL (%)','Cohen\'s $d$', '$p$-value'],
        # showindex=issues_column,
        tablefmt='latex',
        floatfmt=".4f",
    )
    T.LATEX_ESCAPE_RULES = old_escape_rules
    with open(path_join(results_output, "sonar_metrics_test.tex"), 'w') as f:
        f.write(table)


    from itertools import chain
    issues_column = list(chain.from_iterable([("\multirow{{2}}{{*}}{{{}}}".format(name), ' ') for name in names]))
    old_escape_rules = T.LATEX_ESCAPE_RULES
    T.LATEX_ESCAPE_RULES = {'%': '\\%'}
    table = tabulate(
        [
            (
                sample_name,
                df_tmp[feature].dropna().count(),
                "${:.4f}$".format(df_tmp[feature].dropna().median()),
                "${:.4f}$".format(df_tmp[feature].dropna().mean()),
                "${:.4f}$".format(df_tmp[feature].dropna().std()),
                shapiro(df_tmp[feature].dropna())[1] < 0.0001 and "$p < 0.0001$",
            )
            for feature in features
            for (df_tmp, sample_name) in ((df_with_tests, '$W$'), (df_without_tests, '$WO$'))
        ],
        headers=['Tests', '$N$', '$Md$', '$\\bar{x}$', '$s$', '$X \sim N$'],
        showindex=issues_column,
        tablefmt='latex',
    )
    T.LATEX_ESCAPE_RULES = old_escape_rules
    with open(path_join(results_output, "sonar_metrics.tex"), 'w') as f:
        f.write(table)
    # ------------------------------------------------- #


    ###############
    # Hall of Fame
    ###############
    hall_of_fame = df[df[['ci/cd', 'unit_tests', 'ui_tests']].all(axis=1)].sort_values('stars', ascending=False)
    categories = hall_of_fame['category'].unique()
    small_hall_of_fame = [hall_of_fame[hall_of_fame['category']==category].iloc[0][['user', 'project_name']] for category in categories ]
    small_hall_of_fame_table = tabulate(
        small_hall_of_fame,
        headers=['Category', 'Organization', 'Project Name'],
        showindex=list(categories),
        tablefmt='latex',
    )
    with open(path_join(results_output, "small_hall_of_fame.tex"), 'w') as f:
        f.write(small_hall_of_fame_table)
    #############

    #### Categories ######
    figure, ax = plt.subplots(figsize=(6.4, 4))
    (df[['app_id','category']]
     .groupby('category')
     .count()
     .plot.bar(color='black', width=0.25, ax=ax))
    ax.legend().remove()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.yaxis.grid(linestyle='dotted', color='gray')
    ax.set_xlabel('Category')
    ax.set_ylabel('Number of Apps')

    figure.tight_layout()
    figure.savefig(path_join(results_output, "categories.pdf"))
    ######################

    # --- Percentage of Android tests over the age of the apps (cumulated) --- #
    def tests_in_projects_by_time_of_creation_cumm(df_projects, frameworks,
                                                   title=None, verbose=False, **kwargs):
        """Plot the cumulative share of projects using tests, from oldest to newest.

        Walks the values of the ``age_numeric`` column from the maximum down to
        0, accumulating the number of projects for which any of the
        ``frameworks`` columns is truthy. Each point is that running count
        divided by the total number of rows in ``df_projects`` (not by the
        running number of projects). The result is drawn on the current
        matplotlib axes as a line plus a scatter whose marker sizes are the
        per-age project counts.

        Args:
            df_projects: DataFrame with an ``age_numeric`` column and one column
                per entry of ``frameworks``.
            frameworks: list of column names flagging the use of a framework.
            title: optional title for the axes.
            verbose: when true, print the per-age counts.
            **kwargs: forwarded to ``plt.plot``; ``zorder`` and ``color`` are
                also applied to the scatter markers.
        """
        project_age = df_projects['age_numeric']
        age_max = project_age.max()+1
        ages = range(age_max)[::-1]
        n_all_projects = len(df_projects)
        total_projects_history = []
        portions = []
        with_tests_cum = 0
        total_cum = 0
        for age in ages:
            # Select the bucket once and reuse it for both counts.
            df_age = df_projects[project_age == age]
            n_projects_with_tests = df_age[frameworks].apply(any, axis=1).sum()
            total_projects = len(df_age.index)
            total_projects_history.append(total_projects)
            if verbose:
                # ``portion`` was never defined here, so verbose mode raised a
                # NameError; report the per-age share of projects with tests.
                portion = n_projects_with_tests/total_projects if total_projects else 0
                print("Age {}:".format(age))
                print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion))
            # Running totals replace the repeated ``sum(history[:index+1])``.
            with_tests_cum += n_projects_with_tests
            total_cum += total_projects
            if total_cum > 0:
                portions.append(with_tests_cum/n_all_projects)
            else:
                portions.append(0)
        plt.plot(ages, portions, **kwargs)
        plt.scatter(
            ages, portions, total_projects_history,
            marker='o',
            zorder=kwargs.get('zorder'),
            color=kwargs.get('color')
        )
        ax = plt.gca()
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.set_xticks(ages)
        ax.set_yticklabels(["{:.0%}".format(tick) for tick in ax.get_yticks()])
        ax.set_ylabel("Percentage of projects")
        ax.yaxis.grid(linestyle='dotted', color='gray')
        ax.legend(loc='upper center', shadow=False)
        if title:
            ax.set_title(title)

    figure, ax = plt.subplots(1,1)
    tests_in_projects_by_time_of_creation_cumm(
        df,
        unit_test_frameworks+ui_automation_frameworks+cloud_test_services,
        label="Any", color=colors_dict['any'], zorder=2,
        linestyle=linestyle_dict['any'],
    )
    tests_in_projects_by_time_of_creation_cumm(
        df,
        ['no_tests'],
        label="No tests", color='darkred', zorder=5,
        linestyle="--",
    )
    ax.set_xlabel("Years since first commit")
    ax.axvspan(0,2, color='darkgreen', alpha=0.1)
    figure.tight_layout()
    figure.savefig(path_join(results_output, "tests_by_age_cumm_3.pdf"))
    ax.invert_xaxis()
    figure.savefig(path_join(results_output, "tests_by_age_cumm_3_i.pdf"))
# but the range of age for non survivor is larger

titanic.boxplot(column="Fare", by="Survived")
# Survivors tended to hold more expensive tickets ==> richer passengers were more likely to survive

titanic.boxplot(column="SibSp", by="Survived")
# Not much information

titanic.boxplot(column="Parch", by="Survived")
# Not much information

table = pandas.crosstab(titanic["Survived"], titanic["Pclass"])
print(table)
from statsmodels.graphics.mosaicplot import mosaic

mosaic(titanic, ["Pclass", "Survived"])
# Most of the victims were in class 3

table2 = pandas.crosstab(titanic["Survived"], titanic["Sex"])
print(table2)
mosaic(titanic, ["Survived", "Sex"])
# Female passengers were more likely to survive than male passengers

table3 = pandas.crosstab(titanic["Survived"], titanic["Embarked"])
print(table3)
mosaic(titanic, ["Survived", "Embarked"])
# Most passengers embarked at port S


# Fill missing value of variable Age by its median
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())
tail_prob = prob.loc[~mask].sum()
prob = prob.loc[mask]
if not tail_prob == 0:
    prob['other'] = tail_prob
prob.plot(kind='bar')
plt.xticks(rotation=25)
plt.show()

## Max commercials during 8 AM and then in night hours, prime time

# In[55]:

from statsmodels.graphics.mosaicplot import mosaic
plt.rcParams['font.size'] = 0.5
plt.figure(figsize=(200000, 100000))
mosaic(df, ['Category', 'Network'])

## Observations
# Electronics & Communication on TBS was the best combo of all

# In[53]:

# cross tab of category and network
table1 = pd.crosstab(index=df["Category"], columns=df["Network"])

table1.plot(kind="bar", figsize=(20, 20), stacked=True)
# Observation
# same as above

# In[87]:
plt.title("Frequency of Sentiment Scores")
plt.show()


# In[21]:

dfFull.ix[:, -5:-1].hist()

plt.show()


# In[62]:

from statsmodels.graphics.mosaicplot import mosaic

mosaic(dfFull, ["Age", "compound"])


# In[23]:

dfFull["Continent"] = dfFull["Continent"].astype(
    "category", categories=["America", "Africa", "Asia", "Europe"], ordered=True
)


# In[24]:

dfFull["Income"] = dfFull["Income"].astype(
    "category",
    categories=["20000-34999", "<20000", "35000-49999", "50000-74999", "75000-99999", "100000+"],
    ordered=True,
예제 #39
0
titanic_df.shape
# (891, 12)

# have a peek view of data
titanic_df.head()
summary = titanic_df.describe()
nullsum = titanic_df.isnull().sum()

# mosaic plots
tb1 = pd.crosstab(titanic_df['Pclass'], titanic_df['Survived'])
tb2 = pd.crosstab(titanic_df['Sex'], titanic_df['Survived'])

from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt

fig1, ax1 = plt.subplots()
m11 = mosaic(tb1.stack(),
             ax=ax1,
             labelizer=lambda x: tb1.loc[int(x[0]), int(x[1])])
ax1.set_yticklabels(['Deceased', 'Survived'])
ax1.set_title("Ticket Class and Survivability")

fig2, ax2 = plt.subplots()
m22 = mosaic(tb2.stack(), ax=ax2, labelizer=lambda y: tb2.loc[y[0], int(y[1])])
ax2.set_yticklabels(['Deceased', 'Survived'])
ax2.set_title("Gender and Survivability")

# box plots
bp1 = titanic_df.boxplot(column='Age', by='Survived')
bp1.set_ylabel("Age")
bp1.set_xlabel("Survival")
예제 #40
0
def scatter_matrix_all(frame, alpha=0.5, figsize=None, grid=False, diagonal='hist', marker='.', density_kwds=None, hist_kwds=None, range_padding=0.05, **kwds):
    """Draw a pairwise plot matrix covering numeric and non-numeric columns.

    Builds an n x n grid of subplots, one row and one column per column of
    ``frame``. Columns reported by ``frame._get_numeric_data()`` are treated as
    numerical, every other column as categorical:

    * diagonal, numerical: histogram (``diagonal='hist'``) or Gaussian KDE
      (``diagonal`` in ``('kde', 'density')``);
    * diagonal, categorical: bar chart of the value counts;
    * two numerical columns: below the diagonal a scatter plot with a lowess
      curve drawn in red, above it the coefficient from ``DataFrame.corr()``
      as text;
    * two categorical columns: a mosaic plot below the diagonal only;
    * one of each: box plots grouped by the categorical column, below the
      diagonal only.

    Args:
        frame: DataFrame to plot.
        alpha: transparency of the scatter markers.
        figsize: figure size passed to ``plt.subplots``.
        grid: accepted but not used in this function body.
        diagonal: ``'hist'``, ``'kde'`` or ``'density'``; see above.
        marker: scatter marker, passed through ``_get_marker_compat`` first.
        density_kwds: keyword arguments for the KDE line plot.
        hist_kwds: keyword arguments for the diagonal histograms.
        range_padding: fraction of each column's value range added (half on
            each side) to the axis limits.
        **kwds: forwarded to ``ax.scatter``; ``c`` defaults to the
            ``patch.facecolor`` rc setting.

    Returns:
        The matplotlib figure holding the grid.
    """

    df = frame
    num_cols = frame._get_numeric_data().columns.values
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    # NOTE(review): `com.notnull` and `_get_marker_compat` are defined outside
    # this block; `mask` is used below as a per-column not-null indicator.
    mask = com.notnull(df)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) per column: taken from the non-null values of numerical
    # columns and from the value counts of categorical ones.
    boundaries_list = []
    for a in df.columns:
        if a in num_cols:
            values = df[a].values[mask[a].values]
        else:
            values = df[a].value_counts()
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_+ rdelta_ext))

    # `a` is the row variable (index i), `b` the column variable (index j).
    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                if a in num_cols:    # numerical variable
                    values = df[a].values[mask[a].values]
                    # Deal with the diagonal by drawing a histogram there.
                    if diagonal == 'hist':
                        ax.hist(values, **hist_kwds)
                    elif diagonal in ('kde', 'density'):
                        from scipy.stats import gaussian_kde
                        y = values
                        gkde = gaussian_kde(y)
                        ind = np.linspace(y.min(), y.max(), 1000)
                        ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                    ax.set_xlim(boundaries_list[i])
                else:                # categorical variable
                    values = df[a].value_counts()
                    ax.bar(list(range(df[a].nunique())), values)
            else:
                # Rows where both variables are non-null.
                common = (mask[a] & mask[b]).values
                # two numerical variables
                if a in num_cols and b in num_cols:
                    if i > j:
                        ax.scatter(df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds)
                        # The following 2 lines add the lowess smoothing
                        ys = lowess(df[a][common], df[b][common])
                        ax.plot(ys[:,0], ys[:,1], 'red')
                    else:
                        # Above the diagonal: print the correlation instead.
                        pearR = df[[a, b]].corr()
                        ax.text(df[b].min(), df[a].min(), 'r = %.4f' % (pearR.iloc[0][1]))
                    ax.set_xlim(boundaries_list[j])
                    ax.set_ylim(boundaries_list[i])
                # two categorical variables
                elif a not in num_cols and b not in num_cols:
                    if i > j:
                        from statsmodels.graphics import mosaicplot
                        # Empty labelizer: the tiles are drawn without text.
                        mosaicplot.mosaic(df, [b, a], ax, labelizer=lambda k:'')
                # one numerical variable and one categorical variable
                else:
                    if i > j:
                        tol = pd.DataFrame(df[[a, b]])
                        if a in num_cols:
                            # Numerical row variable: one vertical box per category of b.
                            label = [ k for k, v in tol.groupby(b) ]
                            values = [ v[a].tolist() for k, v in tol.groupby(b) ]
                            ax.boxplot(values, labels=label)
                        else:
                            # Categorical row variable: one horizontal box per category of a.
                            label = [ k for k, v in tol.groupby(a) ]
                            values = [ v[b].tolist() for k, v in tol.groupby(a) ]
                            ax.boxplot(values, labels=label, vert=False)

            ax.set_xlabel('')
            ax.set_ylabel('')

            # NOTE(review): `_label_axis` is defined outside this block.
            _label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            # Only the first column keeps its y axis and only the last row its x axis.
            if j!= 0:
                ax.yaxis.set_visible(False)
            if i != n-1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)
    return fig
#print(data)

# Animal classes and conservation categories that the chart is restricted to.
classes = ["Mammalia", "Aves", "Reptilia"]
statuses = ["Endangered", "Critically endangered", "Vulnerable"]

# Keep only the records whose class and category are both of interest.
mosaic_data = [
    item
    for item in data
    if item["Animal Class"] in classes and item["Category"] in statuses
]
mosaic_dataframe = pd.DataFrame(mosaic_data)

# Tile colour for each conservation category.
properties = {
    status: {"color": colour}
    for status, colour in (
        ("Endangered", "#FACDB6"),
        ("Critically endangered", "#C5CADE"),
        ("Vulnerable", "#A8DBD2"),
    )
}

plt.rc("font", size=8)

# Each tile key is a (Category, Animal Class) tuple; the colour is looked up
# from its first element, the category.
fig = mosaic(
    mosaic_dataframe,
    ["Category", "Animal Class"],
    title="Conservation Status by Animal Class",
    gap=[0.02, 0.02],
    axes_label=True,
    properties=lambda key: properties[key[0]],
)

plt.savefig("endangered_species.png")
ticket          Ticket Number
fare            Passenger Fare
cabin           Cabin
embarked        Port of Embarkation
                (C = Cherbourg; Q = Queenstown; S = Southampton)
'''

########
## Performing some EDA of the data
########

# The 'Survived' column is the response series.
response_series = df['Survived']

# Mosaic plots of survival against passenger class and against sex.
mosaic(
    df,
    ['Pclass', 'Survived'],
    title='Survival Rate by Class',
)
mosaic(
    df,
    ['Sex', 'Survived'],
    title='Survival Rate by Gender',
)

## Creating a function to pull out the titles of the Passengers

def find_between(s, first, last):
    """Return the part of *s* lying between *first* and *last*.

    The slice starts one character past the end of the first occurrence of
    *first* (so a single separator character, such as the space in
    ``", Mr."``, is skipped) and stops just before the next occurrence of
    *last* found at or after that position.

    Returns an empty string when either marker cannot be found.
    """
    try:
        begin = s.index(first) + len(first) + 1
        stop = s.index(last, begin)
    except ValueError:
        # One of the markers is missing.
        return ""
    return s[begin:stop]

# Smoke-test the title extraction on entry 100 of the Name column.
test_title = find_between(df.Name[100], ',', '.')
# Call form of print: valid on Python 3, and with a single argument it prints
# the same text on Python 2 (the statement form is a SyntaxError on Python 3).
print(test_title)
def _dtype_category(dtype):
    """Classify a column dtype as ``'is_numeric'`` or ``'is_string'``.

    Integer, unsigned-integer and float dtypes (``dtype.kind`` in
    ``'i'``/``'u'``/``'f'``) are numeric; every other dtype is treated as
    categorical so the caller always gets a usable classification.
    """
    if getattr(dtype, "kind", None) in ("i", "u", "f"):
        return 'is_numeric'
    return 'is_string'


def _show_rotated_countplot(x, hue, data):
    """Draw a count plot of column *x* with vertical tick labels and show it."""
    g = sns.countplot(x=x, hue=hue, data=data)
    loc, labels = plt.xticks()
    g.set_xticklabels(labels, rotation=90)
    plt.show()


def view_image(headers_x: headers_x, headers_y: headers_y,
               colour_headers: colour_headers):
    """Plot columns of the module-level ``df`` chosen by the three selectors.

    Each argument is either a column name or the placeholder ``'Select'``
    meaning "nothing chosen". The parameter annotations are the same-named
    module-level objects (presumably selection widgets wired up through
    ``interact`` -- confirm against the caller).

    Plot chosen per selection:

    * X only, categorical -> count plot (optionally split by colour column)
    * X only, numeric     -> distribution plot (optionally one per colour)
    * numeric X, numeric Y         -> scatter plot without regression line
    * numeric X, categorical Y     -> violin plot
    * categorical X, numeric Y     -> box plot
    * categorical X, categorical Y -> mosaic plot, or a count plot when both
      selectors name the same column (the colour column is ignored here)

    Nothing is drawn when X is not selected. Returns ``None``.
    """
    data = df

    # Every selection state uses the same plotting context.
    sns.set_context("notebook", font_scale=1.1)

    # Without an X column (no selection at all, or Y only) there is no plot.
    if headers_x == 'Select':
        return

    colour_chosen = colour_headers != 'Select'
    # seaborn treats hue=None exactly like an omitted hue argument.
    hue = colour_headers if colour_chosen else None

    x = headers_x
    x_type = _dtype_category(data[x].dtype)

    # X selected but not Y
    # ========================================================================
    if headers_y == 'Select':
        if x_type == 'is_string':
            _show_rotated_countplot(x, hue, data)
        elif not colour_chosen:
            sns.distplot(data[x])
            plt.show()
        else:
            # One distribution per value of the colour column.
            g = sns.FacetGrid(data, hue=colour_headers)
            g = g.map(sns.distplot, x)
            plt.show()
        return

    # Both X and Y selected
    # ========================================================================
    y = headers_y
    y_type = _dtype_category(data[y].dtype)

    sns.set_style("ticks")

    if x_type == 'is_numeric' and y_type == 'is_numeric':
        # Scatter plot only: the regression fit is switched off.
        sns.lmplot(
            x=x,
            y=y,
            data=data,
            fit_reg=False,
            hue=hue,
            scatter_kws={"marker": "D"},
            legend=True)
        plt.show()
    elif x_type == 'is_numeric':
        # Numeric X against categorical Y.
        sns.violinplot(x=x, y=y, hue=hue, data=data)
        plt.show()
    elif y_type == 'is_numeric':
        # Categorical X against numeric Y.
        sns.boxplot(x=x, y=y, hue=hue, data=data)
        plt.show()
    elif headers_x != headers_y:
        # Two different categorical columns.
        mosaic(data, [x, y])
        plt.show()
    else:
        # The same categorical column on both axes: fall back to counts.
        _show_rotated_countplot(x, None, data)
df

# %%
# Calculate the Cramér's V coefficient for Survived and Pclass
# (here with the bias correction switched off).
cramers_v(df['Survived'], df['Pclass'], bias_correction=False)

# %%
# Verify that Cramér's V is symmetric: swapping the two arguments
# should give the same value, so this cell should evaluate to True.
cramers_v(df['Survived'], df['Pclass']) == cramers_v(df['Pclass'],
                                                     df['Survived'])

# %%
# The same pair of variables can also be drawn as a mosaic plot.
mosaic(data=df,
       index=['Survived', 'Pclass'],
       statistic=True,
       axes_label=True,
       gap=[0.01, 0.02])

# %%
# Theil's U is asymmetric; compute it for the same pair of variables.
# This is U(Survived|Pclass), i.e. "U for Survived given Pclass".
theils_u(df['Survived'], df['Pclass'])

# %%
# Check that the opposite direction gives a different result.
theils_u(df['Pclass'], df['Survived'])

# %%
# Let's draw a violin plot of Age and Pclass
violinPlot(data=df,
예제 #45
0
	      line_group="pop")
fig.show()
#%%
gapminder
#%%
#Mekko chart
import pandas as pd
from statsmodels.graphics.mosaicplot import mosaic
import pylab
from itertools import product
import numpy as np
rand = np.random.random

# Every (gender, answer) combination, in the order
# male/yes, male/no, female/yes, female/no.
speaks_mul_foreign_languages = [
    (gender, answer)
    for gender in ['male', 'female']
    for answer in ['yes', 'no']
]
index = pd.MultiIndex.from_tuples(
    speaks_mul_foreign_languages,
    names=['male', 'female'],
)

# One random weight per combination sets the tile sizes.
data = pd.Series(rand(len(speaks_mul_foreign_languages)), index=index)

mosaic(
    data,
    gap=0.01,
    title='Who knows multiple foreign languages? - Mosaic Chart',
)
pylab.show()

#%%
#Pie chart
import matplotlib.pyplot as plt

# Slice labels, sizes and colours, listed in matching order.
labels = ('Python', 'C++', 'Ruby', 'Java')
sizes = [215, 130, 245, 210]
colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
# Offset only the first slice from the centre.
explode = (0.1, 0, 0, 0)

# Draw the pie with one-decimal percentage labels on each slice.
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)