def test_mosaic_very_complex():
    """Draw a scatter-matrix of pairwise mosaic plots for four variables.

    Diagonal cells only show the variable name.  Every off-diagonal cell
    shows the mosaic of the two corresponding variables; the remaining
    variables are collapsed with ``_reduce_dict``.  This could easily be
    turned into a helper that builds such a grid automatically.
    """
    key_name = ["gender", "age", "health", "work"]
    key_base = (
        ["male", "female"],
        ["old", "young"],
        ["healty", "ill"],
        ["work", "unemployed"],
    )
    keys = list(product(*key_base))
    # the n-th category combination gets the count n (1-based)
    data = OrderedDict((key, count) for count, key in enumerate(keys, start=1))
    props = {
        ("male", "old"): {"color": "r"},
        ("female",): {"color": "pink"},
    }
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i, j in product(range(L), repeat=2):
        if i == j:
            # diagonal: just a centred label, no ticks
            cell = axes[i, i]
            cell.text(0.5, 0.5, key_name[i], ha="center", va="center")
            cell.set_xticks([])
            cell.set_xticklabels([])
            cell.set_yticks([])
            cell.set_yticklabels([])
            continue
        others = set(range(L)) - {i, j}
        low, high = min(i, j), max(i, j)
        # move the two plotted variables to the front of every key
        temp_data = OrderedDict(
            ((k[low], k[high]) + tuple(k[r] for r in others), v)
            for k, v in data.items()
        )
        # replace every full key by its two-variable prefix
        for full_key in list(temp_data.keys()):
            temp_data[full_key[:2]] = _reduce_dict(temp_data, full_key[:2])
            del temp_data[full_key]
        mosaic(temp_data, ax=axes[i, j], axes_label=False,
               properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle("old males should look bright red, (plot 4 of 4)")
def testTransformDiscreteVar(df, newVarName, transformFunction, targetVar='Vote'):
    """Add a derived column to ``df`` and save its mosaic plot against ``targetVar``.

    Parameters
    ----------
    df : DataFrame
        Input data.  It is modified in place: the column ``newVarName`` is
        added (or overwritten).
    newVarName : str
        Name of the column that receives the transformed values.
    transformFunction : callable
        Called as ``transformFunction(df)``; its result is stored in
        ``df[newVarName]``.
    targetVar : str, optional
        Column plotted against the new variable (default ``'Vote'``).

    The plot is written to ``'Temp/<newVarName>by<targetVar>.png'``.
    """
    df[newVarName] = transformFunction(df)
    # mosaic() opens its own figure when no axes are supplied, so a bare
    # plt.figure() beforehand left one empty figure open on every call.
    # Create the figure once, draw into it and close exactly that figure.
    fig, ax = plt.subplots()
    try:
        mosaic(df, [targetVar, newVarName], ax=ax)
        fig.savefig('Temp/' + newVarName + 'by' + targetVar + '.png')
    finally:
        plt.close(fig)
def test_mosaic_simple():
    """Plot synthetic data over four categorical variables with custom colours.

    The groups get increasing sizes (1, 2, 3, ...) in cartesian-product
    order of the levels.
    """
    # creation of the levels
    key_set = (
        ["male", "female"],
        ["old", "adult", "young"],
        ["worker", "unemployed"],
        ["healty", "ill"],
    )
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict((key, size) for size, key in enumerate(keys, start=1))

    # colours per category: males in blue, females in red
    props = {("male",): {"color": "b"}, ("female",): {"color": "r"}}
    # every group containing "ill" gets its own hatched colour
    for key in keys:
        if "ill" not in key:
            continue
        shade = "BlueViolet" if "male" in key else "Crimson"
        props[key] = {"color": shade, "hatch": "+"}

    # mosaic of the data, with given gaps and colors
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle("syntetic data, 4 categories (plot 2 of 4)")
def test_mosaic_simple():
    """Plot synthetic data over four categorical variables, then close all figures.

    The groups get increasing sizes (1, 2, 3, ...) in cartesian-product
    order of the levels.
    """
    # creation of the levels
    key_set = (
        ['male', 'female'],
        ['old', 'adult', 'young'],
        ['worker', 'unemployed'],
        ['healty', 'ill'],
    )
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict((key, size) for size, key in enumerate(keys, start=1))

    # which colours to use for the various categories:
    # males and females in blue and red
    props = {('male',): {'color': 'b'}, ('female',): {'color': 'r'}}
    # all the groups corresponding to ill groups have a different color
    ill_keys = [key for key in keys if 'ill' in key]
    for key in ill_keys:
        shade = 'BlueViolet' if 'male' in key else 'Crimson'
        props[key] = {'color': shade, 'hatch': '+'}

    # mosaic of the data, with given gaps and colors
    mosaic(data, gap=0.05, properties=props, axes_label=False)
    pylab.suptitle('syntetic data, 4 categories (plot 2 of 4)')
    # pylab.show()
    pylab.close('all')
def test_axes_labeling():
    """Draw the same random data horizontally and vertically to compare axis labels."""
    from numpy.random import rand

    key_set = (
        ["male", "female"],
        ["old", "adult", "young"],
        ["worker", "unemployed"],
        ["yes", "no"],
    )
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))

    def lab(k):
        # tile label: first letter of every level in the key
        return "".join(s[0] for s in k)

    fig, (ax1, ax2) = pylab.subplots(1, 2, figsize=(16, 8))
    layouts = ((ax1, True, 45), (ax2, False, [0, 45, 90, 0]))
    for axis, is_horizontal, rotation in layouts:
        mosaic(data, ax=axis, labelizer=lab, horizontal=is_horizontal,
               label_rotation=rotation)
    # fig.tight_layout()
    fig.suptitle("correct alignment of the axes labels")
def test_mosaic():
    """Draw four mosaic plots of the extramarital-affairs dataset on a 2x2 grid."""
    # make the same analysis on a known dataset

    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas['cheated'] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    #  religious, educ, occupation, occupation_husb]
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    datas = datas.sort_values(['rate_marriage', 'religious'])

    num_to_desc = {1: 'awful', 2: 'bad', 3: 'intermediate',
                   4: 'good', 5: 'wonderful'}
    datas['rate_marriage'] = datas['rate_marriage'].map(num_to_desc)
    num_to_faith = {1: 'non religious', 2: 'poorly religious',
                    3: 'religious', 4: 'very religious'}
    datas['religious'] = datas['religious'].map(num_to_faith)
    num_to_cheat = {False: 'faithful', True: 'cheated'}
    datas['cheated'] = datas['cheated'].map(num_to_cheat)
    # finished cleaning

    fig, ax = pylab.subplots(2, 2)
    mosaic(datas, ['rate_marriage', 'cheated'], ax=ax[0, 0],
           title='by marriage happiness')
    mosaic(datas, ['religious', 'cheated'], ax=ax[0, 1],
           title='by religiosity')
    # three variables: suppress the tile labels and label the axes by hand
    mosaic(datas, ['rate_marriage', 'religious', 'cheated'], ax=ax[1, 0],
           title='by both', labelizer=lambda k: '')
    ax[1, 0].set_xlabel('marriage rating')
    ax[1, 0].set_ylabel('religion status')
    mosaic(datas, ['religious', 'rate_marriage'], ax=ax[1, 1],
           title='inter-dependence', axes_label=False)
    pylab.suptitle("extramarital affairs (plot 3 of 4)")
def test_mosaic():
    """Draw four mosaic plots of the extramarital-affairs dataset on a 2x2 grid."""
    # make the same analysis on a known dataset

    # load the data and clean it a bit
    affairs = datasets.fair.load_pandas()
    datas = affairs.exog
    # any time greater than 0 is cheating
    datas["cheated"] = affairs.endog > 0
    # sort by the marriage quality and give meaningful name
    # [rate_marriage, age, yrs_married, children,
    #  religious, educ, occupation, occupation_husb]
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    datas = datas.sort_values(["rate_marriage", "religious"])

    num_to_desc = {1: "awful", 2: "bad", 3: "intermediate",
                   4: "good", 5: "wonderful"}
    datas["rate_marriage"] = datas["rate_marriage"].map(num_to_desc)
    num_to_faith = {1: "non religious", 2: "poorly religious",
                    3: "religious", 4: "very religious"}
    datas["religious"] = datas["religious"].map(num_to_faith)
    num_to_cheat = {False: "faithful", True: "cheated"}
    datas["cheated"] = datas["cheated"].map(num_to_cheat)
    # finished cleaning

    fig, ax = pylab.subplots(2, 2)
    mosaic(datas, ["rate_marriage", "cheated"], ax=ax[0, 0],
           title="by marriage happiness")
    mosaic(datas, ["religious", "cheated"], ax=ax[0, 1],
           title="by religiosity")
    # three variables: suppress the tile labels and label the axes by hand
    mosaic(datas, ["rate_marriage", "religious", "cheated"], ax=ax[1, 0],
           title="by both", labelizer=lambda k: "")
    ax[1, 0].set_xlabel("marriage rating")
    ax[1, 0].set_ylabel("religion status")
    mosaic(datas, ["religious", "rate_marriage"], ax=ax[1, 1],
           title="inter-dependence", axes_label=False)
    pylab.suptitle("extramarital affairs (plot 3 of 4)")
def test_axes_labeling(close_figures):
    """Draw the same random data horizontally and vertically to compare axis labels."""
    from numpy.random import rand

    key_set = (
        ['male', 'female'],
        ['old', 'adult', 'young'],
        ['worker', 'unemployed'],
        ['yes', 'no'],
    )
    # the cartesian product of all the categories is
    # the complete set of categories
    keys = list(product(*key_set))
    data = OrderedDict(zip(keys, rand(len(keys))))

    def lab(k):
        # tile label: first letter of every level in the key
        return ''.join(s[0] for s in k)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    layouts = ((ax1, True, 45), (ax2, False, [0, 45, 90, 0]))
    for axis, is_horizontal, rotation in layouts:
        mosaic(data, ax=axis, labelizer=lab, horizontal=is_horizontal,
               label_rotation=rotation)
    # fig.tight_layout()
    fig.suptitle("correct alignment of the axes labels")
def test_mosaic_empty_cells(close_figures):
    """Run mosaic on data whose cross-tabulation contains empty cells (GH#2286)."""
    # GH#2286
    import pandas as pd

    # both columns are keyed by the same row labels (64..70, then 60..63)
    id2_by_row = {64: 'Angelica', 65: 'DXW_UID', 66: 'casuid01',
                  67: 'casuid01', 68: 'EC93_uid', 69: 'EC93_uid',
                  70: 'EC93_uid', 60: 'DXW_UID', 61: 'AtmosFox',
                  62: 'DXW_UID', 63: 'DXW_UID'}
    id1_by_row = {64: 'TGP', 65: 'Retention01', 66: 'default',
                  67: 'default', 68: 'Musa_EC_9_3', 69: 'Musa_EC_9_3',
                  70: 'Musa_EC_9_3', 60: 'default', 61: 'default',
                  62: 'default', 63: 'default'}
    mydata = pd.DataFrame({'id2': id2_by_row, 'id1': id1_by_row})

    # once from the unstacked contingency table ...
    ct = pd.crosstab(mydata.id1, mydata.id2)
    _, vals = mosaic(ct.T.unstack())
    # ... and once straight from the data frame
    _, vals = mosaic(mydata, ['id1', 'id2'])
def test_mosaic_very_complex():
    """Draw a scatter-matrix of pairwise mosaic plots, then close all figures.

    Diagonal cells only show the variable name.  Every off-diagonal cell
    shows the mosaic of the two corresponding variables; the remaining
    variables are collapsed with ``_reduce_dict``.  This could easily be
    turned into a helper that builds such a grid automatically.
    """
    key_name = ['gender', 'age', 'health', 'work']
    key_base = (
        ['male', 'female'],
        ['old', 'young'],
        ['healty', 'ill'],
        ['work', 'unemployed'],
    )
    keys = list(product(*key_base))
    # the n-th category combination gets the count n (1-based)
    data = OrderedDict((key, count) for count, key in enumerate(keys, start=1))
    props = {
        ('male', 'old'): {'color': 'r'},
        ('female',): {'color': 'pink'},
    }
    L = len(key_base)
    fig, axes = pylab.subplots(L, L)
    for i, j in product(range(L), repeat=2):
        if i == j:
            # diagonal: just a centred label, no ticks
            cell = axes[i, i]
            cell.text(0.5, 0.5, key_name[i], ha='center', va='center')
            cell.set_xticks([])
            cell.set_xticklabels([])
            cell.set_yticks([])
            cell.set_yticklabels([])
            continue
        others = set(range(L)) - {i, j}
        low, high = min(i, j), max(i, j)
        # move the two plotted variables to the front of every key
        temp_data = OrderedDict(
            ((k[low], k[high]) + tuple(k[r] for r in others), v)
            for k, v in iteritems(data)
        )
        # replace every full key by its two-variable prefix
        for full_key in list(iterkeys(temp_data)):
            temp_data[full_key[:2]] = _reduce_dict(temp_data, full_key[:2])
            del temp_data[full_key]
        mosaic(temp_data, ax=axes[i, j], axes_label=False,
               properties=props, gap=0.05, horizontal=i > j)
    pylab.suptitle('old males should look bright red, (plot 4 of 4)')
    # pylab.show()
    pylab.close('all')
# Stacked bars: Sex counts within each Survived group.
sex_counts = train.groupby('Survived')['Sex'].value_counts()
sex_counts.unstack(level=1).plot.bar(stacked=True)


# ## Pclass

# There was a statistically significant difference in survival between boarding classes 1 and 3 only. 2nd class passengers survived in almost equal proportion, while 1st and 3rd class passengers met opposite fates. Roughly 25% of 3rd class passengers survived. Over 50% of 1st class passengers were safe after this tragedy. Therefore, `Pclass` is worth including in the model.

# In[ ]:


contingency(train, c.Pclass, c.Survived)


# In[ ]:


fig, _ = mosaic(
    train,
    [c.Pclass, c.Survived],
    title="Pclass vs Survived | Titanic train dataset.",
    axes_label=True,
)
mosaic_axes = fig.axes[0]
mosaic_axes.set_ylabel(c.Survived)
_ = mosaic_axes.set_xlabel(c.Pclass)


# According to the graph, most people who died were from class 3.
# - source: https://stackoverflow.com/questions/50319614/count-plot-with-stacked-bars-per-hue

# In[ ]:


# Stacked bars: Pclass counts within each Survived group.
pclass_counts = train.groupby('Survived')['Pclass'].value_counts()
pclass_counts.unstack(level=1).plot.bar(stacked=True)
train['Pclass'].value_counts()


# ## Pclass versus Fare
from statsmodels.graphics.mosaicplot import mosaic

# Tile colour for each label found in the second position of a tile key.
color_map = {
    '0 - 499': 'whitesmoke',
    '500 - 999': 'lightgray',
    '1000 - 1499': 'darkgray',
    '1500 - 1999': 'gray',
}


def color(key):
    """Return the colour for a two-element tile key; unknown labels map to 'red'."""
    _, tdpm = key
    return color_map.get(tdpm, 'red')


def props(key):
    """Build the per-tile properties dict expected by mosaic()."""
    return {'color': color(key)}


# draw the tiles without any text label
mosaic(data, properties=props, labelizer=lambda key: '')

# %%
# data = np.zeros((len(stringency_labels), len(tdpm_labels)))
# for index, row in average_si_vs_dpm.iterrows():
#     si_label = row['stringency_group']
#     tdpm_label = row['tdpm_group']
#     si_label_idx = stringency_labels.index(si_label)
#     tdpm_label_idx = tdpm_labels.index(tdpm_label)
#     data[si_label_idx, tdpm_label_idx] = data[si_label_idx, tdpm_label_idx] + 1
# # normalize data row wise to sum to 1
size = 8, position = position_stack(vjust = 0.5)) # customized colors barStacked2c += scale_colour_manual(values = adHoc) # use this to avoid text on top of legend symbols barStacked2c += guides(color=False) # want to omit? barStacked2c #%% from statsmodels.graphics.mosaicplot import mosaic ax,t=mosaic(PrecintDaytime.stack(),gap=0.01) #%% base4= ggplot(PrecDaytiDF, aes(x='daytime',y='pctCol', fill='precint')) + theme_classic() barStPct2 = base4 + scale_fill_brewer(type='Qualitative', palette = "Paired") barStPct2 += theme(axis_title_y = element_blank(), axis_text_y = element_blank(),
# Rows whose first PCA coordinate disagrees with their diagnosis label
# are collected as possible outliers.
diagnosis_flat = d['diagnosis'].values.ravel()
first_component = pca_dim[:, 0]
outlier_masks = (
    (first_component > 0) & (diagnosis_flat == 0),
    (first_component < 0) & (diagnosis_flat == 1),
    (first_component > 4) & (diagnosis_flat == 0),
)
for outlier_mask in outlier_masks:
    possible_outliers += np.where(outlier_mask)[0].tolist()
print(list(set(possible_outliers)))

#Pairs plot/SPLOM
splom = sns.pairplot(d.iloc[:, 3:])  #, diag_kind="kde")
fig = splom.fig
fig.suptitle('Pairs Plot of Wisconsin Breast Cancer Data')
plt.show()

#Mosaic plot
mosaic_columns = ['clump_thickness', 'cell_size_uniformity']
mosaic(d, mosaic_columns,
       title='Mosaic Plot of 2 Features from Wisconsin Breast Cancer Data')
plt.show()

#Plot parallel coordinates of potential outliers
fig = plt.figure()
fig.suptitle(
    'Parallel Coordinates Plot of Potential Outliers in Wisconsin Breast Cancer Data'
)
outlier_rows = d.iloc[possible_outliers, :]
parallel_coordinates(outlier_rows,
                     class_column='diagnosis',
                     cols=d.columns[3:],
                     color=('#0158FE', '#FE0101'))
plt.show()

#-------------------------------------------------------------------------------------------------#
#----------------------------------------Robust Covariance----------------------------------------#
Following Variables need to investigated: Pclass, Fare, AFare Sex, Age_G, Age_E, Age_Er, Age, Titel SibSp, Parch, NPerson, Alone, ParCh_B, SibSp_B, Family Embarked Against the Variable survived/Survived Not used: PassengerId, Cabin, Name(indirect Titel) ################################### """ """ Analysis of Pclass, Fare, AFare """ mosaic(df_2, ['Pclass', 'Survived']) pd.crosstab(df_2['Survived'], df_2['Pclass'], normalize='columns', margins=True) #====> Pclass seems to have an Influence sns.boxplot(x="Survived", y="Fare", data=df_2) sns.boxplot(x="Survived", y="AFare", data=df_2) #====> There seems to be a difference, but the question is, if this variable is necessary (maybe we can just use Pclass) sns.boxplot(x="Pclass", y="AFare", data=df_2) sns.boxplot(x="Pclass", y="AFare", data=df_2) #====> The information of AFare seems to be included in Pclass #Might be enough to just include Pclass (else AFare)
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 12 11:07:36 2017

@author: 28414
"""

import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic

# Load the first sheet of both workbooks.
data1 = pd.read_excel('totalNPdata.xlsx', sheet_name='Sheet1')
data2 = pd.read_excel('totalPDSdat.xlsx', sheet_name='Sheet1')

# Only the first row of the first workbook is plotted.
n1 = data1[0:1]
# plot label -> source column of that row
d = {
    label: n1[column]
    for label, column in (('age', 'b'), ('weight', 'c'), ('hight', 'd'))
}

mosaic(d)
plt.show()
def ParrarelCorrelationPlotChart():
    """Show a mosaic plot of the 'size' and 'length' columns of the loaded dataset."""
    plotted_columns = ['size', 'length']
    mosaic(LoadDataset(), plotted_columns)
    plt.show()
mean_top_depose * 100
# 0.17% for cluster 1, 0.4% for cluster 2, 9% for cluster 3 and 4% for cluster 4
del mean_top_enligne, mean_top_depose

# Load the qualitative-variables table for the mosaic plots
base_quali = pd.read_table(
    'C:/Users/Richard/Documents/GitHub/Segmentation-multicanale2/Données/v2/base_variables_quali.csv',
    delimiter=";",
    dtype={"IDPART_CALCULE": object},
)
# append the two "top" flags and the cluster label column-wise
base_quali2 = pd.concat(
    [
        base_quali,
        clustered_data['top_enligne'],
        clustered_data['top_depose'],
        clustered_data['cluster'],
    ],
    axis=1,
)

# Interesting qualitative variables:
for quali_var in ('libpcs2', 'lncsg2', 'type_famille'):
    mosaic(base_quali2, ['cluster', quali_var])

# socio-professional category by cluster, as row-wise proportions
csp_counts = pd.crosstab(base_quali2['cluster'], base_quali2['libpcs2'])
ctab = csp_counts.apply(lambda x: x / x.sum(), axis=1)
ct = ctab.plot(
    kind='bar',
    stacked=True,
    title='Categories socio-professionnelles en proportion par classe',
)
# legend placed outside the axes, to the right
lgd = ct.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ct.set_ylabel('Proportion')
ct.set_xlabel('Classe')
ct.set_xticklabels(ctab.index, rotation=0)
# Chi-square independence test of `slope` against `ca`, then against `thal`.
for other_var in ('ca', 'thal'):
    table = pd.crosstab(data.slope, data[other_var])
    chi2, p, dof, expected = chi2_contingency(table.values)
    print('chi2 = ' , chi2 , ' p = ' , p , ' dof = ' , dof ,' expected=' , expected )

##--------Connections graph between categorical variables------------------------------##

# exang and cp
sns.stripplot(x='exang', y='cp', data=data, jitter=0.2)
plt.show()

# exang and slope
mosaic(data, ['slope', 'exang'],
       axes_label=True,
       title="Categorial slope and exang : ")
plt.xlabel('slope')
plt.show()

###-------------------Changing missing values -------------------------------###

# CA
ca_is_four = data['ca'] == 4
data.loc[ca_is_four, 'ca'] = 0

#Thal
thal_is_zero = data['thal'] == 0
data.loc[thal_is_zero, 'thal'] = 2

##--------Connections graph between y and variables------------------------------##

# y and exang
mosaic(data, ['exang', 'y'], gap=0.01, axes_label=True)
# Missing-value counts per column for each data set.
print(train.isnull().sum())
print(test.isnull().sum())
print(combined.isnull().sum())


# In[ ]:


train.info()


# In[ ]:


fig, ax = plt.subplots(figsize=(12, 4))
mosaic(train, ["Survived", 'Sex', 'Pclass'], axes_label=False, ax=ax)

# 2x3 grid of survival bar plots, one feature per panel.
# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally, while keywords work on old and new versions alike.
plt.figure(figsize=[12, 8])
plt.subplot(231)
sns.barplot(x='Sex', y='Survived', data=train)
plt.subplot(232)
sns.barplot(x='Pclass', y='Survived', data=train)
plt.subplot(233)
sns.barplot(x='Pclass', y='Survived', hue='Sex', data=train)
plt.subplot(234)
sns.barplot(x='Parch', y='Survived', data=train)
plt.subplot(235)
sns.barplot(x='SibSp', y='Survived', data=train)
plt.subplot(236)
sns.barplot(x='Embarked', y='Survived', data=train)
def plot_classification_categorical(X, target_col, types=None, kind='count',
                                    hue_order=None):
    """Exploration plots for categorical features in classification.

    Creates plots of categorical variable distributions for each target class.
    Relevant features are identified via mutual information.

    For high cardinality categorical variables (variables with many categories)
    only the most frequent categories are shown.

    Parameters
    ----------
    X : dataframe
        Input data including features and target
    target_col : str or int
        Identifier of the target column in X
    types : dataframe of types, optional.
        Output of detect_types on X. Can be used to avoid recomputing the
        types.
    kind : {'count', 'proportion', 'mosaic'}, default='count'
        Plot drawn for each feature: absolute counts per class, stacked
        per-category class proportions, or a mosaic plot (emits a
        UserWarning because it is considered buggy).
    hue_order : list, optional
        Passed to ``sns.countplot`` as ``hue_order``; only used when
        ``kind='count'``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values.
    """
    types = _check_X_target_col(X, target_col, types, task="classification")

    features = X.loc[:, types.categorical]
    if target_col in features.columns:
        features = features.drop(target_col, axis=1)

    if features.shape[1] == 0:
        return

    # Reject an unknown kind before any figure is created.
    if kind not in ('proportion', 'mosaic', 'count'):
        raise ValueError("Unknown plot kind {}".format(kind))

    features = features.astype('category')

    show_top = _get_n_top(features, "categorical")

    # can't use OrdinalEncoder because we might have mix of int and string
    ordinal_encoded = features.apply(lambda x: x.cat.codes)
    target = X[target_col]
    # Every encoded column is discrete.  The previous mask was sized from X
    # (which also holds the target and non-categorical columns), not from
    # the matrix actually passed in.
    f = mutual_info_classif(ordinal_encoded, target, discrete_features=True)
    top_k = np.argsort(f)[-show_top:][::-1]
    # large number of categories -> taller plot
    row_height = 3 if features.nunique().max() <= 5 else 5
    fig, axes = _make_subplots(n_plots=show_top, row_height=row_height)
    # FIXME mosaic doesn't like constraint layout?
    plt.suptitle("Categorical Features vs Target", y=1.02)
    for i, (col_ind, ax) in enumerate(zip(top_k, axes.ravel())):
        col = features.columns[col_ind]
        X_new = _prune_category_make_X(X, col, target_col)

        if kind == 'proportion':
            # Sort the categories by the share of one (arbitrary) class.
            # iloc picks the first target value by position; target[0] was a
            # label lookup and failed when the index had no label 0.
            class_name = target.iloc[0]
            df = (X_new.groupby(col)[target_col]
                  .value_counts(normalize=True)
                  .unstack()
                  .sort_values(by=class_name))
            # only the first panel carries the legend
            df.plot(kind='barh', stacked=True, ax=ax, legend=i == 0)
            ax.set_title(col)
            ax.set_ylabel(None)
        elif kind == 'mosaic':
            warn("Mosaic plots are buggy right now, come back later.",
                 UserWarning)
            # This seems pretty broken, abandoning for now
            # counts = pd.crosstab(X_new[col], X_new[target_col])
            mosaic(X_new, [col, target_col], horizontal=False, ax=ax)  # ,
            # labelizer=lambda k: counts.loc[k[0], k[1]])
        else:
            # kind == 'count': absolute counts
            # FIXME show f value
            # FIXME shorten titles?
            sns.countplot(y=col, data=X_new, ax=ax, hue=target_col,
                          hue_order=hue_order)
            if i > 0:
                # only the first panel keeps its legend
                ax.legend(())

        _short_tick_names(ax)

    for j in range(i + 1, axes.size):
        # turn off axis if we didn't fill last row
        axes.ravel()[j].set_axis_off()
def plot_bivariate_x_categorical_y_categorical(self, df, x_name, target_name, filename_prefix=''):
    '''
    Plot bivariate analysis : y = f(x) where both x and y are categorical.
    This function generates two graphs: a mekko chart and a stacked bar chart.
    Both are saved as PNG files under ``self.output_directory``.

    Only the first two entries of ``df[target_name].value_counts()`` are
    plotted, so the target is expected to have two classes.

    Parameters
    ----------
    df: dataframe
        Dataframe containing x_name and target_name at least.
    x_name: string
        Name of column that is on x axis.
    target_name: string
        Name of column containing target to predict.
    filename_prefix: string
        Prefix added to filename.

    Returns
    -------
    None
    '''
    df_to_plot = self._build_dataset_for_x_cat_y_cat(
        df=df, x_name=x_name, target_name=target_name)

    # The two plotted target values, in value_counts() order, as strings.
    # Computed once instead of on every use (and once per tile inside the
    # mosaic properties callback).
    target_levels = df[target_name].value_counts().index
    first_level = str(target_levels[0])
    second_level = str(target_levels[1])
    title = 'Distribution de {target} en fonction de {var}'.format(
        target=target_name, var=x_name)

    # Plot 1: Mekko graph ----------------------------------------------------
    # Tile label: "<percent> %" on the first line, "(<count>)" on the second.
    df_to_plot['label'] = df_to_plot['count_percent'].apply(int).apply(str) + ' %' + '\n' \
        + '(' + df_to_plot['count'].apply(str) + ')'

    def props(index):
        # tiles of the second target level get the primary colour
        is_second = index[1] == second_level
        return {'color': self.color if is_second else self.color_secondary,
                'alpha': 0.7}

    mosaic(data=df_to_plot['count'],
           gap=0.02,
           title=title,
           properties=props,
           labelizer=lambda k: df_to_plot.loc[k, 'label'])
    plt.savefig(self.output_directory + filename_prefix + 'bivariate_mekko_' +
                target_name + '_' + x_name + '.png')
    plt.close()
    # ------------------------------------------------------------------------

    # Plot 2: Stacked bar chart ----------------------------------------------
    df_to_plot.reset_index(inplace=True)
    df_to_plot2 = df_to_plot.pivot(index=x_name, columns=target_name, values='count')
    df_to_plot2['total'] = df_to_plot2.sum(axis=1)
    df_to_plot2.sort_values(by='total', ascending=False, inplace=True)

    fig, ax = plt.subplots()
    bar_width = 0.75
    bar_position = [i + 1 for i in range(df_to_plot2.shape[0])]
    tick_position = [i + (bar_width / 2) for i in bar_position]

    lower = df_to_plot2[first_level]
    upper = df_to_plot2[second_level]
    totals = df_to_plot2['total']

    # align='edge': tick and total positions are computed as
    # "left edge + half a bar", which only matches the bars when they are
    # left-aligned.  Matplotlib >= 2.0 centres bars by default.
    ax.bar(bar_position, lower, width=bar_width, align='edge',
           label=first_level, alpha=0.7,
           color=self.color_secondary, edgecolor='#7F7F7F')
    ax.bar(bar_position, upper, width=bar_width, align='edge',
           bottom=lower, label=second_level, alpha=0.7,
           color=self.color, edgecolor='#7F7F7F')

    # One label per rectangle, in drawing order: lower segments, then upper.
    rects = ax.patches
    labels_numbers = lower.tolist() + upper.tolist()
    labels_percent = (lower / totals).tolist() + (upper / totals).tolist()
    labels = [
        '{}\n({:.0%})'.format(number, percent)
        for number, percent in zip(labels_numbers, labels_percent)
    ]
    for rect, label in zip(rects, labels):
        coord = rect.get_xy()
        height = rect.get_height()
        ax.text(x=coord[0] + bar_width / 2, y=coord[1] + height / 2,
                s=label, size=9, va='center', ha='center', color='w')

    # Total on top of every bar.
    for position, total in zip(bar_position, totals.values):
        ax.text(x=position + bar_width / 2, y=total, s=total,
                size=10, ha='center', va='bottom')

    plt.xticks(tick_position, df_to_plot2.index.values)
    plt.yticks([])
    plt.xlim([min(tick_position) - bar_width, max(tick_position) + bar_width])
    plt.title(title, y=1.08)
    # reverse the legend so its order matches the stacking order on screen
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc='upper right', frameon=False)

    plt.savefig('{}{}bivariate_stacked_{}_{}.png'.format(
        self.output_directory, filename_prefix, target_name, x_name))
    plt.close(fig)
    # ------------------------------------------------------------------------
train_df['Family_Size_D'] = train_df['Family_Size'].apply(lambda size: conv_discrete(size)) # train_df.head() - uncomment to view # In[ ]: ##train_df.loc[:,['Survived','Family_Size_D']] # In[ ]: # Visualize multivariate categorical data in a rigorous and informative way. mosaicplt.mosaic(train_df,index=['Survived','Family_Size_D'], gap=0.02,title='Family size by survival', statistic = True) # ```a contingency table (also known as a cross tabulation or crosstab) is a type of table in a matrix format that displays the (multivariate) frequency distribution of the variables``` # The mosaic plot shows that we preserve our rule that there’s a survival penalty among singletons and large families, but a benefit for passengers in small families. # # Missing data # # Assumption: It is assumed that the type of Missingness here is Missing At Random(MAR) # In[ ]: #Create a new function: def count_missing(x):
df['color'].value_counts() sns.countplot(df['color']) df['cut'].value_counts() sns.countplot(df['cut']) plt.plot(df.carat,df.price) plt.scatter(df.carat,df.price) df['cut'].value_counts().plot(kind='bar') df['clarity'].value_counts().plot(kind='bar') df['color'].value_counts().plot(kind='bar') df['cut'].value_counts().plot(kind='bar') df['carat'].value_counts().plot(kind='bar') plt.scatter(df.carat, df.price) from statsmodels.graphics.mosaicplot import mosaic plt.rcParams['font.size'] = 16.0 mosaic(df, ['cut', 'color']) mosaic(df, ['cut', 'color', 'clarity']) values = [21551, 13791, 12082, 4906, 1610] labels = ['Ideal', 'Premium', 'Very Good', 'Good','Fair'] colors = ['b', 'g', 'r', 'c', 'm'] labels =labels plt.pie(values, colors=colors, labels= labels, counterclock=False, shadow=True) df.corr(method='pearson') # By default corr() is pearson df.corr(method='spearman') df.corr(method='kendall') # from def to plt.show excute alltogether then correlation_matrix(ddef correlation_matrix(df): from matplotlib import pyplot as plt from matplotlib import cm as cm fig = plt.figure()
plt.subplots_adjust(top=0.9) # g.fig.subplots_adjust(top=0.9) g.fig.suptitle('Histogram of Age by Y and marital') plt.show() # #%% sns.boxplot(x='marital', y='age', hue='y', data=bank_data, palette='coolwarm',fliersize=0.2) ax = plt.gca() ax.set_title('Boxplot of Age by Y and marital') ax.legend(loc = 2) ax.get_ylim() # #%% mosaic(bank_data, ['job','y'], gap=0.001, label_rotation=30) ax = plt.gca() ax.set_title('Mosaic plot of job by y') # #%% mosaic(bank_data, ['housing','loan'], gap=0.001, title='Mosaic plot of housing(x) and loan(y)') ax = plt.gca() ax.set_title('Mosaic plot of housing(x) and loan(y)') # #%% sns.countplot(x='education', data=bank_data, hue='y') ax = plt.gca()
def _column_kind(series):
    """Return 'is_numeric' for integer/float columns, 'is_string' otherwise."""
    # dtype.kind is used instead of comparing against np.float / np.int /
    # np.object: those aliases were removed from NumPy, and the equality
    # tests left every other dtype (int32, bool, category, ...) unclassified.
    if series.dtype.kind in ('i', 'u', 'f'):
        return 'is_numeric'
    return 'is_string'


def save_image(b):
    """Save the plot matching the current X / Y / colour selections as a PNG.

    Reads the module-level widgets ``headers_x``, ``headers_y``,
    ``colour_headers``, ``pointSize`` and ``pointAlpha`` and the data frame
    ``df``.  Nothing is saved unless an X column is selected:

    * X only, non-numeric: count plot; X only, numeric: distribution plot.
    * X and Y: scatter (numeric/numeric), violin (numeric X, other Y),
      box (other X, numeric Y) or mosaic (neither numeric).

    Parameters
    ----------
    b : object
        Unused; present so the function can be attached as a callback.
    """
    data = df
    x_selected = headers_x.value != 'Select'
    y_selected = headers_y.value != 'Select'
    use_colour = colour_headers.value != 'Select'
    # hue=None is what seaborn uses when no hue is passed at all
    hue = colour_headers.value if use_colour else None

    sns.set_context("notebook", font_scale=1.1)

    if not x_selected:
        # no selection made, or only Y selected: nothing to plot
        return

    x = headers_x.value
    x_type = _column_kind(data[x])

    #X selected but not Y
    #========================================================================
    if not y_selected:
        if x_type == 'is_string':
            g = sns.countplot(x=x, hue=hue, data=data)
            _, labels = plt.xticks()
            g.set_xticklabels(labels, rotation=90)
            g.figure.savefig("xCategoricalColour.png" if use_colour
                             else "xCategoricalNoColour.png")
        elif use_colour:
            # one distribution per colour group
            g = sns.FacetGrid(data, hue=hue)
            g = g.map(sns.distplot, x)
            g.savefig("xNumericColour.png")
        else:
            g = sns.distplot(data[x])
            g.figure.savefig("xNumericNoColour.png")
        plt.close()
        return

    #both X and Y have been selected
    #========================================================================
    y = headers_y.value
    y_type = _column_kind(data[y])
    sns.set_style("ticks")

    if x_type == 'is_numeric' and y_type == 'is_numeric':
        # Numeric vs Numeric: scatter plot without a regression line
        g = sns.lmplot(
            x=x,  # Horizontal axis
            y=y,  # Vertical axis
            data=data,  # Data source
            fit_reg=False,  # Don't fit a regression line
            hue=hue,  # Set color
            scatter_kws={
                "marker": "D",  # Set marker style
                "s": pointSize.value,  # marker size
                "alpha": pointAlpha.value
            },
            legend=True)
        g.savefig("NumericVsNumericColour.png" if use_colour
                  else "NumericVsNumericNoColour.png")
    elif x_type == 'is_numeric':
        # Numeric vs String
        g = sns.violinplot(x=x, y=y, data=data)
        g.figure.savefig("NumericVsCategorical.png")
    elif y_type == 'is_numeric':
        # String vs Numeric
        g = sns.boxplot(x=x, y=y, data=data)
        g.figure.savefig("CategoricalVsNumeric.png")
    else:
        # String vs String
        mosaic(data, [x, y])
        plt.savefig('categoricalVsCategorical.png')
    plt.close()
# letter = "" # else: if "essential" in key: letter = "e" elif "recommended" in key: letter = "r" elif "desired" in key: letter = "d" return letter # Make the figure #props = lambda key: colorCode(key) props = lambda k: colorCode(k) #fig, rects = mosaic(data, ['WG','rec'], title='Mosaic Plot _ no freqs') fig, recs = mosaic(df1, ['WG','rec'], title='Recommendation for new datasets', \ properties = props, gap=0.015) labels = lambda k: letterCode(k) if recs[k][1] !=1 else "" fig, ax = plt.subplots(figsize=(7.5, 3.5)) mosaic(df1, ['WG','rec'], title='a. Recommendation for new datasets', \ properties = props, gap=0.015, ax=ax,labelizer=labels) for tick in ax.get_xticklabels(): tick.set_rotation(30) tick.set_horizontalalignment('right') for tick in ax.get_yticklabels(): tick.set_rotation(30) plt.savefig('./Figures/new_datasets_rec.png',\ dpi =300, bbox_inches='tight', pad_inches=0.25)
df_pi.pi_pro_schl_feats_comp > np.mean(df_pi.pi_pro_schl_feats_comp)].index df_pi.loc[i1, 'schl_comp'] = 'High' i2 = df_pi.loc[ df_pi.pi_pro_hm_feats_comp > np.mean(df_pi.pi_pro_hm_feats_comp)].index df_pi.loc[i2, 'hm_comp'] = 'High' # Identify high school - low home involvement, and # low school - high home involvement groups i1 = df_pi.loc[(df_pi.schl_comp == 'High') & (df_pi.hm_comp == 'Low')].index i2 = df_pi.loc[(df_pi.schl_comp == 'Low') & (df_pi.hm_comp == 'High')].index df_pi.loc[i1, 'schl_hm_comp'] = 'More involved at school' df_pi.loc[i2, 'schl_hm_comp'] = 'More involved at home' # Plot of the contingency table for the type of parental involvement vs. # student high-low performing students props = lambda key: { 'color': 'dodgerblue' if 'More involved at home' in key else 'orange' } labelizer = lambda k: f"{(k == ('More involved at school', 'A or B'))*90 + (k == ('More involved at school', 'C or lower'))*10 + (k == ('More involved at home', 'A or B'))*84 + (k == ('More involved at home', 'C or lower'))*16}%" mosaic( df_pi[['schl_hm_comp', 'grades_comp']], index=['schl_hm_comp', 'grades_comp'], title= 'Relationship Between Student Grades and \nType of Parental Involvement', properties=props, gap=0.025, labelizer=labelizer) ax1.set_xticklabels(['More involved at school\nLess Involved at home', '']) plt.show() df_pi.to_csv('~/FIS-Projects/Module-3/FIS-Mod3-Project/data/df_pi.csv', sep=',')
############################################################################
# Bar chart with one bar per resting-blood-pressure (trestbps) category.
# The three heights are computed elsewhere in the script; presumably they are
# per-category counts (the y axis is labelled 'amount') -- TODO confirm.
plt.bar(
    height=[low_blood_pressure, proper_blood_pressure, high_blood_pressure],
    x=["low blood pressure", "proper", "high blood pressure"],
)
plt.xlabel('trestbps')
plt.ylabel('amount')
plt.show()

############################################################################
# Scatter of gender against ca, followed by one mosaic plot per pair of
# trainDF columns.  Each mosaic is shown before the next one is drawn.
plt.scatter(x=gender, y=ca, color=['r', 'b'])
for _first_col, _second_col in (
    ('fbs', 'thal'),
    ('fbs', 'ca'),
    ('restecg', 'slope'),
    ('cp', 'restecg'),
):
    mosaic(trainDF, [_first_col, _second_col])
    plt.show()

############################################################################
# Pairwise scatter plots; there is no plt.show() between them, so they are
# all issued against the current pyplot state in this order.
for _xs, _ys in (
    (trestbps, chol),
    (trestbps, thalach),
    (trestbps, oldpeak),
    (chol, trestbps),
    (chol, thalach),
):
    plt.scatter(_xs, _ys)
# Box plots of a numeric variable against the price class ("Class_prix"):
# first the garage area, then the number of rooms (bathrooms excluded).
for _column, _label in (
    ("GarageArea", "Superficie du garage"),
    ("TotRmsAbvGrd", "Nombre de chambre (Sans salles de bains)"),
):
    sns.set_style("whitegrid")
    sns.boxplot(
        x="Class_prix",
        y=_column,
        data=df_housing_copy,
        # A fresh list per call, exactly as the original literal provided.
        order=["Classe0", "Classe1", "Classe2", "Classe3"],
        palette=pal_col,
    )
    plt.title(_label + " en fonction des classes de prix")
    plt.xlabel("Classe")
    plt.ylabel(_label)

# General zoning classification against the price class.
mosaic(df_housing_copy, ["Class_prix", "MSZoning"], gap=0.3)

# =============================================================================
# Imputation of missing data (first method: mode / median)
# =============================================================================

# Turn the table of NA percentages into a DataFrame with the columns
# "Variable" and "pourcentage", sorted by increasing percentage.
# NOTE(review): per_NA_per_col is defined elsewhere; presumably a Series of
# NA percentages indexed by column name -- confirm.
df_per_NA_per_col = (
    per_NA_per_col.reset_index()
    .rename(columns={"index": "Variable", 0: "pourcentage"})
    .sort_values(by='pourcentage')
)

# Split at the 50% mark.  Both comparisons are spelled out (rather than
# negating one mask) so that NaN percentages land in neither subset.
_pct = df_per_NA_per_col.pourcentage
df_per_NA_per_col_sup50 = df_per_NA_per_col.loc[_pct > 50]
df_per_NA_per_col_inf50 = df_per_NA_per_col.loc[_pct <= 50]

# Drop, in place, the variables with more than 50% of NAs.
df_housing_copy.drop(columns=df_per_NA_per_col_sup50.Variable, inplace=True)

# Data restricted to the qualitative variables: everything that is neither
# float nor integer.
var_qualitative = df_housing_copy.select_dtypes(exclude=['float', 'integer'])
def test_data_conversion():
    """Smoke-test ``mosaic`` on each kind of input container.

    A 4x4 grid of axes is filled as follows:

    * row 0 -- single-level data given as dict, Series, list and array;
    * rows 1 and 2 -- two-level data in the same four containers, drawn
      once as given (row 1) and once with ``index=[1, 0]`` (row 2);
    * row 3 -- a DataFrame split by one column, the other column, and both
      columns in either order.

    Nothing is asserted; the test only exercises the calls.  Per the
    original author's note, ``mosaic`` is not expected to reorder the
    elements it is given, so the plain-dict panels may show their keys in
    an unexpected order.
    """
    import pandas

    _figure, grid = pylab.subplots(4, 4)

    def panel(values, row, col, title, **options):
        # Draw one dataset on the axes at (row, col) of the grid.
        mosaic(values, ax=grid[row, col], title=title, **options)

    # Row 0: one level of categories, converted container by container.
    flat = {'ax': 1, 'bx': 2, 'cx': 3}
    panel(flat, 0, 0, 'basic dict', axes_label=False)
    flat = pandas.Series(flat)
    panel(flat, 0, 1, 'basic series', axes_label=False)
    flat = [1, 2, 3]
    panel(flat, 0, 2, 'basic list', axes_label=False)
    flat = np.asarray(flat)
    panel(flat, 0, 3, 'basic array', axes_label=False)

    # Rows 1 and 2: two levels of categories.  The "inverted" panels for the
    # series and the list deliberately keep the default axes labels, as in
    # the original calls.
    nested = {('ax', 'cx'): 1,
              ('bx', 'cx'): 2,
              ('ax', 'dx'): 3,
              ('bx', 'dx'): 4}
    panel(nested, 1, 0, 'compound dict', axes_label=False)
    panel(nested, 2, 0, 'inverted keys dict', index=[1, 0], axes_label=False)
    nested = pandas.Series(nested)
    panel(nested, 1, 1, 'compound series', axes_label=False)
    panel(nested, 2, 1, 'inverted keys series', index=[1, 0])
    nested = [[1, 2], [3, 4]]
    panel(nested, 1, 2, 'compound list', axes_label=False)
    panel(nested, 2, 2, 'inverted keys list', index=[1, 0])
    nested = np.array([[1, 2], [3, 4]])
    panel(nested, 1, 3, 'compound array', axes_label=False)
    panel(nested, 2, 3, 'inverted keys array', index=[1, 0], axes_label=False)

    # Row 3: DataFrame input, with the columns to split on passed
    # positionally as the second argument.
    frame = pandas.DataFrame({
        'gender': ['male', 'male', 'male', 'female', 'female', 'female'],
        'pet': ['cat', 'dog', 'dog', 'cat', 'dog', 'cat'],
    })
    frame_cases = (
        (['gender'], 'dataframe by key 1'),
        (['pet'], 'dataframe by key 2'),
        (['gender', 'pet'], 'both keys'),
        (['pet', 'gender'], 'keys inverted'),
    )
    for col, (columns, title) in enumerate(frame_cases):
        mosaic(frame, columns, ax=grid[3, col], title=title,
               axes_label=False)

    pylab.suptitle('testing data conversion (plot 1 of 4)')
    # The figures are closed instead of shown, so nothing is displayed.
    pylab.close('all')
def test_data_conversion():
    """Smoke-test ``mosaic`` on each kind of input container.

    A 4x4 grid of axes is filled as follows:

    * row 0 -- single-level data given as dict, Series, list and array;
    * rows 1 and 2 -- two-level data in the same four containers, drawn
      once as given (row 1) and once with ``index=[1, 0]`` (row 2);
    * row 3 -- a DataFrame split by one column, the other column, and both
      columns in either order.

    Nothing is asserted and the figure is left open when the function
    returns.  Per the original author's note, ``mosaic`` is not expected to
    reorder the elements it is given, so the plain-dict panels may show
    their keys in an unexpected order.
    """
    import pandas

    _figure, grid = pylab.subplots(4, 4)

    def show_on(row, col, values, title, **options):
        # Draw one dataset on the axes at (row, col) of the grid.
        mosaic(values, ax=grid[row, col], title=title, **options)

    # Row 0: one level of categories, converted container by container.
    single = {"ax": 1, "bx": 2, "cx": 3}
    show_on(0, 0, single, "basic dict", axes_label=False)
    single = pandas.Series(single)
    show_on(0, 1, single, "basic series", axes_label=False)
    single = [1, 2, 3]
    show_on(0, 2, single, "basic list", axes_label=False)
    single = np.asarray(single)
    show_on(0, 3, single, "basic array", axes_label=False)

    # Rows 1 and 2: two levels of categories.  The "inverted" panels for the
    # series and the list deliberately keep the default axes labels, as in
    # the original calls.
    double = {
        ("ax", "cx"): 1,
        ("bx", "cx"): 2,
        ("ax", "dx"): 3,
        ("bx", "dx"): 4,
    }
    show_on(1, 0, double, "compound dict", axes_label=False)
    show_on(2, 0, double, "inverted keys dict", index=[1, 0],
            axes_label=False)
    double = pandas.Series(double)
    show_on(1, 1, double, "compound series", axes_label=False)
    show_on(2, 1, double, "inverted keys series", index=[1, 0])
    double = [[1, 2], [3, 4]]
    show_on(1, 2, double, "compound list", axes_label=False)
    show_on(2, 2, double, "inverted keys list", index=[1, 0])
    double = np.array([[1, 2], [3, 4]])
    show_on(1, 3, double, "compound array", axes_label=False)
    show_on(2, 3, double, "inverted keys array", index=[1, 0],
            axes_label=False)

    # Row 3: DataFrame input, with the columns to split on passed
    # positionally as the second argument.
    table = pandas.DataFrame({
        "gender": ["male", "male", "male", "female", "female", "female"],
        "pet": ["cat", "dog", "dog", "cat", "dog", "cat"],
    })
    table_cases = (
        (["gender"], "dataframe by key 1"),
        (["pet"], "dataframe by key 2"),
        (["gender", "pet"], "both keys"),
        (["pet", "gender"], "keys inverted"),
    )
    for col, (columns, title) in enumerate(table_cases):
        mosaic(table, columns, ax=grid[3, col], title=title,
               axes_label=False)

    pylab.suptitle("testing data conversion (plot 1 of 4)")
def test_mosaic_plot(cat_target, cat_feature, data):
    """Draw a mosaic plot of ``data`` for a feature/target pair.

    The two names are handed to statsmodels' ``mosaic`` as the index list
    ``[cat_feature, cat_target]`` -- feature first, target second.  The
    value returned by ``mosaic`` is discarded, so this function returns
    ``None``.

    Parameters
    ----------
    cat_target : name of the target entry in ``data``.
    cat_feature : name of the feature entry in ``data``.
    data : the dataset forwarded unchanged to ``mosaic``.
    """
    # Imported locally, as in the original, so statsmodels is only needed
    # when the function is actually called.
    from statsmodels.graphics import mosaicplot

    split_order = [cat_feature, cat_target]
    mosaicplot.mosaic(data, split_order)
def test_data_conversion():
    """Smoke-test ``mosaic`` on each kind of input container.

    A 4x4 grid of axes is filled as follows:

    * row 0 -- single-level data given as dict, Series, list and array;
    * rows 1 and 2 -- two-level data in the same four containers, drawn
      once as given (row 1) and once with ``index=[1, 0]`` (row 2);
    * row 3 -- a DataFrame split by one column, the other column, and both
      columns in either order.

    Nothing is asserted and the figure is left open when the function
    returns.  Per the original author's note, ``mosaic`` is not expected to
    reorder the elements it is given, so the plain-dict panels may show
    their keys in an unexpected order.
    """
    import pandas

    _figure, axes_grid = pylab.subplots(4, 4)

    def plot_cell(values, row, col, title, **options):
        # Draw one dataset on the axes at (row, col) of the grid.
        mosaic(values, ax=axes_grid[row, col], title=title, **options)

    # Row 0: one level of categories, converted container by container.
    one_level = {'ax': 1, 'bx': 2, 'cx': 3}
    plot_cell(one_level, 0, 0, 'basic dict', axes_label=False)
    one_level = pandas.Series(one_level)
    plot_cell(one_level, 0, 1, 'basic series', axes_label=False)
    one_level = [1, 2, 3]
    plot_cell(one_level, 0, 2, 'basic list', axes_label=False)
    one_level = np.asarray(one_level)
    plot_cell(one_level, 0, 3, 'basic array', axes_label=False)

    # Rows 1 and 2: two levels of categories.  The "inverted" panels for the
    # series and the list deliberately keep the default axes labels, as in
    # the original calls.
    two_level = {('ax', 'cx'): 1,
                 ('bx', 'cx'): 2,
                 ('ax', 'dx'): 3,
                 ('bx', 'dx'): 4}
    plot_cell(two_level, 1, 0, 'compound dict', axes_label=False)
    plot_cell(two_level, 2, 0, 'inverted keys dict', index=[1, 0],
              axes_label=False)
    two_level = pandas.Series(two_level)
    plot_cell(two_level, 1, 1, 'compound series', axes_label=False)
    plot_cell(two_level, 2, 1, 'inverted keys series', index=[1, 0])
    two_level = [[1, 2], [3, 4]]
    plot_cell(two_level, 1, 2, 'compound list', axes_label=False)
    plot_cell(two_level, 2, 2, 'inverted keys list', index=[1, 0])
    two_level = np.array([[1, 2], [3, 4]])
    plot_cell(two_level, 1, 3, 'compound array', axes_label=False)
    plot_cell(two_level, 2, 3, 'inverted keys array', index=[1, 0],
              axes_label=False)

    # Row 3: DataFrame input, with the columns to split on passed
    # positionally as the second argument.
    owners = pandas.DataFrame({
        'gender': ['male', 'male', 'male', 'female', 'female', 'female'],
        'pet': ['cat', 'dog', 'dog', 'cat', 'dog', 'cat'],
    })
    owner_cases = (
        (['gender'], 'dataframe by key 1'),
        (['pet'], 'dataframe by key 2'),
        (['gender', 'pet'], 'both keys'),
        (['pet', 'gender'], 'keys inverted'),
    )
    for col, (columns, title) in enumerate(owner_cases):
        mosaic(owners, columns, ax=axes_grid[3, col], title=title,
               axes_label=False)

    pylab.suptitle('testing data conversion (plot 1 of 4)')
def reports(results_input, results_output): """Generate reports for EMSE paper.""" now = pandas.Timestamp(2017, 9, 30, 12) df = pandas.read_csv( path_join(results_input, "results_with_coverage.csv"), parse_dates=[0, 10] ) df_googleplay = pandas.read_csv( path_join(results_input, "googleplay.csv"), index_col='package' ) df = df.join(df_googleplay, on="app_id") df_sonar = pandas.read_csv("results_sonar.csv", index_col='package') df_sonar.fillna(0, inplace=True) df_sonar = df_sonar.add_prefix('sonar_') df = df.join(df_sonar, on="app_id") #Feature engineering df['tests'] = df[unit_test_frameworks+ui_automation_frameworks+cloud_test_services].any(axis=1) df['no_tests'] = ~df['tests'] df['unit_tests'] = df[unit_test_frameworks].apply(any, axis=1) df['ui_tests'] = df[ui_automation_frameworks].apply(any, axis=1) df["cloud_tests"] = df[cloud_test_services].apply(any, axis=1) df["ci/cd"] = df[ci_services].apply(any, axis=1) df['age'] = (now - df['created_at']) df['age_numeric'] = (now - df['created_at']).astype('<m8[Y]').astype('int') df['time_since_last_update'] = (now - df['last_updated']) df['time_since_last_update_numeric'] = df['time_since_last_update'].astype('<m8[Y]').astype('int') df_old = df[df['age_numeric']>=2] df["downloads"] = df["downloads"].astype("category", categories=downloads_scale, ordered=True) df['sonar_issues_ratio'] = df['sonar_issues'].divide(df['sonar_files_processed']) df['sonar_blocker_issues_ratio'] = df['sonar_blocker_issues'].divide(df['sonar_files_processed']) df['sonar_critical_issues_ratio'] = df['sonar_critical_issues'].divide(df['sonar_files_processed']) df['sonar_major_issues_ratio'] = df['sonar_major_issues'].divide(df['sonar_files_processed']) df['sonar_minor_issues_ratio'] = df['sonar_minor_issues'].divide(df['sonar_files_processed']) df_with_google_data = df[~df["rating_count"].isnull()] df_with_tests = df[df['tests']] df_without_tests = df[~df['tests']] df.to_csv("results_merged.csv") # from android_test_inspector.corr_analysis 
import correlation_matrix # correlation_matrix(df, output_file=path_join(results_output, "corr_matrix.pdf")) colors_dict = { 'any': 'C0', 'unit_test_frameworks': 'C1', 'ui_automation_frameworks': 'C2', 'cloud_test_services': 'C3', 'ci_services': 'C4', } marker_dict = { 'any': 'o', 'unit_test_frameworks': 'v', 'ui_automation_frameworks': '*', 'cloud_test_services': 'H', 'ci_services': 's', } linestyle_dict = { 'any': '-', 'unit_test_frameworks': ':', 'ui_automation_frameworks': '--', 'cloud_test_services': '-.', } # --- Number of projects by year --- # figure, ax = plt.subplots(figsize=(4, 2.5)) df.groupby('age_numeric')['age_numeric'].count().plot.bar( color='black', width=0.25, ax=ax, ) ax.tick_params(direction='out', top='off') ax.set_xlabel("Age") ax.set_ylabel("Number of apps") ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.yaxis.grid(linestyle='dotted') figure.tight_layout() figure.savefig(path_join(results_output, "app_age_count.pdf")) # --- Number of projects by framework --- # columns = ( ['tests'] + ['unit_tests'] + unit_test_frameworks + ['ui_tests'] + ui_automation_frameworks + ['cloud_tests'] + cloud_test_services # + ['ci/cd'] + ci_services ) colors = ( [colors_dict['any']] + [colors_dict['unit_test_frameworks']] * (len(unit_test_frameworks) + 1) + [colors_dict['ui_automation_frameworks']] * (len(ui_automation_frameworks) + 1) + [colors_dict['cloud_test_services']] * (len(cloud_test_services) + 1) + [colors_dict['ci_services']] * (len(ci_services) + 1) ) highlights = [ 'tests', 'unit_tests', 'ui_tests', 'cloud_tests', 'ci/cd', ] sums = df[columns].sum() labels = (label in highlights and "• All "+label or label for label in columns) labels = [label.title().replace("_", " ") for label in labels] heights = sums.values figure, ax = plt.subplots(1, 1) ax.bar( range(len(labels)), heights, 0.5, color=colors, edgecolor = 'k', linewidth= [column in highlights and 0.9 or 0.0 for column in 
columns] ) ax.set_xticklabels(labels, rotation='vertical') ax.set_xticks(range(len(labels))) ax.tick_params(direction='out', top='off') # ax.set_title("Number of projects by test framework") ax.set_ylabel("Number of projects (out of {})".format(len(df.index))) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.yaxis.grid(linestyle='dotted') # ax2 = ax.twinx() # ax2.grid(False) # ax2.set_ylim(ax.get_ylim()) # ax2.set_yticklabels(["{:.0%}".format(tick/len(df)) for tick in ax2.get_yticks()]) # ax2.spines['right'].set_visible(False) # ax2.spines['top'].set_visible(False) # ax2.spines['left'].set_visible(False) # ax2.set_ylabel("Percentage of projects") def draw_range(ax, xmin, xmax, label): y=400 ax.annotate('', xy=(xmin, y), xytext=(xmax, y), xycoords='data', textcoords='data', arrowprops={'arrowstyle': '|-|', 'color':'black', 'linewidth': 0.5}) xcenter = xmin + (xmax-xmin)/2 ytext = y + ( ax.get_ylim()[1] - ax.get_ylim()[0] ) / 22 ax.annotate(label, xy=(xcenter,ytext), ha='center', va='center', fontsize=9) draw_range(ax, 0.5, 5.5, "Unit testing") draw_range(ax, 5.5, 14.5, "GUI testing") draw_range(ax, 14.5, 21.5, "Cloud testing") # draw_range(ax, 21.5, 26.5, "CI/CD") figure.tight_layout() figure.savefig(path_join(results_output, "framework_count.pdf")) # --------------------------------------- # # --- Percentage of Android tests over the age of the apps --- # def tests_in_projects_by_time_of_creation(df_projects, frameworks, label=None, title=None, zorder=1, color=None, verbose=False, **kwargs): portions = [] n_projects_with_tests_history = [] total_projects_history = [] age_max = df_projects['age_numeric'].max()+1 for age in range(age_max): n_projects_with_tests = df_projects[df_projects['age_numeric']==age][frameworks].apply(any, axis=1).sum() n_projects_with_tests_history.append(n_projects_with_tests) total_projects = len(df_projects[df_projects['age_numeric']==age].index) 
total_projects_history.append(total_projects) if total_projects == 0: portion = 0 else: portion = n_projects_with_tests/total_projects portions.append(portion) if verbose: print("Age {}:".format(age)) print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion)) plt.plot(range(age_max), portions, label=label, zorder=zorder, **kwargs) plt.scatter(range(age_max), portions, total_projects_history, marker='o', linewidth='1', zorder=zorder) ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.set_xticks(range(age_max)) ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()]) ax.set_ylabel("Percentage of projects") ax.yaxis.grid(linestyle='dotted', color='gray') if label: legend = ax.legend(loc='upper center', shadow=False) if title: ax.set_title(title) figure, ax = plt.subplots(1,1) tests_in_projects_by_time_of_creation(df, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], zorder=2, linestyle=linestyle_dict['any']) tests_in_projects_by_time_of_creation(df, unit_test_frameworks, label="Unit testing", color=colors_dict['unit_test_frameworks'], zorder=3, linestyle=linestyle_dict['unit_test_frameworks']) tests_in_projects_by_time_of_creation(df, ui_automation_frameworks, label="GUI testing", color=colors_dict['ui_automation_frameworks'], zorder=4, linestyle=linestyle_dict['ui_automation_frameworks']) tests_in_projects_by_time_of_creation(df, cloud_test_services, label="Cloud testing", color=colors_dict['cloud_test_services'], zorder=5, linestyle=linestyle_dict['cloud_test_services']) ax.set_xlabel("Years since first commit") ax.axvspan(0,2, color='darkgreen', alpha=0.1) figure.tight_layout() figure.savefig(path_join(results_output, "tests_by_age.pdf")) ax.invert_xaxis() figure.savefig(path_join(results_output, "tests_by_age_i.pdf")) # ------------------------------------------------------------ 
# # --- Percentage of Android tests over the age of the apps (cumulated) --- # def tests_in_projects_by_time_of_creation_cumm(df_projects, frameworks, title=None, verbose=False, **kwargs): project_with_test_per_age = [] total_projects_per_age = [] n_projects_with_tests_history = [] total_projects_history = [] age_max = df_projects['age_numeric'].max()+1 for age in range(age_max)[::-1]: n_projects_with_tests = df_projects[df_projects['age_numeric']==age][frameworks].apply(any, axis=1).sum() n_projects_with_tests_history.append(n_projects_with_tests) total_projects = len(df_projects[df_projects['age_numeric']==age].index) total_projects_history.append(total_projects) project_with_test_per_age.append(n_projects_with_tests) total_projects_per_age.append(total_projects) if verbose: print("Age {}:".format(age)) print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion)) project_with_test_per_age_cum = [sum(project_with_test_per_age[:index+1]) for index in range(len(project_with_test_per_age))] total_projects_per_age_cum = [sum(total_projects_per_age[:index+1]) for index in range(len(total_projects_per_age))] portions = [] for with_tests, total in zip(project_with_test_per_age_cum, total_projects_per_age_cum): if total > 0: portions.append(with_tests/len(df_projects)) else: portions.append(0) plt.plot(range(age_max)[::-1], portions, **kwargs) # plt.scatter(range(age_max)[::-1], portions, total_projects_history, marker='o', linewidth=1, zorder=kwargs.get('zorder')) plt.scatter(range(age_max)[::-1], portions, marker='.', linewidth=1, zorder=kwargs.get('zorder')) ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.set_xticks(range(age_max)) ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()]) ax.set_ylabel("Percentage of projects") ax.yaxis.grid(linestyle='dotted', color='gray') ax.legend(loc='upper center', shadow=False) if title: 
ax.set_title(title) figure, ax = plt.subplots(1,1) tests_in_projects_by_time_of_creation_cumm( df, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], zorder=2, linestyle=linestyle_dict['any'], ) tests_in_projects_by_time_of_creation_cumm( df, unit_test_frameworks, label="Unit testing", color=colors_dict['unit_test_frameworks'], zorder=3, linestyle=linestyle_dict['unit_test_frameworks'], ) tests_in_projects_by_time_of_creation_cumm( df, ui_automation_frameworks, label="GUI testing", color=colors_dict['ui_automation_frameworks'], zorder=4, linestyle=linestyle_dict['ui_automation_frameworks'], ) tests_in_projects_by_time_of_creation_cumm( df, cloud_test_services, label="Cloud testing", color=colors_dict['cloud_test_services'], zorder=5, linestyle=linestyle_dict['cloud_test_services'], ) ax.set_xlabel("Year") ax.axvspan(0,2, color='darkgreen', alpha=0.1) figure.tight_layout() figure.savefig(path_join(results_output, "tests_by_age_cumm.pdf")) ax.invert_xaxis() figure.savefig(path_join(results_output, "tests_by_age_cumm_i.pdf")) # ------------------------------------------------------------ # # --- Percentage of 2+years apps with tests grouped by time since last update --- # def tests_in_projects_by_time_of_update(df_projects, frameworks, label=None, title=None, verbose=False, zorder=None, color=None, **kwargs): portions = [] n_projects_with_tests_history = [] total_projects_history = [] age_max = df_projects['time_since_last_update_numeric'].max()+1 for age in range(age_max): n_projects_with_tests = df_projects[df_projects['time_since_last_update_numeric']==age][frameworks].apply(any, axis=1).sum() n_projects_with_tests_history.append(n_projects_with_tests) total_projects = len(df_projects[df_projects['time_since_last_update_numeric']==age].index) total_projects_history.append(total_projects) if total_projects == 0: portion = 0 else: portion = n_projects_with_tests/total_projects portions.append(portion) if verbose: 
print("Age {}:".format(age)) print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion)) plt.plot(range(age_max), portions, label=label, zorder=zorder, **kwargs) plt.scatter(range(age_max), portions, total_projects_history, marker='o', linewidth='1', zorder=zorder) ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(True) ax.set_xticks(range(age_max)) ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()]) ax.set_ylabel("Percentage of projects") ax.yaxis.grid(linestyle='dotted', color='gray') if label: legend = ax.legend(loc='upper center', shadow=False) if title: plt.title(title) figure, ax = plt.subplots(1,1) tests_in_projects_by_time_of_update(df_old, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], linestyle=linestyle_dict['any'], zorder=1) tests_in_projects_by_time_of_update(df_old, unit_test_frameworks, label="Unit testing", color=colors_dict['unit_test_frameworks'], linestyle=linestyle_dict['unit_test_frameworks'], zorder=2) tests_in_projects_by_time_of_update(df_old, ui_automation_frameworks, label="GUI testing", color=colors_dict['ui_automation_frameworks'], linestyle=linestyle_dict['ui_automation_frameworks'], zorder=3) tests_in_projects_by_time_of_update(df_old, cloud_test_services, label="Cloud testing", color=colors_dict['cloud_test_services'], linestyle=linestyle_dict['cloud_test_services'], zorder=4) ax.set_xlabel("Years since last update") figure.tight_layout() figure.savefig(path_join(results_output, "mature_tests_by_update.pdf")) ax.invert_xaxis() figure.savefig(path_join(results_output, "mature_tests_by_update_i.pdf")) # ------------------------------------------------------------------------------- # # --- Descriptive stats for popularity metrics --- # dictionary = { "count": "$N$", "mean": "$\\bar{x}$", "std": "$s$", "min": 
"$min$", "max": "$max$", "rating_value": "Rating" } metrics = ['stars','forks', 'contributors', 'commits', 'rating_value', 'rating_count'] def outliers_modified_z_score(ys): threshold = 3.5 median_y = np.median(ys) median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys]) modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y for y in ys] return (np.abs(modified_z_scores) > threshold) def outliers_z_score(ys): return np.abs(zscore(ys) < 3) def remove_outliers_df(df, metric): df = df.dropna(subset=[metric]) return df[outliers_z_score(df[metric])] def remove_outliers(series): series = series[~series.isnull()] return series[outliers_z_score(series)] # return series[np.abs(zscore(series) < 3)] def _descriptive_stats(series, ): return ( series.count(), series.mean(), series.std(), series.min(), series.quantile(0.25), series.median(), series.quantile(0.75), series.max(), shapiro(series)[1] < 0.01 and "$p < 0.01$", ) stats = [] for metric in metrics: metric_title = metric.title().replace("_", " ") df_tmp = remove_outliers_df(df, metric) df_tmp_tests = df_tmp[df_tmp['tests']] stats.append(( f"\\multirow{{2}}{{*}}{{{metric_title}}}", '$W$', *_descriptive_stats(df_tmp_tests[metric]) )) df_tmp_wo_tests = df_tmp[~df_tmp['tests']] stats.append(( "", '$WO$', *_descriptive_stats(df_tmp_wo_tests[metric]) )) old_escape_rules = T.LATEX_ESCAPE_RULES T.LATEX_ESCAPE_RULES = {'%': '\\%'} table = tabulate( stats, headers=['', 'Tests', '$N$', '$\\bar{x}$', '$s$', '$min$', '$25%$', '$Md$', '$75%$', '$max$', '$X \sim N$'], # showindex=issues_column, tablefmt='latex', floatfmt=".1f", ) T.LATEX_ESCAPE_RULES = old_escape_rules with open(path_join(results_output, "popularity_metrics_stats_2.tex"), 'w') as f: f.write(table) stats = pandas.concat([remove_outliers(df[metric]).describe() for metric in metrics], axis=1) stats = stats.applymap((lambda x: "${:.1f}$".format(float(x)))).astype(str) stats[['stars','forks', 'contributors', 'commits', 'rating_count']] 
= stats[['stars','forks', 'contributors', 'commits', 'rating_count']].applymap((lambda x: "${:.0f}$".format(float(x[1:-1])))).astype(str) stats.loc['count']= stats.loc['count'].map((lambda x: "${:.0f}$".format(float(x[1:-1])))).astype(str) old_escape_rules = T.LATEX_ESCAPE_RULES T.LATEX_ESCAPE_RULES = {'%': '\\%'} with open(path_join(results_output, "popularity_metrics_stats.tex"), 'w') as f: f.write(tabulate( stats, headers=[dictionary.get(column, column.title().replace("_", " ")) for column in stats.columns], showindex=[dictionary.get(name, name) for name in stats.index], tablefmt='latex', floatfmt=".1f" )) T.LATEX_ESCAPE_RULES = old_escape_rules ###box plots instead figure, axes = plt.subplots(2, 3) for index, ax, metric in zip(range(len(metrics)), [ax for subaxes in axes for ax in subaxes], metrics): values = remove_outliers(df[metric]) metric_title = metric.title().replace("_", " ") ax.boxplot(values, whis=[5,95], showmeans=True, meanline=True,showfliers=True) ax.set_xticklabels([metric_title]) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(True) ax.yaxis.grid(linestyle='dotted', color='gray') if index != 4: ax.set_yscale('log') ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda y, _: '{:g}'.format(y))) figure.tight_layout() figure.savefig(path_join(results_output, f"popularity_metrics_boxplot.pdf")) # -------------------------------------------------- # # --- Histogram for downloads --- # downloads_distribution = df_with_google_data.groupby('downloads')['downloads'].count() heights = df_with_google_data.groupby('downloads')['downloads'].count().values figure, ax = plt.subplots(1,1) labels = [ str(human_format(int(cat.split(' - ')[0].replace(',','')))) + " – " + str(human_format(int(cat.split(' - ')[1].replace(',','')))) for cat in downloads_scale ] # ax.bar( # range(len(labels)), # heights, # width=0.9, # color=[column == '10,000 - 50,000' and 'C1' or 'C0' 
for column in downloads_scale], # ) downloads_distribution.plot.bar( ax=ax, width=0.9, fontsize=14, ) ax.set_xticklabels(labels, fontsize=14, rotation='vertical') ax.set_xlabel("Downloads", fontsize=15) ax.set_ylabel("Number of apps (out of {})".format(len(df.index)), fontsize=15) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(True) ax.yaxis.grid(linestyle='dotted', color='gray') # ax2 = ax.twinx() # ax2.grid(False) # ax2.set_ylim(ax.get_ylim()) # ax2.set_yticklabels(["{:.0%}".format(tick/len(df_with_google_data)) for tick in ax2.get_yticks()], fontsize=14) # ax2.spines['right'].set_visible(False) # ax2.spines['top'].set_visible(False) # ax2.spines['left'].set_visible(False) # ax2.set_ylabel("Percentage of apps", fontsize=15) figure.tight_layout() figure.savefig(path_join(results_output, "downloads_hist.pdf")) # -------------------------------------------------- # # ---------- Hypothesis testing ------------- # popularity_metrics = [ 'stars', 'forks', 'contributors', 'commits', 'rating_value', 'rating_count', # 'downloads' ] def cohen_d(y,x): nx = len(x) ny = len(y) dof = nx + ny - 2 return (np.mean(x) - np.mean(y)) / np.sqrt(((nx-1)*np.std(x, ddof=1) ** 2 + (ny-1)*np.std(y, ddof=1) ** 2) / dof) def analyze_populations(a,b, continuous=True): mean_difference = np.mean(b) - np.mean(a) median_difference = np.median(b) - np.median(a) improvement = mean_difference/np.mean(b) ks_test, ks_p = ks_2samp(a,b) mwu_test, mwu_p = mannwhitneyu(a,b, alternative='two-sided') return { # 'MW': "${:.4f}$".format(mwu_p), # 'KS': continuous and "${:.4f}$".format(ks_p) or "n.a.", 'Test': continuous and "${:,.0f}$".format(ks_test) or "${:,.0f}$".format(mwu_test), '$p$-value': continuous and ks_p or mwu_p, '$\\Delta\\bar{x}$': "${:,.2f}$".format(mean_difference), '$\\Delta Md$': "${:,.2f}$".format(median_difference), 'CL (%)': f"${cles(a,b):,.2%}$", 'Cohen\'s $d$': f"${cohen_d(a,b):,.4f}$", 
'$d_r$': "${:.1%}$".format(improvement), } tests = [] for metric in popularity_metrics: df_wo_outliers = remove_outliers_df(df, metric) tests.append( analyze_populations( df_wo_outliers[~df_wo_outliers['tests']][metric], df_wo_outliers[df_wo_outliers['tests']][metric], False ) ) # Apply multiple test correction () pvalues = [test['$p$-value'] for test in tests] _,pvalues,*_ = multipletests(pvalues, alpha=0.05, method='fdr_bh') for test, pvalue in zip(tests, pvalues): test['$p$-value'] = "${:.4f}$".format(pvalue) old_escape_rules = T.LATEX_ESCAPE_RULES T.LATEX_ESCAPE_RULES = {'%': '\\%'} with open(path_join(results_output, "popularity_metrics_test.tex"), 'w') as f: f.write(tabulate( tests, headers='keys', showindex=[metric.title().replace("_"," ") for metric in popularity_metrics], tablefmt='latex', )) T.LATEX_ESCAPE_RULES = old_escape_rules # ------------------------------------------- # # ---------- Tests vs Rating with Rating count ------------- # x = range(0, 10000 , 100) y_with_tests = tuple(df_with_tests[df_with_tests['rating_count']>i]['rating_value'].mean() for i in x) y_without_tests = tuple(df_without_tests[df_without_tests['rating_count']>i]['rating_value'].mean() for i in x) figure, ax = plt.subplots() ax.scatter(x, y_with_tests, marker='o', color='C0', label="With tests", zorder=2) ax.plot(x, y_with_tests, alpha=0.5, color='C0', zorder=1) ax.scatter(x, y_without_tests, marker='2', color='r', label="Without tests", zorder=2) ax.plot(x, y_without_tests, alpha=0.5, color='r', zorder=1) ax.legend(loc='upper center') ax.set_ylabel("Rating") ax.set_xlabel("Rating count >") ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) figure.tight_layout() figure.savefig(path_join(results_output, "rating_with_lower_limit.pdf")) # --------------------------------------------------------- # # ------------------ CI/CD platforms hist --------------- # figure, ax = plt.subplots() namepedia={ "circleci": "Circle CI", "travis": "Travis CI", } 
df[['ci/cd']+ci_services].sum().plot.bar( fontsize=15, edgecolor = 'k', color='black', width=0.25, linewidth = [1]+[0]*len(ci_services) ) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.yaxis.grid(linestyle='dotted', color='gray') ax.set_ylabel("Number of apps (out of {})".format(len(df.index)), fontsize=15) ax.set_xticklabels(["All"]+[namepedia.get(key, key.title().replace('_', ' ')) for key in ci_services]) # ax2 = ax.twinx() # ax2.grid(False) # ax2.set_ylim(ax.get_ylim()) # ax2.set_yticklabels(["{:.0%}".format(tick/len(df)) for tick in ax2.get_yticks()], fontsize=15) # ax2.spines['right'].set_visible(False) # ax2.spines['top'].set_visible(False) # ax2.spines['left'].set_visible(False) # ax2.set_ylabel("Percentage of apps", fontsize=15) for p in ax.patches: ax.annotate("{:.0f}".format(p.get_height()), (p.get_x() +p.get_width()/2, p.get_height()+4), ha='center', fontsize=14) figure.tight_layout() figure.savefig(path_join(results_output, "ci_cd_hist.pdf")) # ------------------------------------------------------- # # ---------------- Mosaic CI/CD ---------------- # from statsmodels.graphics.mosaicplot import mosaic def properties(keys): keys = list(map(lambda i: i == 'True', keys)) if all(keys): return {'color': 'lightgreen'} elif any(keys): return {'color': 'lightgoldenrodyellow'} return {'color': 'lightcoral'} figure, ax = plt.subplots(figsize=(4.5,3.5)) labelizer = lambda k: { ('False','False'): 'A. No Tests and no CI/CD\n({:.1%})'.format(1 - df[["tests", "ci/cd"]].any(axis=1).sum()/len(df)), ('True','False'): 'B. With Tests but\nno CI/CD\n({:.1%})'.format(sum(df["tests"] & ~df["ci/cd"])/len(df)), ('False','True'): 'C. No Tests but with CI/CD\n({:.1%})'.format(sum(~df["tests"] & df["ci/cd"])/len(df)), ('True','True'): 'D. 
With Tests and\nwith CI/CD\n({:.1%})'.format(df[["tests", "ci/cd"]].all(axis=1).sum()/len(df)), }.get(k, k) mosaic(df, ["tests", "ci/cd"], properties= properties, labelizer=labelizer, ax=ax) ax.set_xticklabels(['No tests', 'With tests']) ax.set_yticklabels(['With CI/CD', 'No CI/CD']) # ax.spines['left'].linewidth = 1 # ax.spines['top'].linewidth = 1 # ax.spines['right'].linewidth = 1 # ax.spines['bottom'].linewidth = 1 ax.invert_yaxis() figure.tight_layout() figure.savefig(path_join(results_output, "ci_cd_mosaic.pdf")) obs = [ [sum(~df["tests"] & df["ci/cd"]), sum(~df["tests"] & ~df["ci/cd"])], #No tests [sum(df["tests"] & df["ci/cd"]), sum(df["tests"] & ~df["ci/cd"])] #Tests ] chi,pvalue,dof,_ = chi2_contingency(obs) print("Relationship between Ci/CD and Automated testing:") print("Chi={}, dof={}, p={}".format(chi, dof, pvalue)) # ------------------------------------------------------- # # ------------------ Sonar vs tests --------------- # features = [ # 'sonar_issues_ratio', 'sonar_blocker_issues_ratio', 'sonar_critical_issues_ratio', 'sonar_major_issues_ratio', 'sonar_minor_issues_ratio' ] names = [ # 'Any', 'Blocker', 'Critical', 'Major', 'Minor' ] options = { 'sym': '', 'meanline': True, 'showmeans': True, 'patch_artist': True, } figure, ax = plt.subplots(1,1) boxplot = ax.boxplot( [ df_tmp[feature].dropna().values for feature in features for df_tmp in (df_with_tests, df_without_tests) ], labels=( 'With Tests', 'Without Tests' )*len(features), **options ) colors = ( 'C0', 'darkred' )*len(features) hatches = ( '/', '' )*len(features) for patch, color, hatch in zip(boxplot['boxes'], colors, hatches): patch.set_edgecolor(color) patch.set_facecolor((1,1,1,0.8)) patch.set_hatch(hatch) patch.set_alpha(0.9) for cap, whisker, color in zip(boxplot['caps'], boxplot['whiskers'], np.repeat(colors,2)): cap.set_color(color) whisker.set_color(color) # legend circ1 = mpatches.Patch(facecolor='white', edgecolor=colors[0], hatch=hatches[0], label='With Tests') circ2 = 
mpatches.Patch(facecolor='white', edgecolor=colors[1], hatch=hatches[1], label='Without Tests') ax.legend(handles=(circ1,circ2), facecolor='white') # ----- ax.yaxis.grid(linestyle='dotted', color='gray') ax.set_xticklabels(names) xticks = np.arange(1.5, len(features)*2+0.5, 2) ax.set_xticks(xticks) ax.set_ylabel('Number of issues per file') ax.set_xlabel('Severity of issues') ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) mean_differences = [ df_without_tests[feature].dropna().mean() - df_with_tests[feature].dropna().mean() for feature in features ] median_differences = [ df_without_tests[feature].dropna().median() - df_with_tests[feature].dropna().median() for feature in features ] relative_differences = [ int((df_without_tests[feature].dropna().median() - df_with_tests[feature].dropna().median()) / df_with_tests[feature].dropna().median()*100) for feature in features ] cles_values = [ "{:.2%}".format(cles( df_with_tests[feature].dropna(), df_without_tests[feature].dropna() )) for feature in features ] cohensd_values = [ cohen_d( df_with_tests[feature].dropna(), df_without_tests[feature].dropna() ) for feature in features ] tester = ks_2samp tester = mannwhitneyu # tester = ttest_ind pvalues = [ tester( df_without_tests[feature].dropna().values, df_with_tests[feature].dropna().values, # alternative="two-sided" # equal_var=False, ).pvalue for feature in features ] #multiple test correction () _,pvalues,*_ = multipletests(pvalues, alpha=0.05, method='fdr_bh') # # Add info boxes to the boxplot # bbox_props_not_significant = dict(boxstyle="round,pad=0.3", fc=(1,1,1,0.8), ec='lightgray', lw=0.5) # bbox_props_significant = dict(boxstyle="round,pad=0.3", fc=(1,1,1,0.8), ec='black', lw=0.5) # for name, x, mean_difference, median_difference, pvalue in zip(names, xticks, mean_differences, median_differences, pvalues): # if pvalue < 0.05: # bbox_props = bbox_props_significant # else: # bbox_props = 
bbox_props_not_significant # ax.annotate( # ( # r"$\Delta\bar{{x}} = {:.2f}$".format(mean_difference)+"\n"+ # r"$\Delta Md = {:.2f}$".format(median_difference)+"\n"+ # r"$p = {:.4f}$".format(pvalue) # ), # (x,2.5), # va='top', ha='center', # fontsize=11, # bbox=bbox_props # ) for patch,pvalue,color in zip(boxplot['boxes'], np.repeat(pvalues,2), colors): if pvalue < 0.05: # patch.set_facecolor((1.0,1.0,0.8,0.7)) # patch.set_facecolor(color) # patch.set_hatch("\\") patch.set_linewidth(2) figure.tight_layout() figure.savefig(path_join(results_output, "sonar_vs_tests.pdf")) #SONAR ISSUEs SIGNIFICANCE RESULTS TABLE table_values = list(zip(names, mean_differences, median_differences, relative_differences, cles_values, cohensd_values, pvalues)) old_escape_rules = T.LATEX_ESCAPE_RULES T.LATEX_ESCAPE_RULES = {'%': '\\%'} table = tabulate( table_values, headers=['Severity', r"$\Delta\bar{{x}}$", r"$\Delta Md$", r"$\frac{\Delta{}Md}{Md_W}$(%)",'CL (%)','Cohen\'s $d$', '$p$-value'], # showindex=issues_column, tablefmt='latex', floatfmt=".4f", ) T.LATEX_ESCAPE_RULES = old_escape_rules with open(path_join(results_output, "sonar_metrics_test.tex"), 'w') as f: f.write(table) from itertools import chain issues_column = list(chain.from_iterable([("\multirow{{2}}{{*}}{{{}}}".format(name), ' ') for name in names])) old_escape_rules = T.LATEX_ESCAPE_RULES T.LATEX_ESCAPE_RULES = {'%': '\\%'} table = tabulate( [ ( sample_name, df_tmp[feature].dropna().count(), "${:.4f}$".format(df_tmp[feature].dropna().median()), "${:.4f}$".format(df_tmp[feature].dropna().mean()), "${:.4f}$".format(df_tmp[feature].dropna().std()), shapiro(df_tmp[feature].dropna())[1] < 0.0001 and "$p < 0.0001$", ) for feature in features for (df_tmp, sample_name) in ((df_with_tests, '$W$'), (df_without_tests, '$WO$')) ], headers=['Tests', '$N$', '$Md$', '$\\bar{x}$', '$s$', '$X \sim N$'], showindex=issues_column, tablefmt='latex', ) T.LATEX_ESCAPE_RULES = old_escape_rules with open(path_join(results_output, 
"sonar_metrics.tex"), 'w') as f: f.write(table) # ------------------------------------------------- # ############### # Hall of Fame ############### hall_of_fame = df[df[['ci/cd', 'unit_tests', 'ui_tests']].all(axis=1)].sort_values('stars', ascending=False) categories = hall_of_fame['category'].unique() small_hall_of_fame = [hall_of_fame[hall_of_fame['category']==category].iloc[0][['user', 'project_name']] for category in categories ] small_hall_of_fame_table = tabulate( small_hall_of_fame, headers=['Category', 'Organization', 'Project Name'], showindex=list(categories), tablefmt='latex', ) with open(path_join(results_output, "small_hall_of_fame.tex"), 'w') as f: f.write(small_hall_of_fame_table) ############# #### Categories ###### figure, ax = plt.subplots(figsize=(6.4, 4)) (df[['app_id','category']] .groupby('category') .count() .plot.bar(color='black', width=0.25, ax=ax)) ax.legend().remove() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.yaxis.grid(linestyle='dotted', color='gray') ax.set_xlabel('Category') ax.set_ylabel('Number of Apps') figure.tight_layout() figure.savefig(path_join(results_output, "categories.pdf")) ###################### # --- Percentage of Android tests over the age of the apps (cumulated) --- # def tests_in_projects_by_time_of_creation_cumm(df_projects, frameworks, title=None, verbose=False, **kwargs): project_with_test_per_age = [] total_projects_per_age = [] n_projects_with_tests_history = [] total_projects_history = [] age_max = df_projects['age_numeric'].max()+1 for age in range(age_max)[::-1]: n_projects_with_tests = df_projects[df_projects['age_numeric']==age][frameworks].apply(any, axis=1).sum() n_projects_with_tests_history.append(n_projects_with_tests) total_projects = len(df_projects[df_projects['age_numeric']==age].index) total_projects_history.append(total_projects) project_with_test_per_age.append(n_projects_with_tests) 
total_projects_per_age.append(total_projects) if verbose: print("Age {}:".format(age)) print("{} out of {} projects ({:.1%}).".format(n_projects_with_tests, total_projects, portion)) project_with_test_per_age_cum = [sum(project_with_test_per_age[:index+1]) for index in range(len(project_with_test_per_age))] total_projects_per_age_cum = [sum(total_projects_per_age[:index+1]) for index in range(len(total_projects_per_age))] portions = [] for with_tests, total in zip(project_with_test_per_age_cum, total_projects_per_age_cum): if total > 0: portions.append(with_tests/len(df_projects)) else: portions.append(0) plt.plot(range(age_max)[::-1], portions, **kwargs) plt.scatter( range(age_max)[::-1], portions, total_projects_history, marker='o', zorder=kwargs.get('zorder'), color=kwargs.get('color') ) ax = plt.gca() ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) ax.spines['left'].set_visible(False) ax.set_xticks(range(age_max)[::-1]) ax.set_yticklabels(["{:.0%}".format(label) for label in ax.get_yticks()]) ax.set_ylabel("Percentage of projects") ax.yaxis.grid(linestyle='dotted', color='gray') ax.legend(loc='upper center', shadow=False) if title: ax.set_title(title) figure, ax = plt.subplots(1,1) tests_in_projects_by_time_of_creation_cumm( df, unit_test_frameworks+ui_automation_frameworks+cloud_test_services, label="Any", color=colors_dict['any'], zorder=2, linestyle=linestyle_dict['any'], ) tests_in_projects_by_time_of_creation_cumm( df, ['no_tests'], label="No tests", color='darkred', zorder=5, linestyle="--", ) ax.set_xlabel("Years since first commit") ax.axvspan(0,2, color='darkgreen', alpha=0.1) figure.tight_layout() figure.savefig(path_join(results_output, "tests_by_age_cumm_3.pdf")) ax.invert_xaxis() figure.savefig(path_join(results_output, "tests_by_age_cumm_3_i.pdf"))
# but the range of age for non survivor is larger

# Box plots of the remaining numeric columns, split by survival.
#   Fare  : survivors tend to hold more expensive tickets
#           ==> richer people tend to survive more
#   SibSp : not much information
#   Parch : not much information
for _column in ("Fare", "SibSp", "Parch"):
    titanic.boxplot(column=_column, by="Survived")

from statsmodels.graphics.mosaicplot import mosaic

# Survival versus ticket class: most of the victims are from class 3.
table = pandas.crosstab(index=titanic["Survived"], columns=titanic["Pclass"])
print(table)
mosaic(data=titanic, index=["Pclass", "Survived"])

# Survival versus sex: female passengers were more likely to survive
# than male passengers.
table2 = pandas.crosstab(index=titanic["Survived"], columns=titanic["Sex"])
print(table2)
mosaic(data=titanic, index=["Survived", "Sex"])

# Survival versus port of embarkation: most people departed from port S.
table3 = pandas.crosstab(index=titanic["Survived"], columns=titanic["Embarked"])
print(table3)
mosaic(data=titanic, index=["Survived", "Embarked"])

# Fill the missing values of Age with the column median.
titanic["Age"] = titanic["Age"].fillna(value=titanic["Age"].median())
# Fold every entry that `mask` excludes into one aggregated "other" entry.
tail_prob = prob.loc[~mask].sum()
prob = prob.loc[mask]
if tail_prob != 0:
    prob['other'] = tail_prob
prob.plot(kind='bar')
plt.xticks(rotation=25)
plt.show()

## Max commercials during 8 AM and then in night hours, prime time

# In[55]:

from statsmodels.graphics.mosaicplot import mosaic

plt.rcParams['font.size'] = 0.5
# mosaic() opens its own figure unless it is handed an `ax`, so the former
# plt.figure(figsize=(200000, 100000)) only left an empty figure behind that
# is far too large to render. Create the axes explicitly and pass them in.
# NOTE(review): 20x10 inches keeps the 2:1 aspect of the old call and matches
# the scale of the crosstab plot below - adjust to taste.
mosaic_fig, mosaic_ax = plt.subplots(figsize=(20, 10))
mosaic(df, ['Category', 'Network'], ax=mosaic_ax)

## Observations
# Electronics & Communication on TBS was the best combo of all

# In[53]:

# cross tab of category and network
table1 = pd.crosstab(index=df["Category"], columns=df["Network"])
table1.plot(kind="bar", figsize=(20, 20), stacked=True)

# Observation
# same as above

# In[87]:
plt.title("Frequency of Sentiment Scores") plt.show() # In[21]: dfFull.ix[:, -5:-1].hist() plt.show() # In[62]: from statsmodels.graphics.mosaicplot import mosaic mosaic(dfFull, ["Age", "compound"]) # In[23]: dfFull["Continent"] = dfFull["Continent"].astype( "category", categories=["America", "Africa", "Asia", "Europe"], ordered=True ) # In[24]: dfFull["Income"] = dfFull["Income"].astype( "category", categories=["20000-34999", "<20000", "35000-49999", "50000-74999", "75000-99999", "100000+"], ordered=True,
titanic_df.shape  # (891, 12)

# have a peek view of data
titanic_df.head()
summary = titanic_df.describe()
nullsum = titanic_df.isnull().sum()

# mosaic plots
# Contingency tables of ticket class / sex against survival.
tb1 = pd.crosstab(index=titanic_df['Pclass'], columns=titanic_df['Survived'])
tb2 = pd.crosstab(index=titanic_df['Sex'], columns=titanic_df['Survived'])

from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt


def _class_tile_label(key):
    """Return the tb1 cell for a mosaic tile key (both parts cast to int)."""
    return tb1.loc[int(key[0]), int(key[1])]


def _sex_tile_label(key):
    """Return the tb2 cell for a mosaic tile key (second part cast to int)."""
    return tb2.loc[key[0], int(key[1])]


# Each tile is labelled with the matching cell of its contingency table.
fig1, ax1 = plt.subplots()
m11 = mosaic(tb1.stack(), labelizer=_class_tile_label, ax=ax1)
ax1.set_yticklabels(['Deceased', 'Survived'])
ax1.set_title("Ticket Class and Survivability")

fig2, ax2 = plt.subplots()
m22 = mosaic(tb2.stack(), labelizer=_sex_tile_label, ax=ax2)
ax2.set_yticklabels(['Deceased', 'Survived'])
ax2.set_title("Gender and Survivability")

# box plots
bp1 = titanic_df.boxplot(by='Survived', column='Age')
bp1.set_ylabel("Age")
bp1.set_xlabel("Survival")
def scatter_matrix_all(frame, alpha=0.5, figsize=None, grid=False, diagonal='hist', marker='.', density_kwds=None, hist_kwds=None, range_padding=0.05, **kwds):
    """Draw an n-by-n matrix of plots for numeric and categorical columns.

    The kind of plot in cell (row ``a``, column ``b``) depends on the columns:

    * diagonal, numeric column: histogram (``diagonal='hist'``) or Gaussian
      KDE (``diagonal in ('kde', 'density')``);
    * diagonal, non-numeric column: bar chart of ``value_counts()``;
    * two numeric columns: scatter plot with a lowess curve below the
      diagonal, the Pearson correlation as text above it;
    * two non-numeric columns: mosaic plot (below the diagonal only);
    * one numeric and one non-numeric column: box plots of the numeric column
      grouped by the other one (below the diagonal only).

    Cells above the diagonal stay empty unless both columns are numeric.

    Parameters
    ----------
    frame : DataFrame
    alpha : float
        Opacity of the scatter markers.
    figsize : tuple, optional
        Passed to ``plt.subplots``.
    grid : bool
        Accepted for signature compatibility; not used by this function.
    diagonal : {'hist', 'kde', 'density'}
    marker : str
        Scatter marker, passed through ``_get_marker_compat``.
    density_kwds, hist_kwds : dict, optional
        Extra keyword arguments for the KDE line / the histogram.
    range_padding : float
        Fraction of each column's value range added around the axis limits.
    **kwds
        Forwarded to ``ax.scatter``.

    Returns
    -------
    matplotlib Figure
    """
    df = frame
    num_cols = frame._get_numeric_data().columns.values
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) per column: data range for numeric columns, range of
    # the category counts for the others.
    boundaries_list = []
    for a in df.columns:
        if a in num_cols:
            values = df[a].values[mask[a].values]
        else:
            values = df[a].value_counts()
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            ax = axes[i, j]

            if i == j:
                if a in num_cols:  # numerical variable
                    values = df[a].values[mask[a].values]
                    # Deal with the diagonal by drawing a histogram there.
                    if diagonal == 'hist':
                        ax.hist(values, **hist_kwds)
                    elif diagonal in ('kde', 'density'):
                        # Imported lazily so scipy is only needed for KDE.
                        from scipy.stats import gaussian_kde
                        y = values
                        gkde = gaussian_kde(y)
                        ind = np.linspace(y.min(), y.max(), 1000)
                        ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                    ax.set_xlim(boundaries_list[i])
                else:  # categorical variable
                    values = df[a].value_counts()
                    ax.bar(list(range(df[a].nunique())), values)
            else:
                # Rows where both columns are present.
                common = (mask[a] & mask[b]).values

                # two numerical variables
                if a in num_cols and b in num_cols:
                    if i > j:
                        ax.scatter(df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds)
                        # The following 2 lines add the lowess smoothing
                        ys = lowess(df[a][common], df[b][common])
                        ax.plot(ys[:, 0], ys[:, 1], 'red')
                    else:
                        pearR = df[[a, b]].corr()
                        # Off-diagonal entry of the 2x2 correlation matrix;
                        # one positional lookup instead of chained indexing.
                        ax.text(df[b].min(), df[a].min(), 'r = %.4f' % (pearR.iloc[0, 1]))
                    ax.set_xlim(boundaries_list[j])
                    ax.set_ylim(boundaries_list[i])
                # two categorical variables
                elif a not in num_cols and b not in num_cols:
                    if i > j:
                        from statsmodels.graphics import mosaicplot
                        mosaicplot.mosaic(df, [b, a], ax, labelizer=lambda k: '')
                # one numerical variable and one categorical variable
                else:
                    if i > j:
                        # Drop rows with a missing value in either column:
                        # NaNs handed to boxplot() spoil the box statistics.
                        tol = df.loc[common, [a, b]]
                        if a in num_cols:
                            groups = list(tol.groupby(b))
                            label = [k for k, _ in groups]
                            values = [v[a].tolist() for _, v in groups]
                            ax.boxplot(values, labels=label)
                        else:
                            groups = list(tol.groupby(a))
                            label = [k for k, _ in groups]
                            values = [v[b].tolist() for _, v in groups]
                            ax.boxplot(values, labels=label, vert=False)

            ax.set_xlabel('')
            ax.set_ylabel('')

            _label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            # Only the first column keeps its y axis and only the last row
            # keeps its x axis.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)

    return fig
#print(data)
classes = ["Mammalia", "Aves", "Reptilia"]
statuses = ["Endangered", "Critically endangered", "Vulnerable"]

# Keep only the records whose class and status are both of interest.
mosaic_data = [
    item
    for item in data
    if item["Animal Class"] in classes and item["Category"] in statuses
]

# Tile colour per conservation status.
properties = {
    "Endangered": {"color": "#FACDB6"},
    "Critically endangered": {"color": "#C5CADE"},
    "Vulnerable": {"color": "#A8DBD2"},
}


def _tile_properties(key):
    """Look up the tile style by the first element of the tile key."""
    return properties[key[0]]


plt.rc("font", size=8)
mosaic_dataframe = pd.DataFrame(mosaic_data)
fig = mosaic(
    mosaic_dataframe,
    ["Category", "Animal Class"],
    properties=_tile_properties,
    gap=[0.02, 0.02],
    axes_label=True,
    title="Conservation Status by Animal Class",
)
plt.savefig("endangered_species.png")
ticket Ticket Number fare Passenger Fare cabin Cabin embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) ''' ######## ## Performing some EDA of the data ######## # Pull out survived as the response series response_series = df.Survived # Create some Mosiac plots to inspect the data mosaic(df,['Pclass','Survived'], title = 'Survival Rate by Class') mosaic(df,['Sex','Survived'], title = 'Survival Rate by Gender') ## Creating a function to pull out the titles of the Passengers def find_between( s, first, last ): try: start = s.index( first ) + len( first ) + 1 end = s.index( last, start ) return s[start:end] except ValueError: return "" # Test the function to pull out the titles test_title = find_between(df.Name[100],',','.') print test_title
def _column_kind(series):
    """Classify a column as 'is_numeric' or 'is_string' for plot selection.

    Integer and floating point dtypes (dtype kind 'i', 'u' or 'f') count as
    numeric. Every other dtype is treated like a string / categorical column.
    """
    # The removed NumPy aliases np.float / np.int / np.object are avoided by
    # looking at the dtype kind; dtypes without a `kind` count as non-numeric.
    if getattr(series.dtype, 'kind', 'O') in ('i', 'u', 'f'):
        return 'is_numeric'
    return 'is_string'


def view_image(headers_x: headers_x, headers_y: headers_y, colour_headers: colour_headers):
    """Draw the plot that fits the selected X / Y / colour columns of ``df``.

    The value ``'Select'`` means "nothing chosen" for each of the arguments.

    * no X column: nothing is drawn (a Y column alone is not plotted);
    * X only: count plot for a string column, distribution plot for a numeric
      one (one distribution per colour group when a colour column is chosen);
    * X and Y: numeric/numeric -> scatter plot (``lmplot`` without fit),
      numeric X / string Y -> violin plot, string X / numeric Y -> box plot,
      string/string -> mosaic plot, or a count plot when X and Y are the
      same column.

    The colour column, when chosen, is passed as ``hue``; the mosaic plot and
    the X == Y count plot ignore it.
    """
    data = df
    sns.set_context("notebook", font_scale=1.1)

    if headers_x == 'Select':
        return

    x = headers_x
    # hue=None is the seaborn default, so one call covers both colour cases.
    hue = None if colour_headers == 'Select' else colour_headers
    x_type = _column_kind(df[headers_x])

    # X selected but not Y
    #========================================================================
    if headers_y == 'Select':
        if x_type == 'is_string':
            g = sns.countplot(x=x, hue=hue, data=data)
            loc, labels = plt.xticks()
            g.set_xticklabels(labels, rotation=90)
        elif hue is None:
            sns.distplot(data[x])
        else:
            g = sns.FacetGrid(data, hue=hue)
            g = g.map(sns.distplot, x)
        plt.show()
        return

    # both X and Y have been selected
    #========================================================================
    y = headers_y
    y_type = _column_kind(df[headers_y])
    sns.set_style("ticks")

    if x_type == 'is_numeric' and y_type == 'is_numeric':
        # Numeric vs Numeric: scatter plot of the dataframe
        sns.lmplot(
            x=x,  # Horizontal axis
            y=y,  # Vertical axis
            data=data,  # Data source
            fit_reg=False,  # Don't fit a regression line
            hue=hue,  # Set color
            scatter_kws={"marker": "D"},
            legend=True)
    elif x_type == 'is_numeric':
        # Numeric vs String
        sns.violinplot(x=x, y=y, hue=hue, data=data)
    elif y_type == 'is_numeric':
        # String vs Numeric
        sns.boxplot(x=x, y=y, hue=hue, data=data)
    elif headers_x != headers_y:
        # String vs String
        mosaic(data, [x, y])
    else:
        # The same string column on both axes: fall back to a count plot.
        g = sns.countplot(x=x, data=data)
        loc, labels = plt.xticks()
        g.set_xticklabels(labels, rotation=90)
    plt.show()
df # %% # Let's calculate the Cramér's V coefficient for Survived and Pclass cramers_v(df['Survived'], df['Pclass'], bias_correction=False) # %% # Let's verify that Cramér's V is a symmetric function cramers_v(df['Survived'], df['Pclass']) == cramers_v(df['Pclass'], df['Survived']) # %% # You can also draw a mosaic plot for these variables mosaic(data=df, index=['Survived', 'Pclass'], statistic=True, axes_label=True, gap=[0.01, 0.02]) # %% # Take advantage of the asymmetry of Theil's U calculating it for the same variables. # This is U(Survived|Pcalss) that is "U for Survived given Pclass" theils_u(df['Survived'], df['Pclass']) # %% # Just check that the opposite direction gives you a different result theils_u(df['Pclass'], df['Survived']) # %% # Let's draw a violin plot of Age and Pclass violinPlot(data=df,
line_group="pop") fig.show() #%% gapminder #%% #Mekko chart import pandas as pd from statsmodels.graphics.mosaicplot import mosaic import pylab from itertools import product import numpy as np rand = np.random.random speaks_mul_foreign_languages = list(product(['male', 'female'], ['yes', 'no'])) index = pd.MultiIndex.from_tuples(speaks_mul_foreign_languages, names=['male', 'female']) data = pd.Series(rand(4), index=index) mosaic(data, gap=0.01, title='Who knows multiple foreign languages? - Mosaic Chart') pylab.show() #%% #Pie chart import matplotlib.pyplot as plt # Data to plot labels = 'Python', 'C++', 'Ruby', 'Java' sizes = [215, 130, 245, 210] colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue'] explode = (0.1, 0, 0, 0) # explode 1st slice # Plot plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=140)