def euclidean_dist(self):
    """Measure how well the synthetic data preserves attribute patterns.

    The original and synthetic datasets are converted with ``self.to_cat``,
    reduced with ``self.get_demographics``, and an association matrix
    (Theil's U, no bias correction, no plot) is computed for each. The
    pairwise Euclidean distances between the rows of the two association
    matrices are then collapsed into a single number with ``LA.norm``.

    The lower the value, the better the data generation tool preserves the
    patterns. The threshold limit for this metric is a value below 14.

    Returns:
        tuple: ``(eucl, eucl_matr)`` - the norm and the distance matrix it
        was computed from.
    """
    real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)

    # Demographic views first, association matrices second (same call
    # order as before: real, then synthetic).
    demographics = [
        self.get_demographics(real_cat),
        self.get_demographics(synth_cat),
    ]
    assoc_results = [
        associations(frame, theil_u=True, bias_correction=False, plot=False)
        for frame in demographics
    ]
    real_matrix = assoc_results[0]['corr']
    synth_matrix = assoc_results[1]['corr']

    pairwise = distance.cdist(real_matrix, synth_matrix, 'euclidean')
    return LA.norm(pairwise), pairwise
def pairwise_correlation_difference(self):
    """Compute the pairwise correlation difference (PCD).

    PCD is the norm (as computed by ``LA.norm``) of the element-wise
    difference between the association matrices of the real and the
    synthetic dataset. Both matrices are built from the output of
    ``self.to_cat`` / ``self.get_demographics`` using Theil's U without
    bias correction and without plotting.

    The smaller the PCD, the closer the synthetic data is to the real data
    in terms of correlations across the variables. The threshold limit for
    this metric is a value below 2.4.

    Returns:
        tuple: ``(prwcrdst, substract_m)`` - the norm and the difference
        matrix it was computed from.
    """
    real_cat, synth_cat = self.to_cat(self.origdst, self.synthdst)

    # Demographic views first, association matrices second (same call
    # order as before: real, then synthetic).
    demographics = [
        self.get_demographics(real_cat),
        self.get_demographics(synth_cat),
    ]
    assoc_results = [
        associations(frame, theil_u=True, bias_correction=False, plot=False)
        for frame in demographics
    ]
    real_matrix = assoc_results[0]['corr']
    synth_matrix = assoc_results[1]['corr']

    difference = np.subtract(real_matrix, synth_matrix)
    return LA.norm(difference), difference
def correlationMatrix(data, encoded):
    """Visualise the relationships between the columns of ``data``.

    Args:
        data: DataFrame to analyse.
        encoded: When truthy, a Spearman correlation matrix is drawn as an
            annotated seaborn heatmap. Otherwise ``nominal.associations``
            is called with Theil's U and a fixed list of nominal columns.
    """
    if not encoded:
        # Raw (non-encoded) data: let dython handle the nominal columns.
        nominal_columns = [
            'SendingCountry',
            'ReceivingCountry',
            'MobilityType',
            'SpecialNeeds',
            'SubjectAreaName',
            'LevelOfStudy',
            'ParticipantGender',
            'Language',
            'SendingPartnerErasmusID',
            'HostingPartnerCity',
        ]
        nominal.associations(data, theil_u=True,
                             nominal_columns=nominal_columns)
        return

    spearman = data.corr(method='spearman')
    plt.figure(figsize=(8, 8))
    sns.heatmap(spearman,
                vmax=1,
                square=True,
                annot=True,
                fmt='.2f',
                cmap='GnBu',
                cbar_kws={"shrink": .5},
                robust=True)
    # Plot title is user-facing text (Polish for "Correlation matrix
    # between features").
    plt.title('Macierz korelacji pomiędzy cechami', fontsize=20)
    plt.show()
def registration_correlations(passed_df=None,
                              save_path=None,
                              columns=None,
                              prediction_window=None,
                              scaled=False,
                              cmap='coolwarm'):
    """Plot a dython ``associations()`` heat-map of registration data.

    Uses a copy of ``passed_df`` when one is given; otherwise loads the
    registrations with ``load_OU_data`` for the given prediction window.
    The plot covers all remaining columns, or only the listed ones.

    Args:
        passed_df: (optional) DataFrame to plot; it is copied, not mutated.
        save_path: (optional, str) path to save the figure to.
        columns: (optional, list) columns to plot correlations between.
        prediction_window: (int) how far into the course the loaded
            dataframe should include. When None, the
            ``date_unregistration`` column is dropped as well.
        scaled: (boolean) whether to use ``CourseScaler`` to scale the data
            by course. Requires a ``code_module`` column.
        cmap: (default is 'coolwarm') colormap for plotted correlations.
    """
    import matplotlib.pyplot as plt
    from dython.nominal import associations

    # `passed_df == None` would compare element-wise on a DataFrame, so the
    # identity check is the only safe test here.
    if passed_df is None:
        df = load_OU_data(prediction_window=prediction_window)
    else:
        df = passed_df.copy()

    # Replace ordered labels with integer codes. Values are written in
    # place with .loc, exactly as before, so the column dtype is not
    # re-inferred.
    ordinal_codes = {
        'final_result': {
            'Withdrawn': 0,
            'Fail': 1,
            'Pass': 2,
            'Distinction': 3,
        },
        'age_band': {
            '0-35': 0,
            '35-55': 1,
            '55<=': 2,
        },
    }
    for column, codes in ordinal_codes.items():
        if column not in df.columns:
            continue
        for label, code in codes.items():
            df.loc[df[column] == label, column] = code

    to_drop = ['code_presentation', 'id_student', 'module_presentation_length']
    if prediction_window is None:
        to_drop.append('date_unregistration')
    df = df.drop(columns=[name for name in to_drop if name in df.columns])

    if scaled:
        if 'code_module' in df.columns:
            cs = CourseScaler(drop_course=False)
            df = cs.fit_transform(df)
        else:
            print('cannot scale, code_module not found in columns')

    if isinstance(columns, list):
        df = df[columns]

    # Figure and title sizes grow with the number of plotted columns.
    n_columns = len(df.columns)
    fig, ax = plt.subplots(1, 1,
                           figsize=(n_columns * 2**1.2, n_columns * 1.5**1.2))
    fig.suptitle('Variable Correlations', fontsize=n_columns * 2 + 5)
    associations(df, ax=ax, mark_columns=False, cmap=cmap)

    if isinstance(save_path, str):
        fig.savefig(save_path, dpi=250)
    plt.show()
def associations_example():
    """Plot an associations heat-map for the features of the Iris dataset.

    The integer ``target`` column is passed explicitly as a nominal column.
    """
    iris = datasets.load_iris()
    features = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    labels = pd.DataFrame(data=iris.target, columns=['target'])
    combined = pd.concat([features, labels], axis=1)
    associations(combined, nominal_columns=['target'])
def fit(self, data_original, data_synthetic):
    """Compute and store the association matrices of both datasets.

    Both inputs are first passed through ``self._check_input_data``. The
    ``'corr'`` entry of the ``associations`` result is stored in
    ``self.stats_original_`` and ``self.stats_synthetic_`` respectively.

    Returns:
        self, so calls can be chained.
    """
    data_original, data_synthetic = self._check_input_data(
        data_original, data_synthetic)

    # Identical settings for both datasets; nothing is plotted.
    settings = dict(nom_nom_assoc=self.nom_nom_assoc,
                    nominal_columns=self.nominal_columns,
                    nan_replace_value='nan',
                    compute_only=True)

    self.stats_original_ = associations(data_original, **settings)['corr']
    self.stats_synthetic_ = associations(data_synthetic, **settings)['corr']
    return self
def associations_example():
    """Plot an associations heat-map for the features of the Iris dataset."""
    iris = datasets.load_iris()

    # String class labels ('C0', 'C1', ...) instead of ints, so that the
    # associations method can recognise the target as categorical by itself.
    class_labels = ['C{}'.format(cls) for cls in iris.target]

    features = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    labels = pd.DataFrame(data=class_labels, columns=['target'])
    associations(pd.concat([features, labels], axis=1))
def main():
    """Run the mushroom dataset exploration: plots, counts and rules."""
    separator = "=============================="

    # Data importation
    df = pd.read_csv("agaricus-lepiota.csv")
    panda_rules()

    # Bar plots
    for attribute in ("odor", "spore-print-color"):
        make_barplot(df, attribute)

    # Heat-maps showing the association between attributes
    associations(df, nominal_columns="all")  # Cramer's V
    associations(df, nominal_columns="all", theil_u=True)  # Theil's U

    # Pie chart
    make_pie_chart(df, "odor")

    # Number of mushrooms for each odor
    count_value_for_attribute(df, "odor")

    # Each report below is followed by a separator line, in this order:
    # instances per class, edible by odor, toxic by odor, toxic by spore
    # colour, edible by spore colour, rule "green spore colour is
    # poisonous", rule "habitat = leaves and white cap is poisonous".
    reports = (
        find_nb_instance_class,
        count_edible_mush_by_odor,
        count_toxic_mush_by_odor,
        count_toxic_mush_by_spore_color,
        count_edible_mush_by_spore_color,
        show_rules_two,
        show_rules_four,
    )
    for report in reports:
        report(df)
        print(separator)
def categorical_matrix(dataframe, theil_u=True, return_results=False):
    """Display an association matrix that also covers categorical features.

    Args:
        dataframe: Input DataFrame. Rows with NaN are dropped, and
            'object', 'datetime' and 'timedelta' columns are excluded
            before the associations are computed.
        theil_u: Forwarded to dython's ``associations`` (Theil's U is an
            asymmetric measure of association for nominal features).
        return_results: Forwarded to ``associations``; when truthy the
            value it returns is returned to the caller.

    Returns:
        The ``associations`` result when ``return_results`` is truthy,
        otherwise None.
    """
    from dython.nominal import associations

    # 'category' and 'bool' columns are the ones treated as nominal.
    nominal_cols = [
        *dataframe.select_dtypes(include='category').columns,
        *dataframe.select_dtypes(include='bool').columns,
    ]

    # NaN rows are dropped to avoid errors, then unsupported dtypes go.
    usable = dataframe.dropna().select_dtypes(
        exclude=['object', 'datetime', 'timedelta'])

    result = associations(usable,
                          figsize=(15, 7),
                          theil_u=theil_u,
                          nominal_columns=nominal_cols,
                          return_results=return_results)

    if not return_results:
        return None
    return result
def categorical_matrix(dataframe):
    """Return a clustered association matrix for ``dataframe``.

    Rows containing NaN are dropped and 'object', 'datetime' and
    'timedelta' columns are excluded. The associations are computed
    without plotting (``plot=False``) and the resulting matrix is reordered
    with dython's ``cluster_correlations``.

    Args:
        dataframe: Input DataFrame.

    Returns:
        The clustered correlation/association matrix.
    """
    from dython.nominal import associations, cluster_correlations

    # Drop NaN rows to avoid errors, then drop dtypes that cannot be used.
    # The previous second, equivalent dtype filter on the same frame and
    # the never-used list of categorical columns were dead code.
    df_for_correlations = dataframe.dropna().select_dtypes(
        exclude=['object', 'datetime', 'timedelta'])

    assoc = associations(df_for_correlations,
                         nan_strategy='drop_samples',
                         figsize=(30, 30),
                         plot=False)

    # Reorder the matrix so that related columns sit next to each other.
    correlations_matrix, _ = cluster_correlations(assoc['corr'])
    return correlations_matrix
def plot_correlation_difference(real: pd.DataFrame,
                                fake: pd.DataFrame,
                                plot_diff: bool = True,
                                cat_cols: list = None,
                                annot=False,
                                model_dir_path="~"):
    """
    Plot the association matrices for the `real` dataframe, `fake` dataframe
    and plot the difference between them. Has support for continuous and
    Categorical (Male, Female) data types. All Object and Category dtypes
    are considered to be Categorical columns if `cat_cols` is not passed.

    - Continuous - Continuous: Uses Pearson's correlation coefficient
    - Continuous - Categorical: Uses so called correlation ratio
      (https://en.wikipedia.org/wiki/Correlation_ratio) for both
      continuous - categorical and categorical - continuous.
    - Categorical - Categorical: Uses Theil's U, an asymmetric correlation
      metric for Categorical associations

    The figure is saved as ``correlation.jpg`` inside `model_dir_path` and
    then shown.

    :param real: DataFrame with real data
    :param fake: DataFrame with synthetic data
    :param plot_diff: Plot difference if True, else not
    :param cat_cols: List of Categorical columns
    :param boolean annot: Whether to annotate the plot with numbers
        indicating the associations.
    :param model_dir_path: Directory the figure is written to. A leading
        ``~`` is expanded to the user's home directory.
    """
    import os

    assert isinstance(real, pd.DataFrame), '`real` parameters must be a Pandas DataFrame'
    assert isinstance(fake, pd.DataFrame), '`fake` parameters must be a Pandas DataFrame'

    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    if cat_cols is None:
        # Pass the column *names*; the previous code handed the whole
        # sub-DataFrame to `associations` as `nominal_columns`.
        cat_cols = real.select_dtypes(['object', 'category']).columns.tolist()

    if plot_diff:
        fig, ax = plt.subplots(1, 3, figsize=(24, 7))
    else:
        fig, ax = plt.subplots(1, 2, figsize=(20, 8))

    real_corr = associations(real, nominal_columns=cat_cols, plot=False,
                             theil_u=True, mark_columns=True, annot=annot,
                             ax=ax[0], cmap=cmap)['corr']
    fake_corr = associations(fake, nominal_columns=cat_cols, plot=False,
                             theil_u=True, mark_columns=True, annot=annot,
                             ax=ax[1], cmap=cmap)['corr']

    if plot_diff:
        diff = abs(real_corr - fake_corr)
        sns.set(style="white")
        sns.heatmap(diff, ax=ax[2], cmap=cmap, vmax=.3, square=True,
                    annot=annot, center=0, linewidths=.5,
                    cbar_kws={"shrink": .5}, fmt='.2f')

    titles = ['Real', 'Synthetic', 'Difference'] if plot_diff else ['Real', 'Synthetic']
    title_font = {'size': '18'}
    for i, label in enumerate(titles):
        ax[i].set_title(label, **title_font)

    plt.tight_layout()
    # matplotlib does not expand "~" itself, so with the default value the
    # figure used to be written relative to a directory literally named "~".
    plt.savefig(os.path.expanduser(model_dir_path) + "/correlation.jpg")
    plt.show()
def plot_correlation_comparison(evaluators: List, annot=False):
    """
    Plot the correlation differences of multiple TableEvaluator objects.

    The top row shows the association matrix of the real data (taken from
    the first evaluator) followed by the matrix of each evaluator's fake
    data. The bottom row shows, below each fake matrix, its absolute
    difference to the real matrix. Only the last column gets a colour bar.

    :param evaluators: list of TableEvaluator objects
    :param boolean annot: Whether to annotate the plots with numbers.
    """
    nr_plots = len(evaluators) + 1
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    _, grid = plt.subplots(2, nr_plots, figsize=(4 * nr_plots, 7))
    axes = grid.flatten()
    axes[nr_plots + 1].clear()

    def _association_matrix(frame, axis, with_cbar):
        # Shared settings; the nominal columns always come from the first
        # evaluator.
        return associations(frame,
                            nominal_columns=evaluators[0].categorical_columns,
                            plot=False,
                            theil_u=True,
                            mark_columns=True,
                            annot=False,
                            cmap=palette,
                            cbar=with_cbar,
                            ax=axis)['corr']

    real_corr = _association_matrix(evaluators[0].real, axes[0], False)

    # Top row: one association matrix per evaluator's fake data.
    fake_corr = []
    for i in range(1, nr_plots):
        is_last = i % (nr_plots - 1) == 0
        fake_corr.append(
            _association_matrix(evaluators[i - 1].fake, axes[i], is_last))
        if is_last:
            axes[i].collections[0].colorbar.ax.tick_params(labelsize=20)

    # Bottom row: absolute difference between real and fake matrices.
    for i in range(1, nr_plots):
        is_last = i % (nr_plots - 1) == 0
        diff = abs(real_corr - fake_corr[i - 1])
        sns.set(style="white")
        heat = sns.heatmap(diff,
                           ax=axes[i + nr_plots],
                           cmap=palette,
                           vmax=.3,
                           square=True,
                           annot=annot,
                           center=0,
                           linewidths=0,
                           cbar=is_last,
                           fmt='.2f')
        if is_last:
            heat.collections[0].colorbar.ax.tick_params(labelsize=20)

    # Column titles: evaluator name, or its index when it has no name.
    titles = ['Real']
    titles += [e.name if e.name is not None else idx
               for idx, e in enumerate(evaluators)]
    title_font = {'size': '28'}
    for i, label in enumerate(titles):
        upper, lower = axes[i], axes[i + nr_plots]
        upper.set_yticklabels([])
        upper.set_xticklabels([])
        lower.set_yticklabels([])
        lower.set_xticklabels([])
        upper.set_title(label, **title_font)

    plt.tight_layout()
def associations(dataset,
                 nominal_columns=None,
                 mark_columns=False,
                 theil_u=False,
                 plot=True,
                 return_results=False,
                 **kwargs):
    """
    Thin pass-through kept for convenience.
    See 'associations' in the 'nominal' module.
    """
    # Arguments are forwarded positionally, in the same order as received.
    forwarded = (dataset, nominal_columns, mark_columns, theil_u, plot,
                 return_results)
    return nominal.associations(*forwarded, **kwargs)
def associations_iris_example():
    """
    Plot an example of an associations heat-map of the Iris dataset
    features. All features of this dataset are numerical (except for the
    target).

    Returns whatever ``associations`` returns for the assembled frame.
    """
    iris = datasets.load_iris()

    # String class labels ('C0', 'C1', ...) instead of ints, so that the
    # associations method automatically recognises the target column as
    # categorical.
    class_labels = ['C{}'.format(cls) for cls in iris.target]

    features = pd.DataFrame(data=iris.data, columns=iris.feature_names)
    labels = pd.DataFrame(data=class_labels, columns=['target'])
    combined = pd.concat([features, labels], axis=1)

    return associations(combined)
def associations_mushrooms_example():
    """
    Plot an example of an associations heat-map of the UCI Mushrooms
    dataset features. All features of this dataset are categorical.
    This example will use Theil's U.

    Returns whatever ``associations`` returns for the loaded frame.
    """
    column_names = [
        'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
        'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
        'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        'stalk-surface-below-ring', 'stalk-color-above-ring',
        'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
        'ring-type', 'spore-print-color', 'population', 'habitat'
    ]

    # Download and load data from UCI. The file has no header row, so the
    # names are supplied here; reading with the default header and renaming
    # afterwards silently discarded the first record.
    df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
        header=None,
        names=column_names)

    # Plot features associations
    return associations(df, theil_u=True, figsize=(15, 15))
# %% correlation_ratio(categories=t1['topic'], measurements=t1['score']) # %% t2 = pd.DataFrame({ 'topic': [ 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Geometry', 'Geometry', 'Geometry', 'Geometry', 'Statistics', 'Statistics', 'Statistics', 'Statistics', 'Statistics', 'Statistics' ], 'score': [36, 36, 36, 36, 36, 33, 33, 33, 33, 78, 78, 78, 78, 78, 78] }) violinPlot(data=t2, varx='topic', vary='score', title='', xlab='', ylab='') # %% correlation_ratio(categories=t2['topic'], measurements=t2['score']) # %% # Calculate correlation coefficients for a Pandas dataframe regardless column data types ass = associations(df, nom_nom_assoc='theil', num_num_assoc='pearson', figsize=(10, 10), clustering=True) # %% ass['corr'] # %%
plt.show() #Relation between attributes plt.figure(figsize=(14, 12)) foo = sns.heatmap(train.corr(), vmax=0.6, square=True, annot=True) plt.show() '''I found a new package! Although in its tutorial showed its performance on mixed categorical-numerical datasets, I just get the categorical becuase the the computation is so heavy! ''' from dython.nominal import associations train_categorical = train[[ 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip', 'device_model' ]] associations(train_categorical, theil_u=True, figsize=(30, 30)) ''' Seems that 'Unnamed: 0', 'click', 'hour', 'C14', 'C17' are highly correlated to click rate. In the next section we will plot their inter connections ''' #Pairplot warnings.filterwarnings(action="ignore") cols = ['Unnamed: 0', 'click', 'hour', 'C14', 'C17'] g = sns.pairplot(data=train, vars=cols, size=1.5, hue='click') g.set(xticklabels=[]) plt.show() plt.tight_layout() #jointplot of correlated sns.jointplot("C14",
def main():
    """Explore the auto-mpg dataset and write the plots to ``plots/``.

    Reads ``data/auto-mpg.data-original``, prints basic diagnostics, drops
    incomplete rows and the ``car_name`` column, then produces the target
    distribution, pair plots, box plots and the association heat-map.
    """
    # Read csv file and prepare dataframe. The separator is a regular
    # expression, hence the raw string ("\s" is not a valid str escape).
    data = pd.read_csv("data/auto-mpg.data-original",
                       sep=r"\s+",
                       header=None,
                       names=[
                           "mpg", "cylinders", "displacement", "horsepower",
                           "weight", "acceleration", "model_year", "origin",
                           "car_name"
                       ])
    # NOTE: info() prints its report itself; print() then also shows its
    # None return value.
    print(data.info())
    print(data.shape)

    # Find how many null entries there are
    print(data.isnull().sum())

    # Drop rows that contain missing values
    data = data.dropna()

    # Drop car_name feature since it has too many unique values
    # TODO: Create etl to map car names to model
    data = data.drop(["car_name"], axis=1)

    # Need to look at the target variable
    print(data.mpg.describe())

    # Plot the distribution of the target variable
    sns.distplot(data.mpg)
    plt.savefig("plots/mpg_distribution.jpg")
    plt.show()

    categorical_features = ("cylinders", "origin", "model_year")

    # Plot all numerical features against each other, coloured by each
    # categorical feature in turn; the figures are shown together at the end.
    numerical_features = ["displacement", "horsepower", "weight",
                          "acceleration"]
    for hue in categorical_features:
        sns.pairplot(data, vars=numerical_features, hue=hue)
        plt.savefig(f"plots/pair_plot_{hue}.jpg")
    plt.show()

    # Plot categorical values against the target variable, with the overall
    # mean mpg as a dashed reference line.
    for feature in categorical_features:
        sns.boxplot(x=feature, y="mpg", data=data)
        plt.axhline(data.mpg.mean(),
                    color='r',
                    linestyle='dashed',
                    linewidth=2)
        plt.savefig(f"plots/box_plot_{feature}.jpg")
        plt.show()

    # Get correlation plot
    associations(data, figsize=(15, 15), cmap="viridis")
    plt.show()
] train['capital.loss'] = [ float(re.sub(',', '.', aa)) for aa in train['capital.loss'] ] train['capital.gain'] = [ float(re.sub(',', '.', aa)) for aa in train['capital.gain'] ] #===================================== # Select the test set #===================================== competitors = os.listdir(res_folder + design + '/' + sub_design) assocs = {} assoc_test = associations(test.astype(dtype), nominal_columns = list(test.columns[cat_features]),\ plot = False)['corr'] #ax = axs[0], cbar = False, annot = False assocs['Test'] = assoc_test for c_idx, competitor in enumerate(competitors): try: preds = pd.read_csv(res_folder + design + '/' + sub_design + '/' + competitor\ + '/preds' + str(filenum) + '.csv', sep = ',') if preds.shape[1] == 1: preds = pd.read_csv(res_folder + design + '/' + sub_design + '/' + competitor\ + '/preds' + str(filenum) + '.csv', sep = ';') #if preds.shape[1] == p + 1: #preds = preds.iloc[:, 1:] except ParserError: preds = pd.read_csv(res_folder + design + '/' + sub_design + '/' + competitor\
def correlations(data):
    """Draw the associations heat-map of ``data`` and show it."""
    heatmap_options = {"figsize": (15, 15), "cmap": "viridis"}
    associations(data, **heatmap_options)
    plt.show()
def associations_example():
    """Plot an associations heat-map for the features of the Iris dataset.

    The integer ``target`` column is passed explicitly as a nominal column.
    """
    iris = datasets.load_iris()
    frames = [
        pd.DataFrame(data=iris.data, columns=iris.feature_names),
        pd.DataFrame(data=iris.target, columns=['target']),
    ]
    associations(pd.concat(frames, axis=1), nominal_columns=['target'])
cate_col = [ 'payment_plan', 'program_name', 'application_type_name', 'referrer', 'gender', 'home_country', 'work_country', 'practice_type', 'professional_assoc', 'home_state', 'work_state', 'Orientation' ] nume_col = [ "response", "hours_online", 'preprobation', 'currentafterpreprbation', "Unit 1", "Unit 2", "Unit 3", "Unit 4" ] # Numerical Heatmap #nominal.associations(x[nume_col]) # Total Heatmap nominal.associations(x, nominal_columns=cate_col) # Drop the unnecessary column drop_column = [ "Unit 1", "Unit 2", "Unit 3", "Unit 4", "status_id_binary", "user_id", "application_id", "gender", "application_type_name", "home_country", "home_state", "professional_assoc" ] tmp = x.drop(columns=drop_column) # One Hot-Encoding cate_col_new = [x for x in cate_col if x not in drop_column] x_train = pd.get_dummies(data=tmp, columns=cate_col_new) columns = list(x.columns) # Data Split train_x, test_x, train_y, test_y = train_test_split(x_train,
def associations_example():
    """Plot the associations of the module-level ``data`` frame.

    Every column of ``data`` is passed as a nominal column.
    """
    frame = pd.DataFrame(data=data, columns=data.columns.values)
    all_columns = data.columns.values
    associations(frame, nominal_columns=all_columns)
gow.append((true.astype(int) != imputed.astype(int)).mean()) error.loc[method, 'gow'] = np.mean(gow) #error.T[['full']].T.to_csv(res + 'Run' + str(run_idx) + '/res' + dataset_name + '.csv', index = False) #============================= # Comparing associations structure #============================= import seaborn as sns from dython.nominal import compute_associations, associations from sklearn.metrics.pairwise import cosine_similarity original_assoc = compute_associations(full_pima, nominal_columns = cat_features) associations(full_pima, nominal_columns = cat_features) Ez = out2['Ez.y'] vc = vars_contributions(completed_y2, Ez, assoc_thr = 0.0, \ title = 'Contribution of the variables to the latent dimensions',\ storage_path = None) assoc = cosine_similarity(vc, dense_output=True) labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'D.P. Function', 'Age', 'Outcome'] fig, axn = plt.subplots(1, 2, sharex=True, sharey=True, figsize = (12,10)) cbar_ax = fig.add_axes([.91, .3, .03, .4])
s_full = cosine_similarity(vc_full, dense_output=True) # Compare the representation between full and complete idx = 0 fig, ax = plt.subplots(figsize = (4,4)) ax.scatter(s2[idx], s_full[idx]) plt.title(full_contra.columns[idx]) for i, txt in enumerate(full_contra.columns): ax.annotate(txt, (s2[idx][i], s_full[idx][i])) ax.set_xlim([-1,1]) ax.set_ylim([-1,1]) # Compare the representation between completed cosine similarity and original associations associations(complete_y.astype(float), nominal_columns = cat_features) associations(completed_y.astype(float), nominal_columns = cat_features) associations(full_contra.astype(float), nominal_columns = cat_features) assoc = compute_associations(full_contra.astype(float), nominal_columns = cat_features).values idx = 0 fig, ax = plt.subplots(figsize = (4,4)) ax.scatter(assoc[idx], s_full[idx]) plt.title(full_contra.columns[idx]) for i, txt in enumerate(full_contra.columns): ax.annotate(txt, (assoc[idx][i], s_full[idx][i])) ax.set_xlim([-1,1]) ax.set_ylim([-1,1])
# Derive per-guest ratio features: each listed column divided by
# "accommodates". The new columns are registered as numerical attributes.
base_col = "accommodates"
for col in ["log_bedrooms", "log_bathrooms"]:
    new_col_name = f"{col}_per_{base_col}"
    df[new_col_name] = df[col] / df[base_col]
    num_attribs.append(new_col_name)

# ### Feature Elimination
# We can also examine the correlation heatmap, which would serve as guidance for further data elimination.

# In[970]:

# Heat-map of the numerical attributes, drawn on a large explicit axis and
# saved to disk.
fig, ax = plt.subplots(figsize=(30, 30))
associations(df[num_attribs], theil_u=True, ax=ax)
plt.xticks(rotation=45)
plt.suptitle("Initial Correlation Heatmap")
plt.savefig("../imgs/correlation_heatmap_before.png")

# Check the saved correlation heatmap [here](../imgs/correlation_heatmap_before.png).
#
# We now proceed to examine the highly correlated ones.

# In[971]:

# Names of the review-score columns.
review_col_names = [
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value'
]
def ml3(self):
    """Plot Theil's U associations of the module-level ``df``.

    The 'Process' and 'Library' columns are treated as nominal.
    """
    global df
    nominal_columns = ['Process', 'Library']
    nominal.associations(df, nominal_columns=nominal_columns, theil_u=True)
def correlations(data):
    """Draw the associations heat-map of ``data``, save it and show it.

    The figure is written to ``plots/correlations.png``.
    """
    heatmap_options = {"figsize": (15, 15), "cmap": "viridis"}
    associations(data, **heatmap_options)
    plt.savefig("plots/correlations.png")
    plt.show()
def exploratory_data_analysis(data,
                              categoricals,
                              numericals,
                              plot_with_target=False,
                              plot_corr_mat=False,
                              save=False):
    """Plot distributions of categorical and numerical columns of ``data``.

    Args:
        data: DataFrame to explore; the target column is 'Churn'.
        categoricals: Columns drawn as count plots.
        numericals: Columns drawn as distribution plots and box plots.
        plot_with_target: Also draw each column split by 'Churn'.
        plot_corr_mat: Draw the associations of tenure, MonthlyCharges,
            TotalCharges and Churn first.
        save: Save every figure as a PNG in the working directory before
            showing it.
    """

    def _finish(filename):
        # Optionally persist the current figure, then display it.
        if save:
            plt.savefig(fname=filename)
        plt.show()

    if plot_corr_mat:
        associations(
            data[["tenure", "MonthlyCharges", "TotalCharges", "Churn"]])

    for cat in categoricals:
        cat_name = str(cat)

        sns.countplot(x=cat, data=data)
        plt.title("Distribution of " + cat_name)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.xlabel(None)
        _finish(cat_name + "_count.png")

        if plot_with_target and cat != 'Churn':
            splot = sns.countplot(x=cat, data=data, hue='Churn')
            plt.title("Distribution of " + cat_name +
                      " dependent to target variable")
            plt.xticks(rotation=90)
            plt.tight_layout()
            plt.xlabel(None)
            # Label every bar with its share of all rows, in percent.
            for bar in splot.patches:
                bar_height = bar.get_height()
                share = bar_height / float(len(data)) * 100
                splot.text(bar.get_x() + bar.get_width() / 2.,
                           bar_height + 3,
                           '{:1.2f}'.format(share) + "%",
                           ha="center")
            _finish(cat_name + "_count_target.png")

    for num in numericals:
        num_name = str(num)

        sns.distplot(data[num])
        plt.tight_layout()
        plt.title("Distribution of " + num_name)
        plt.xlabel(None)
        _finish(num_name + "_dist.png")

        if plot_with_target:
            # One overlaid distribution per 'Churn' group.
            data.groupby("Churn")[num].apply(
                lambda group: sns.distplot(group, label=group.name))
            plt.tight_layout()
            plt.title("Distribution of " + num_name +
                      " dependent to target variable")
            plt.xlabel(None)
            plt.legend()
            _finish(num_name + "_dist_target.png")

        sns.boxplot(x=num, data=data)
        plt.tight_layout()
        plt.title(num_name + " boxplot")
        plt.xlabel(None)
        _finish(num_name + "_box.png")
df_pvals[column1] = p_list # In[66]: df_p.index = df_p.columns p_vals = df_p.values print(p_vals[p_vals > .01]) print(np.nonzero(p_vals > .01)) # Running the $\chi^2$ test on all pairs of variables returns only four unique pairs that fail to reject the null hypothesis on a selected $\alpha = 0.01$, the null hypothesis being that the two selected variables are independent of each other. I opted for a lower value of $\alpha$ because I believed the relatively large sample size I'm using would otherwise make it easier to detect false positives. The pairs that fail to reject the null hypothesis ((7, 20), (7, 21), (7, 22), and (9, 16)) correspond to (`flushot`, `any_exercise`), (`flushot`, `blindness`), (`flushot`, `trouble_concentrating`), and (`sex`, `kidney_disease`). Ultimately, I am most interested in the significance of values related to the target variable, `diabetes`, and all variables related to `diabetes` successfully reject the null hypothesis. # # An additional test I can run after computing the $\chi^2$ statistics and their p-value is Cramér's V, which uses value of $\chi^2$ to compute the strength of association between two variables. This measure of association ranges from 0 to 1 and serves a similar purpose to measuring the correlation between two continuous variables, so it seems like a potentially useful tool to use on data that is entirely categorical (whether ordinal or nominal). Again, I am most interested in the strength of association between `diabetes` and all other variables, but I have also computed Cramér's V for all other pairs since it may be useful to know how strongly associated two features are to each other when it comes time to build the machine learning model. # # The bias correction operation is included in the computation of Cramér's V. 
# In[4]: from dython.nominal import associations associations(df_diabetes, figsize=(15, 15)) #Bias correction set to True by default plt.show() # Although most pairs of features were found to likely have some association by their low p-values on the $\chi^2$ test, very few pairs of features have a strength of association higher than $0.25$. Perhaps the large sample size made it possible to be reasonably confident that even weak associations were statistically significant (recall that this is related to the reason I selected a very low p-value). Three of the four pairs of features that failed to reject the null hypothesis on the chi-squared test were also found to have a Cramér's of $0.00$. The fourth, `flushot` and `blindness`, has a Cramér's V of $0.01$. # # On `diabetes`, the strongest association, $0.30$, is to `general_health`, and the lowest, $0.02$, is to `mscode`. The next-highest associations to `diabetes` are to `bmi_category` at $0.23$, `employed` at $0.20$, `income` at $0.18$, and `age_category` at $0.17$. # # Some pairs of features, such as `mental_health_days_per_month` and `depressive_disorder`, have a moderate degree of association to each other. However, no pair of features has an association higher than $0.44$, so at this point it does not seem especially likely that any of the predictive variables will be too strongly associated with each other, so all of the features recommended by the scientific literature on diabetes will likely be retained by the model. # In[ ]: