def column_correlations(dataset_a, dataset_b, categorical_columns, theil_u=True):
    """
    Column-wise correlation calculation between ``dataset_a`` and ``dataset_b``.

    :param dataset_a: First DataFrame
    :param dataset_b: Second DataFrame
    :param categorical_columns: The columns containing categorical values.
        ``None`` means no categorical columns; the string ``'all'`` treats
        every column as categorical.
    :param theil_u: Whether to use Theil's U. If False, use Cramer's V.
    :return: Mean correlation between all columns.
    """
    if categorical_columns is None:
        categorical_columns = list()
    elif categorical_columns == 'all':
        categorical_columns = dataset_a.columns
    # Both frames must share an identical column layout.
    assert dataset_a.columns.tolist() == dataset_b.columns.tolist()
    corr = pd.DataFrame(columns=dataset_a.columns, index=['correlation'])
    for column in dataset_a.columns.tolist():
        if column in categorical_columns:
            # NOTE: both columns are sorted independently, so the association
            # is computed between the two marginal distributions, not between
            # row-paired values.
            if theil_u:
                corr[column] = theils_u(dataset_a[column].sort_values(),
                                        dataset_b[column].sort_values())
            else:
                # Bug fix: was ``sort_vaues()`` (typo -> AttributeError).
                corr[column] = cramers_v(dataset_a[column].sort_values(),
                                         dataset_b[column].sort_values())
        else:
            # Numeric column: Pearson correlation of the sorted values;
            # the p-value is discarded.
            corr[column], _ = ss.pearsonr(dataset_a[column].sort_values(),
                                          dataset_b[column].sort_values())
    # The original ``corr.fillna(value=np.nan)`` was a no-op and was removed;
    # any NaN correlation still propagates into the mean.
    correlation = np.mean(corr.values.flatten())
    return correlation
def corr_categories(df):
    """Compute Theil's U for every ordered pair of distinct columns of ``df``.

    :param df: DataFrame of categorical columns.
    :return: DataFrame with columns ``Var1``, ``Var2``, ``Corr_Cat``, sorted
        by ``Corr_Cat`` descending. Theil's U is asymmetric, so both (i, j)
        and (j, i) rows are produced.
    """
    # Bug fix: ``DataFrame.append`` was removed in pandas 2.0 — collect rows
    # in a list and build the frame once (also avoids quadratic copying).
    rows = []
    for var1 in df.columns:
        for var2 in df.columns:
            if var1 != var2:
                rows.append({'Var1': var1,
                             'Var2': var2,
                             'Corr_Cat': theils_u(df[var1], df[var2])})
    df1 = pd.DataFrame(rows, columns=['Var1', 'Var2', 'Corr_Cat'])
    return df1.sort_values(by=['Corr_Cat'], ascending=False)
def correlation(df, y):
    """Print an association score between each column of ``df`` and ``y``.

    Object-dtype (categorical) columns are scored with Theil's U against
    ``y``; all other columns use the correlation ratio. One line per column
    is printed; nothing is returned.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype == object:
            # categorical vs categorical
            score = theils_u(column, y)
        else:
            # categorical target vs numeric column
            score = correlation_ratio(y, column)
        print('{} = {:.3f}'.format(name, score))
def calculateUncertanityCoeff(df: DataFrame, labels: List[str]) -> List[List[float]]:
    """Calculates Theil's U uncertainity coefficient.

    Implemented as in:
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

    Args:
        df: dataframe one-hot encoding for M labels
        labels (1,M): name of columns with M labels, one-hot encoding

    Returns:
        uncertanity_coeff (M,M): nested list where entry [i][j] is
        ``theils_u(df[labels[i]], df[labels[j]])``. Theil's U is asymmetric,
        so the matrix is not symmetric in general.
    """
    # Build the full M x M matrix in one pass; each row conditions the same
    # source label on every target label.
    return [
        [theils_u(df[row_label], df[col_label]) for col_label in labels]
        for row_label in labels
    ]
def get_correlation_dataframe(data, **kwargs):
    """Compute a pairwise association table for all columns of ``data``.

    Nominal-nominal pairs use Theil's U, metrical-metrical pairs use Pearson's
    r (rescaled from [-1, 1] to [0, 1]), and mixed pairs use the correlation
    ratio. The column type is inferred from the first value of each column
    (label ``0``) — assumes a default RangeIndex; TODO confirm with callers.

    Parameters
    ----------
    data: pandas.DataFrame
        DataFrame with nominal or metrical columns
    kwargs:
        show_progress: bool, default=False
            Prints each row if True

    Returns
    -------
    data_corr: pandas.DataFrame
        Columns ``variable1``, ``variable2``, ``correlation``,
        ``correlation_rounded``; one row per classifiable column pair.
    """
    show_progress = kwargs.get('show_progress', False)
    # Bug fix: ``DataFrame.append`` was removed in pandas 2.0 — accumulate
    # rows in a list and build the frame once at the end.
    rows = []
    for variable1 in data:
        for variable2 in data:
            sample1 = data[variable1][0]
            sample2 = data[variable2][0]
            # nominal-nominal -> Theils U
            if type(sample1) == str and type(sample2) == str:
                corr = nominal.theils_u(data[variable1],
                                        data[variable2],
                                        nan_replace_value='f')
            # metircal-metrical -> Pearsons R
            elif util_func.is_number(sample1) and util_func.is_number(sample2):
                # Bug fix: the ``scipy.stats.stats`` submodule path was
                # removed from modern scipy; use ``scipy.stats`` directly.
                corr = scipy.stats.pearsonr(data[variable1],
                                            data[variable2])[0]
                # change range from [-1, 1] to [0, 1] as the other metrics
                corr = (corr + 1) / 2
            # metrical-nominal -> correlation ratio
            elif type(sample1) == str and util_func.is_number(sample2):
                corr = nominal.correlation_ratio(data[variable1],
                                                 data[variable2],
                                                 nan_replace_value='f')
            elif type(sample2) == str and util_func.is_number(sample1):
                corr = nominal.correlation_ratio(data[variable2],
                                                 data[variable1],
                                                 nan_replace_value='f')
            else:
                print('var1-type: ' + str(type(sample1)) + ', var2-type: ' +
                      str(type(sample2)))
                print('var1: ' + str(sample1) + ', var2: ' + str(sample2))
                # Bug fix: the original fell through and appended a row using
                # a stale (or unbound) ``corr`` from a previous iteration.
                continue
            new_row = {
                'variable1': variable1,
                'variable2': variable2,
                'correlation': corr,
                'correlation_rounded': round(corr, 2)
            }
            rows.append(new_row)
            if show_progress:
                print(new_row)
    data_corr = pd.DataFrame(rows, columns=[
        'variable1', 'variable2', 'correlation', 'correlation_rounded'
    ])
    return data_corr
def categorical_categorical(df, col1, col2):
    """Print Theil's U between two categorical columns of ``df``.

    Note Theil's U is asymmetric: this reports U(col1 | col2).
    Returns nothing; the score is only printed.
    """
    print(theils_u(df[col1], df[col2]))
corr = correlation_ratio(df['subscribed'], df[col]) print('{:.3f}'.format(corr)) print('\n\n\n') for col in df.columns: if df[col].dtype == object: print('{} vs Subscribed'.format(col)) corr = theils_u(df['subscribed'], df[col]) print('{:.3f}'.format(corr)) """ #Marital status and Loans: is there a relation? print('\nAnalyzing marital status vs Loans:\n') status = df.marital.unique() for s in status: print(s) data = df.groupby(df.marital).get_group(s)['loan'].value_counts() print('{:.2f}{}'.format(100 * data['yes'] / (data['yes'] + data['no']), '%')) corelation = theils_u(df['loan'], df['marital']) print('\nCorrelation ratio = {:.3f}'.format(corelation)) print('Marital status does not influence loans') #Jobs and Durations: again anova_test(df)
df_all = pd.read_csv('C:/Users/vince_000/Documents/BPI Challenge 2019/New_Exports/clusters_02_02_01.csv') # Build database connection engine = db.create_engine('mssql+pyodbc://adminuser:Yxcvbnm@[email protected]/ProMi?driver=ODBC+Driver+17+for+SQL+Server') con = engine.connect() metadata = db.MetaData(schema = 'stg') table = db.Table('case_table_filtered',metadata,autoload = True, autoload_with=engine) ResultProxy = con.execute(db.select([table])) ResultSet = ResultProxy.fetchall() df_export = pd.DataFrame(ResultSet) df_export.columns = ResultSet[0].keys() df_val = df_all.merge(df_export, left_on= 'case', right_on = '_case_concept_name_') column_name = ['_case_Spend_area_text_', '_case_Sub_spend_area_text_', '_case_Name_','_case_Vendor_'] theils_u = [] cramers_v = [] for c in column_name: theils_u.append(nl.theils_u(df_val['cluster'] , df_val[c])) cramers_v.append(nl.cramers_v(df_val[c], df_val['cluster'] )) df_nominal_corr = pd.DataFrame({'column' : column_name, 'uncertainty coefficient': theils_u}) df_nominal_corr.to_csv('C:/Users/vince_000/Documents/GitHub/BPI_Challenge_2019/Python/Clustering/Correlation_to_Clustering_in_02_02_01/Correlation_to_Clustering_in_02_02_01.csv', index = False) #group_analysis = df_val.filter(['_case_Vendor_', 'cluster', '_case_concept_name_']).groupby(['_case_Vendor_', 'cluster']).count() #
# Let's verify that Cramér's V is a symmetric function cramers_v(df['Survived'], df['Pclass']) == cramers_v(df['Pclass'], df['Survived']) # %% # You can also draw a mosaic plot for these variables mosaic(data=df, index=['Survived', 'Pclass'], statistic=True, axes_label=True, gap=[0.01, 0.02]) # %% # Take advantage of the asymmetry of Theil's U calculating it for the same variables. # This is U(Survived|Pcalss) that is "U for Survived given Pclass" theils_u(df['Survived'], df['Pclass']) # %% # Just check that the opposite direction gives you a different result theils_u(df['Pclass'], df['Survived']) # %% # Let's draw a violin plot of Age and Pclass violinPlot(data=df, varx='Pclass', vary='Age', title='Passenger age VS Passenger class', xlab='Pclass', ylab='Age') # In case you're not using a Jupyter notebook run also the following: # plt.show()
def theils_u(x, y):
    """Compute Theil's uncertainty coefficient U(x|y).

    U(x|y) = (H(x) - H(x|y)) / H(x) measures how much knowing ``y`` reduces
    uncertainty about ``x``. It is asymmetric and lies in [0, 1]: 0 means
    ``y`` tells nothing about ``x``, 1 means ``y`` fully determines ``x``.

    Bug fix: the original body was ``return theils_u(x, y)`` — unconditional
    infinite recursion (RecursionError on every call).

    :param x: sequence of categorical values (the variable being predicted)
    :param y: sequence of categorical values, same length as ``x``
    :return: float in [0, 1]; by convention 1.0 when H(x) == 0 (``x`` is
        constant or empty, so there is no uncertainty to reduce).
    """
    x = list(x)
    y = list(y)
    total = len(x)
    if total == 0:
        return 1.0
    # Marginal entropy H(x), natural log.
    x_counts = Counter(x)
    h_x = -sum((c / total) * log(c / total) for c in x_counts.values())
    if h_x == 0.0:
        # x is constant: zero uncertainty by definition.
        return 1.0
    # Conditional entropy H(x|y) = sum_xy p(x,y) * log(p(y) / p(x,y)).
    y_counts = Counter(y)
    xy_counts = Counter(zip(x, y))
    h_x_given_y = 0.0
    for (_, y_val), count in xy_counts.items():
        p_xy = count / total
        p_y = y_counts[y_val] / total
        h_x_given_y += p_xy * log(p_y / p_xy)
    return (h_x - h_x_given_y) / h_x