def data_correaltion(data_path="/Users/vaishnaviv/PycharmProjects/Assignment3_NeuralNetworks/BSOM_DataSet_for_HW3.csv"):
    """Show Spearman and Pearson correlation plots for the summary score columns.

    The CSV at ``data_path`` is loaded, the summary score columns plus
    ``LEVEL`` are selected, and ``LEVEL`` is replaced by its integer category
    codes.  Two biokit correlation plots are then shown one after the other:
    Spearman first, Pearson second.  Along the way the function prints the
    full list of raw column names, the selected data, and the Spearman
    correlations against ``LEVEL`` sorted in ascending order.

    :param data_path: path of the CSV file to read.  The default is the
        location that used to be hard-coded in the function body.
    """
    selected_columns = [
        'all_mcqs_avg_n20',
        'all_NBME_avg_n4',
        'CBSE_01',
        'CBSE_02',
        'SA_NBME',
        'STEP_1',
        'LEVEL',
    ]

    def show_corrplot(correlations):
        # Same rendering options for both correlation methods.
        plot = corrplot.Corrplot(correlations)
        plot.plot(colorbar=False, method="square", shrink=.9, rotation=45)
        plt.show()

    raw = pd.read_csv(data_path)
    # Every available column name is printed so that other candidates for
    # ``selected_columns`` can be looked up from the output.
    print(raw.columns.tolist())

    # Work on an independent copy: assigning a column to a selection taken
    # from ``raw`` triggers pandas' SettingWithCopyWarning.
    data = raw[selected_columns].copy()
    # LEVEL is categorical; correlations need numbers, so use category codes.
    data['LEVEL'] = data['LEVEL'].astype("category").cat.codes
    print(data)

    # Spearman correlation
    spearman = data.corr(method="spearman")
    print(spearman['LEVEL'].sort_values())
    show_corrplot(spearman)

    # Pearson correlation (the pandas default)
    show_corrplot(data.corr())
def myinstance():
    """Build a ``Corrplot`` from ten columns of random data labelled A to J.

    Column ``k`` holds ten uniform random numbers shifted by ``ord(k) - 65``.

    :return: a :class:`corrplot.Corrplot` constructed from the raw data frame.
    """
    try:
        letters = string.uppercase[0:10]
    except AttributeError:
        # Python 3: ``string.uppercase`` does not exist any more.  Only this
        # specific failure is expected, so nothing else is swallowed.
        letters = string.ascii_uppercase[0:10]
    df = pd.DataFrame({k: np.random.random(10) + ord(k) - 65 for k in letters})
    # Also run the constructor on an already-computed correlation matrix;
    # the resulting object is not needed afterwards.
    corrplot.Corrplot(df.corr())
    return corrplot.Corrplot(df)
def test_correlation(myinstance):
    """Check that Corrplot holds the same matrix for raw data and for its correlation.

    ``myinstance`` is accepted but not used by the body.
    """
    rows = [[1, 2, 3, 4], [4, 5, 1, 2]]

    from_raw = corrplot.Corrplot(pd.DataFrame(rows))
    from_matrix = corrplot.Corrplot(pd.DataFrame(rows).corr())

    # For ``from_raw`` the correlation is computed by Corrplot itself.
    frames_agree = (from_raw.df == from_matrix.df).all().all()
    assert frames_agree
def heatmap_plot(df, x, path):
    """Draw a Kendall correlation plot of ``df`` and store it at ``path``.

    The columns listed in ``x`` are deleted, every remaining column of dtype
    ``object`` is label-encoded, and the Kendall correlation matrix of the
    result is drawn with biokit's ``Corrplot``.

    Note that ``df`` is modified in place (columns are deleted and
    re-encoded on the frame that was passed in).

    :param df: the dataframe
    :param x: list of columns to be dropped
    :param path: the path to store the heatmap plot; nothing is written when
        it is empty or ``None``
    """
    # Requires biokit (``pip install biokit``) and scikit-learn.
    import matplotlib.pyplot as plt
    from biokit.viz import corrplot
    from sklearn import preprocessing

    # Drop irrelevant columns.
    for column in x:
        del df[column]

    encoder = preprocessing.LabelEncoder()
    for column in df.columns:
        if df[column].dtypes == 'object':
            df[column] = encoder.fit_transform(df[column])

    cor = df.corr(method='kendall')  # {'pearson', 'kendall', 'spearman'}
    c = corrplot.Corrplot(cor)
    c.plot(colorbar=True, method='square', shrink=.99, rotation=90)

    # Persist the figure that was just drawn (the current pyplot figure).
    if path:
        plt.savefig(path)
def metric_corr(results: list):
    """Show a correlation plot of the metric values extracted from ``results``.

    ``results`` is passed to ``get_all_metric_values`` and whatever that
    returns is handed to biokit's ``Corrplot``.
    """
    # ref: https://nbviewer.jupyter.org/github/biokit/biokit/blob/master/notebooks/viz/corrplot.ipynb
    metric_values = get_all_metric_values(results)

    # Alternative rendering: method="text", colorbar=False, fontsize=12, rotation=45
    plot_options = {
        "method": "square",
        "colorbar": True,
        "shrink": 0.9,
        "rotation": 45,
    }
    corrplot.Corrplot(metric_values).plot(**plot_options)
    plt.show()
def generate_correlation(data):
    """Plot the matrix of pairwise correlations between the measures in ``data``.

    For every ordered pair of measure keys, ``get_correlations`` is called on
    the two corresponding entries of ``data``.  The resulting square matrix is
    printed row by row and drawn with biokit's ``Corrplot`` (ellipses in the
    lower triangle, ``hsv`` colour map), labelled with the display names of
    the measures.

    :param data: object that can be indexed by each of the measure keys
        (``'dg'``, ``'stg'``, ``'sp'``, ...).
    """
    measures = [
        'dg', 'stg', 'sp', 'sp_w', 'pr', 'pr_w', 'accs', 'gaccs', 'sym', 'at'
    ]
    # Display labels, in the same order as ``measures``.
    names = [
        'dg', 'stg', 'sp', 'sp_w', 'pr', 'pr_w', 'access', 'gAccess', 'sym',
        'absT'
    ]

    for index, measure in enumerate(measures):
        print(index, measure)

    # Look every measure up once, then correlate all pairs.
    values = {measure: data[measure] for measure in measures}
    matrix_correlations = [
        [get_correlations(values[row], values[column]) for column in measures]
        for row in measures
    ]

    for row in matrix_correlations:
        print(row)

    # Identical index and column labels keep the frame square and
    # self-consistent, so Corrplot uses it as-is instead of correlating it
    # again.
    df = pd.DataFrame(matrix_correlations, index=names, columns=names)

    c = corrplot.Corrplot(df)
    c.plot(lower='ellipse', cmap='hsv')  # other colour maps tried: gist_rainbow, jet

    # Titles previously intended for a statsmodels ``plot_corr`` rendering
    # (NOTE(review): they call the coefficient Spearman - confirm against
    # ``get_correlations``):
    #   'Matrix of Spearman correlation for CSTNews'
    #   'Matrix of Spearman correlation for DUC-2002'
    #   'Matrix of Spearman correlation for DUC-2004'
    plt.show()
def plot_correlations(blups, traits, outprefix):
    """Save a correlation plot of the selected traits to ``<outprefix>.corrplot.png``.

    :param blups: table indexed by trait name; only ``traits`` are used.
    :param traits: names of the columns to include in the plot.
    :param outprefix: prefix of the output file name.
    """
    # Restrict the table to the requested traits.
    selected = blups[traits]
    target = outprefix + ".corrplot.png"

    # The figure grows with the number of traits.
    n_traits = len(traits)
    figure = plt.figure(figsize=(n_traits * 1.25, n_traits))
    axes = figure.add_subplot(111)

    # Ellipses below the diagonal, numbers above it.
    corrplot.Corrplot(selected).plot(ax=axes, lower='ellipse', upper='number')
    figure.savefig(target, dpi=100)
def hinton(df, fig=1, shrink=2, method='square', bgcolor='grey',
           cmap='gray_r', binarise_color=True):
    """Hinton plot (simplified version of a correlation plot)

    :param df: the input data as a dataframe or list of items (list, array).
        See :class:`~biokit.viz.corrplot.Corrplot` for details.
    :param fig: the figure in which to plot the data
    :param shrink: factor to increase/decrease the size of the symbols
    :param method: the type of symbol drawn at each coordinate (defaults to
        square). See :class:`~biokit.viz.corrplot.Corrplot` for more details.
    :param bgcolor: colour used for both the background and the labels
        (grey by default)
    :param cmap: colour map; a gray one is used by default
    :param binarise_color: use only two colours, one for positive values and
        one for negative values.

    .. plot::
        :include-source:
        :width: 80%

        from biokit.viz import hinton
        df = np.random.rand(20, 20) - 0.5
        hinton(df)

    .. note:: Idea taken from a matplotlib recipe
        http://matplotlib.org/examples/specialty_plots/hinton_demo.html
        but solely using the implementation within
        :class:`~biokit.viz.corrplot.Corrplot`

    .. seealso:: :class:`biokit.viz.corrplot.Corrplot`

    .. note:: Values must be between -1 and 1. No sanity check is performed.

    """
    from biokit.viz import corrplot

    plot_options = {
        'colorbar': False,
        'cmap': cmap,
        'fig': fig,
        'method': method,
        'facecolor': bgcolor,
        'shrink': shrink,
        'label_color': bgcolor,
        'binarise_color': binarise_color,
    }
    corrplot.Corrplot(df).plot(**plot_options)
def correlation_matrix(df, features=FEATURES, output_file=None):
    """Draw the Spearman correlation matrix of ``features`` and save or show it.

    Tick labels are the feature names with ``"sonar_"`` removed, underscores
    turned into spaces, and title-cased.  All four axis spines are hidden.

    :param df: data frame containing the ``features`` columns.
    :param features: names of the columns to correlate.
    :param output_file: when given, the figure is saved there; otherwise the
        figure is shown.
    """
    spearman = df[features].corr(method="spearman")
    corrplot.Corrplot(spearman).plot(
        grid=False,
        method='text',
        colorbar=True,
        lower='ellipse',
        upper='text',
    )

    figure = plt.gcf()
    ax = plt.gca()

    labels = []
    for feature in features:
        readable = feature.replace("sonar_", "").replace("_", " ")
        labels.append(readable.title())
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)

    # Remove the frame around the plot.
    for side in ('left', 'top', 'right', 'bottom'):
        ax.spines[side].set_visible(False)

    if not output_file:
        figure.show()
        return

    figure.tight_layout()
    figure.savefig(output_file)
"""
Corrplot example
==================

"""
import string

import numpy as np
import pandas as pd
from biokit.viz import corrplot

# Create some random data: 15 columns labelled A..O, ten samples each,
# with column k shifted by ord(k) - 65.
letters = string.ascii_uppercase[0:15]
df = pd.DataFrame({k: np.random.random(10) + ord(k) - 65 for k in letters})
df = df.corr()

# If the input is not a square matrix, or its indices do not match its
# column names, Corrplot computes the correlation on the fly.
c = corrplot.Corrplot(df)
c.plot(colorbar=False, method='square', shrink=.9, rotation=45)
def setup_class(klass):
    """Attach a ``Corrplot`` built from random data to ``klass`` as ``klass.s``.

    The data has ten columns labelled A to J with ten random samples each.
    """
    columns = {}
    for letter in string.ascii_uppercase[0:10]:
        columns[letter] = np.random.random(10) + ord(letter) - 65
    frame = pd.DataFrame(columns)

    # Built first from the precomputed correlation matrix and then from the
    # raw data; the second instance is the one left on ``klass.s``.
    for source in (frame.corr(), frame):
        klass.s = corrplot.Corrplot(source)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from biokit.viz import corrplot

# Load the data and compute the (Pearson) correlation matrix.
dataframe = pd.read_csv("preprocessed_data.csv")
co_relation = dataframe.corr()

# Correlation plot with pie symbols.
cp = corrplot.Corrplot(co_relation)
cp.plot(method='pie', shrink=.9, grid=False)
plt.savefig('correlation.png')

# Pair plot for all attributes.
# pyplot is used directly: the ``sns.plt`` alias is not available in current
# seaborn releases, so ``sns.plt.savefig`` / ``sns.plt.clf`` would raise
# AttributeError.
sns.pairplot(dataframe)
plt.savefig('data_distribution.png')
plt.clf()

# Heatmap of the same correlation matrix, drawn on the cleared figure.
sns.heatmap(co_relation, linewidths=.5, cmap="YlGnBu")
plt.savefig('correlation_heatmap.png')
'PartZeta', # included in dimensionless ratio of zetas 'PartIEP', # included in dimensionless ratio of pH to IEP 'PartDiam', # included in dimensionless aspect ratio 'CollecDiam', # included in dimensionless aspect ratio 'CollecZeta',# included in dimensionless aspect ratio 'IonStr', # included in Debye Length 'SaltType',# included in Debye Length 'pH'# included in dimensionless ratio of pH to IEP ],1) # print list(data) # print out the remaining data field headers # Make sure to install biokit dependencies with requirements.txt # https://pypi.python.org/pypi/biokit/0.0.5 c = corrplot.Corrplot(data) c.plot(upper='circle',fontsize = 10) # assign the remaining data to the training data set. trainingData = data # Store the training data and target data as a matrices for import into ML. trainingDataMatrix = trainingData.as_matrix() # all numbers, no headers targetDataRPShapeMatrix = targetDataRPShape targetDataRFMatrix = targetDataRF.as_matrix() # all numbers, no headers # Get a list of the trainingData features remaining. This is used later for plotting etc. trainingDataNames = list(trainingData) # print trainingDataNames
import pandas as pd


def _county_total(county, column):
    """Sum ``column`` of ``co`` over the rows belonging to ``county``.

    A row belongs to the county when its ``CNTY_CITY_LOC`` lies in
    ``[county * 100, county * 100 + 100)``.
    """
    first_code = int(county) * 100
    in_county = ((first_code <= co['CNTY_CITY_LOC'])
                 & (co['CNTY_CITY_LOC'] < first_code + 100))
    return co[in_county][column].sum()


def _numeric_where_possible(column):
    """Coerce an object column to numbers, leaving it alone if nothing parses.

    Stands in for ``DataFrame.convert_objects(convert_numeric=True)``, which
    is no longer part of pandas.  Non-object columns are returned unchanged.
    """
    if column.dtype != object:
        return column
    converted = pd.to_numeric(column, errors='coerce')
    return column if converted.isna().all() else converted


# Injury column -> travel-mode column used as the denominator of its rate.
injuries = {
    'NUMBER_KILLED': 'Total',
    'COUNT_PED_KILLED': 'Walked',
    'COUNT_PED_INJURED': 'Walked',
    'COUNT_BICYCLIST_KILLED': 'Bicycle',
    'COUNT_BICYCLIST_INJURED': 'Bicycle',
    'COUNT_MC_KILLED': 'Taxicab, motorcycle, or other means',
    'COUNT_MC_INJURED': 'Taxicab, motorcycle, or other means',
}

# Per-county totals of every injury column.
for injury in list(injuries):
    df[injury] = df.index.map(
        lambda county: _county_total(county, injury))

# Injuries per 100 travellers of the matching mode.
for injury, mode in injuries.items():
    df[injury + ' Rate'] = df[injury].apply(int) / df[mode].apply(int) * 100

# The sorted view is not stored; it is only visible when this statement is
# evaluated in an interactive session.
df[['NAME', 'Bicycle', 'Bicycle Rate', 'COUNT_BICYCLIST_KILLED',
    'COUNT_BICYCLIST_KILLED Rate']].sort_values(['Bicycle Rate'],
                                                ascending=False)

dft = df.apply(_numeric_where_possible)

# Alternative column choice: list(set(injuries.values())) + list(injuries)
cor = list(modes) + list(injuries)
b = [a + ' Rate' for a in cor]

c = corrplot.Corrplot(dft[b])
matplotlib.rcParams.update({'font.size': 8})
c.plot()
pyplot.savefig('/Users/david/Desktop/fig.svg')
# pyplot.show()
# df.to_csv('modes.csv')

pyplot.scatter(df['Bicycle Rate'], df['COUNT_BICYCLIST_KILLED Rate'])
pyplot.show()

# Dump the per-county totals.  The total is printed as well; computing it
# without using the result has no effect.
for county in df.index:
    print(df['NAME'][county])
    for injury in injuries:
        print(injury)
        print(_county_total(county, injury))
correlation_matrix = np.load('correlation-matrix-nohbond-poster-new.npy')

# Round every coefficient to two decimals for display.
correlation_matrix = np.round(correlation_matrix, 2)
# Values that round to zero are set to +0.0 explicitly so that no "-0.0"
# remains in the matrix.  This has to be an assignment; a comparison
# (``== 0.0``) in this place would have no effect.
correlation_matrix[np.abs(correlation_matrix) < 0.005] = 0.0

fig, ax = plt.subplots(figsize=(16, 12))

# Alternative rendering with seaborn, kept for reference:
# ax = sns.heatmap(correlation_matrix,
#                  xticklabels=variables,
#                  yticklabels=variables,
#                  annot=True,
#                  cmap='coolwarm',
#                  vmin=-1.0,
#                  vmax=1.0,
#                  center=0.0,
#                  square=True,
#                  annot_kws={'size': 16, 'weight': 'semibold'},
#                  cbar_kws={'ticks': [-1, -0.5, 0, 0.5, 1],
#                            'shrink': 0.68,
#                            'label': 'Pearson correlation coefficient'})

c = corrplot.Corrplot(correlation_matrix)
c.order(inplace=True)
c.plot(fig=fig,
       grid=True,
       rotation=30,
       upper='circle',
       lower=None,
       shrink=0.9,
       facecolor='white',
       colorbar=True,
       label_color='black',
       fontsize='large',
       edgecolor='black',
       method='circle',
       cmap='coolwarm',
       ax=ax)
def plot_corr(df_corr, method='square'):
    """Draw ``df_corr`` with biokit's ``Corrplot``.

    :param df_corr: data passed straight to ``corrplot.Corrplot``.
    :param method: symbol type forwarded to ``Corrplot.plot``
        (``'square'`` by default).
    """
    plot_options = {'method': method, 'shrink': .9, 'rotation': 45}
    corrplot.Corrplot(df_corr).plot(**plot_options)