def applyHierarchiqueClusteringFromDataset(metadataDataset, distanceMatrixComplete, parameter=5,
                                           typeOfHierarchical=HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS,
                                           method=HIERARCHICAL_COMPLETE_LINKAGE):
    """Run agglomerative (hierarchical) clustering on a pre-computed distance matrix.

    Parameters
    ----------
    metadataDataset : list of dict-like rows; copied, and a 'CLUSTER' key is added
        to each copy with the cluster label assigned to that row.
    distanceMatrixComplete : complete matrix structure accepted by getInnerMatrix().
    parameter : number of clusters (HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS mode) or
        the dendrogram cut height (HIERARCHICAL_SIMPLE mode).
    typeOfHierarchical : which cut strategy to apply to the dendrogram.
    method : scipy linkage method name -- one of 'single', 'average', 'complete',
        'weighted', 'centroid', 'median', 'ward'.

    Returns
    -------
    (clusteringResults, clusters, linkageMatrix) : the annotated copy of
    metadataDataset, a dict mapping cluster label (str) -> member count, and the
    scipy linkage matrix.
    """
    clusteringResults = [dict(obj) for obj in metadataDataset]
    innerMatrix, mapRowsID, mapColumnsID = getInnerMatrix(distanceMatrixComplete)
    # Force symmetry before squareform(): mirror the transposed entry into any cell
    # that disagrees with it (upstream rounding errors can desynchronise the two
    # halves), and replace NaN entries with the maximal distance 1.0.
    for index, row in enumerate(innerMatrix):
        for column, val in enumerate(row):
            if innerMatrix[column][index] != innerMatrix[index][column]:
                if math.isnan(innerMatrix[index][column]):
                    innerMatrix[index][column] = 1.
                else:
                    innerMatrix[index][column] = innerMatrix[column][index]
    # Condensed distance vector required by linkage().
    distArray = ssd.squareform(innerMatrix)
    linkageMatrix = linkage(distArray, method)
    if typeOfHierarchical == HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS:
        cutree = hierarchy.cut_tree(linkageMatrix, n_clusters=[parameter, parameter])
    elif typeOfHierarchical == HIERARCHICAL_SIMPLE:
        cutree = hierarchy.cut_tree(linkageMatrix, height=[parameter, parameter])
    # cut_tree returns one column per requested cut; both cuts are identical here,
    # so keep the first column only.
    cuttreeclusters = [(k, v[0]) for k, v in enumerate(cutree.tolist())]
    clusters = {}
    for rowIndex, clusterLabel in cuttreeclusters:
        label = str(clusterLabel)
        clusteringResults[rowIndex]['CLUSTER'] = label
        if label not in clusters:  # dict.has_key() no longer exists in Python 3
            clusters[label] = 0
        clusters[label] += 1
    return clusteringResults, clusters, linkageMatrix
def reorganize_similarly(origin, dest):
    """Reorder the rows/columns of *dest* to match the row/column order of *origin*.

    Both arguments are complete matrix structures accepted by getInnerMatrix().
    A new *dest* matrix is built whose cell (row, col) is looked up in the
    original *dest* by label, so every row/column label of *origin* must also
    exist in *dest* (a missing label raises KeyError).

    Returns (origin, reordered_dest); *origin* is returned unchanged.
    """
    # Ordered label lists of the reference (origin) matrix.
    _, rower, header = getInnerMatrix(origin)
    rower = [rower[r] for r in sorted(rower)]
    header = [header[r] for r in sorted(header)]
    # Label -> position lookups for the matrix being reordered.
    innerMatrix_ref, rower_ref, header_ref = getInnerMatrix(dest)
    rower_ref = [rower_ref[r] for r in sorted(rower_ref)]
    header_ref = [header_ref[r] for r in sorted(header_ref)]
    rower_ref_inv = {v: i for i, v in enumerate(rower_ref)}
    header_ref_inv = {v: i for i, v in enumerate(header_ref)}
    # Rebuild dest's inner matrix following origin's label order.
    new_inner_matrix = [[innerMatrix_ref[rower_ref_inv[rowVal]][header_ref_inv[headVal]]
                         for headVal in header]
                        for rowVal in rower]
    dest = getCompleteMatrix(new_inner_matrix,
                             dict(enumerate(rower)),
                             dict(enumerate(header)))
    return origin, dest
def generateHeatMap(dataset, destination, color='RdYlGn', vmin=None, vmax=None,
                    organize=False, title=None, showvalues_text=False, only_heatmap=True):
    """Render *dataset* (a complete similarity matrix) as a heatmap saved to *destination*.

    @param dataset: complete matrix structure accepted by getInnerMatrix().
    @param destination: output image path passed to plt.savefig().
    @param color: matplotlib colormap name, e.g. 'RdYlGn' or 'RdYlGn_r'.
    @param vmin/vmax: color scale bounds forwarded to ax.pcolor().
    @param organize: if True, reorder rows (and columns when the matrix is
        square) by hierarchical clustering of the distances (1 - similarity).
    @param title: figure title, only shown when only_heatmap is False.
    @param showvalues_text: if True, overlay cell values via show_values().
    @param only_heatmap: if False, also draw title, legend text and colorbar.
    """
    def _ascii(value):
        # NFD-decompose then drop non-ASCII characters so matplotlib can render
        # every label (Python-3-safe replacement of the old unicode() round-trip).
        return unicodedata.normalize('NFD', str(value)).encode('ascii', 'ignore').decode('ascii')

    innerMatrix, rower, header = getInnerMatrix(dataset)
    rower = [_ascii(rower[k]) for k in sorted(rower)]
    header = [_ascii(header[k]) for k in sorted(header)]
    nba = pd.DataFrame(innerMatrix, index=rower, columns=header, dtype=float)
    header_new = header[:]
    rower_new = rower[:]
    ##########################################################################
    if organize:
        # Work on distances: similarity 1.0 -> distance 0.0.
        matrix = [[1 - x for x in row] for row in nba.values.tolist()]
        isSquare = len(matrix) == len(matrix[0])  # '<>' is invalid in Python 3
        if isSquare:
            # Symmetrize the distance matrix (mirror disagreeing cells, NaN -> 1.0,
            # zero diagonal). NOTE: the original compared against innerMatrix
            # (similarities) here, which never matched the distances.
            for index, row in enumerate(matrix):
                for column, val in enumerate(row):
                    if matrix[column][index] != matrix[index][column]:
                        if math.isnan(matrix[index][column]):
                            matrix[index][column] = 1.
                        else:
                            matrix[index][column] = matrix[column][index]
                    if index == column:
                        matrix[index][column] = 0.
            distArray = ssd.squareform(matrix)
            linkageMatrix = linkage(distArray, 'average')
            # Flat clusters at distance threshold 0.2, sorted by cluster id.
            cuttreeclusters = sorted(
                enumerate(hierarchy.fcluster(linkageMatrix, 0.2, 'distance')),
                key=lambda x: x[1])
            clusters = {}
            for i, c in cuttreeclusters:
                clusters.setdefault(c, []).append(i)
            # Within each cluster, emit members in order of increasing pairwise
            # distance so the most similar rows end up adjacent.
            pairs = []
            for i, row in enumerate(matrix):
                for j, d in enumerate(row):
                    pairs.append((i, j, d))
            pairs.sort(key=lambda x: x[2])
            visited = []
            cuttreeclustersSorted = []
            for c, c_elems in clusters.items():
                for i, j, d in pairs:
                    if i in c_elems and j in c_elems:
                        if i not in visited:
                            visited.append(i)
                            cuttreeclustersSorted.append((i, c))
                        if j not in visited:
                            visited.append(j)
                            cuttreeclustersSorted.append((j, c))
            rower_new = [rower[t[0]] for t in cuttreeclustersSorted]
            header_new = [header[t[0]] for t in cuttreeclustersSorted]
            nba = nba.reindex(index=rower_new, columns=header_new)
        else:
            # Non-square matrix: only reorder rows, by ascending total distance.
            # (The original appended the same (i, sum(row)) pair once per column
            # and deduplicated afterwards -- one entry per row is equivalent.)
            row_order = sorted(((i, sum(row)) for i, row in enumerate(matrix)),
                               key=lambda x: x[1])
            rower_new = [rower[i] for i, _ in row_order]
            header_new = header[:]
            nba = nba.reindex(index=rower_new, columns=header_new)
    ##########################################################################
    fig, ax = plt.subplots()
    # Mask NaNs so they stay blank instead of being colored.
    masked_array = np.ma.array(nba, mask=np.isnan(nba))
    heatmap = ax.pcolor(masked_array, cmap=plt.cm.get_cmap(name=color),
                        alpha=1, vmax=vmax, vmin=vmin)
    if showvalues_text:
        show_values(heatmap)
    fig = plt.gcf()
    if not only_heatmap:
        fig.subplots_adjust(bottom=0.2, top=0.87, right=1.)
    fig.set_size_inches(40, 40)
    ax.set_frame_on(False)
    # Center the tick labels on each cell.
    ax.set_yticks(np.arange(nba.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(nba.shape[1]) + 0.5, minor=False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    if not only_heatmap:
        fig.suptitle(title, fontsize=25, fontweight='bold')
    ax.set_xticklabels(header_new, minor=False, fontsize=60)
    ax.set_yticklabels(rower_new, minor=False, fontsize=60)
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    ax.grid(False)
    ax = plt.gca()
    if not only_heatmap:
        plt.figtext(
            0.12, .1,
            'Details about :\n * the pattern \n * dossiers \n * compared MEPs',
            fontsize=40)
    # Hide the tick marks on both axes.
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    if not only_heatmap:
        plt.colorbar(heatmap)
    plt.tight_layout()
    plt.savefig(destination, dpi=100)
    # Release every figure so repeated calls do not accumulate memory.
    fig.clf()
    plt.clf()
    plt.gcf().clear()
    plt.cla()
    plt.close('all')
def organize_matrix(pairwiseStatistics):
    """Cluster-order the pattern matrix and apply the same order to the reference.

    @param pairwiseStatistics: [mat_ref, mat_pattern], two complete matrix
        structures accepted by getInnerMatrix(). The pattern matrix is clustered
        (hierarchical, average linkage on 1 - similarity) to compute a row/column
        order; the reference matrix is then reindexed with that same order.
    @return: [new_mat_ref, new_mat_pattern], both reordered.
    """
    import pandas as pd
    import scipy.spatial.distance as ssd
    from scipy.cluster.hierarchy import linkage
    from scipy.cluster import hierarchy

    def _ascii(value):
        # NFD-decompose then drop non-ASCII characters (Python-3-safe
        # replacement of the old unicode() round-trip).
        return unicodedata.normalize('NFD', str(value)).encode('ascii', 'ignore').decode('ascii')

    mat_pattern = pairwiseStatistics[1]
    mat_ref = pairwiseStatistics[0]
    innerMatrix, rower, header = getInnerMatrix(mat_pattern)
    rower = [_ascii(rower[k]) for k in sorted(rower)]
    header = [_ascii(header[k]) for k in sorted(header)]
    nba = pd.DataFrame(innerMatrix, index=rower, columns=header, dtype=float)
    header_new = header[:]
    rower_new = rower[:]
    # Work on distances: similarity 1.0 -> distance 0.0.
    matrix = [[1 - x for x in row] for row in nba.values.tolist()]
    isSquare = len(matrix) == len(matrix[0])  # '<>' is invalid in Python 3
    if isSquare:
        # Symmetrize (mirror disagreeing cells, NaN -> 1.0, zero diagonal).
        # NOTE: the original compared against innerMatrix (similarities) here,
        # which never matched the distances.
        for index, row in enumerate(matrix):
            for column, val in enumerate(row):
                if matrix[column][index] != matrix[index][column]:
                    if math.isnan(matrix[index][column]):
                        matrix[index][column] = 1.
                    else:
                        matrix[index][column] = matrix[column][index]
                if index == column:
                    matrix[index][column] = 0.
        distArray = ssd.squareform(matrix)
        linkageMatrix = linkage(distArray, 'average')
        # Flat clusters at distance threshold 0.2, sorted by cluster id.
        cuttreeclusters = sorted(
            enumerate(hierarchy.fcluster(linkageMatrix, 0.2, 'distance')),
            key=lambda x: x[1])
        clusters = {}
        for i, c in cuttreeclusters:
            clusters.setdefault(c, []).append(i)
        # Within each cluster, emit members in order of increasing pairwise
        # distance so the most similar rows end up adjacent.
        pairs = []
        for i, row in enumerate(matrix):
            for j, d in enumerate(row):
                pairs.append((i, j, d))
        pairs.sort(key=lambda x: x[2])
        visited = []
        cuttreeclustersSorted = []
        for c, c_elems in clusters.items():
            for i, j, d in pairs:
                if i in c_elems and j in c_elems:
                    if i not in visited:
                        visited.append(i)
                        cuttreeclustersSorted.append((i, c))
                    if j not in visited:
                        visited.append(j)
                        cuttreeclustersSorted.append((j, c))
        rower_new = [rower[t[0]] for t in cuttreeclustersSorted]
        header_new = [header[t[0]] for t in cuttreeclustersSorted]
        nba = nba.reindex(index=rower_new, columns=header_new)
    else:
        # Non-square matrix: only reorder rows, by ascending total distance.
        row_order = sorted(((i, sum(row)) for i, row in enumerate(matrix)),
                           key=lambda x: x[1])
        rower_new = [rower[i] for i, _ in row_order]
        header_new = header[:]
        nba = nba.reindex(index=rower_new, columns=header_new)
    # Rebuild the complete pattern matrix in the new order.
    new_inner_matrix = nba.values.tolist()
    new_rower = list(nba.index)
    new_header = list(nba.columns)
    new_rower_map = dict(enumerate(new_rower))
    new_header_map = dict(enumerate(new_header))
    new_mat_pattern = getCompleteMatrix(new_inner_matrix, new_rower_map, new_header_map)
    # Apply the same ordering to the reference matrix.
    innerMatrix2, rower2, header2 = getInnerMatrix(mat_ref)
    # BUG FIX: labels must come from rower2/header2 (the reference matrix), not
    # from the pattern matrix's rower/header as in the original copy-paste.
    rower2 = [_ascii(rower2[k]) for k in sorted(rower2)]
    header2 = [_ascii(header2[k]) for k in sorted(header2)]
    nba2 = pd.DataFrame(innerMatrix2, index=rower2, columns=header2, dtype=float)
    nba2 = nba2.reindex(index=rower_new, columns=header_new)
    new_inner_matrix2 = nba2.values.tolist()
    new_mat_ref = getCompleteMatrix(new_inner_matrix2, new_rower_map, new_header_map)
    return [new_mat_ref, new_mat_pattern]