Пример #1
0
def applyHierarchiqueClusteringFromDataset(metadataDataset,
                                           distanceMatrixComplete,
                                           parameter=5,
                                           typeOfHierarchical=HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS,
                                           method=HIERARCHICAL_COMPLETE_LINKAGE):
    '''
    Run a hierarchical clustering over a pairwise distance matrix and tag
    every record of the dataset with the cluster it falls into.

    @param metadataDataset : iterable of mappings; each one is copied with
        dict(), so the caller's records are not modified. Record number i is
        tagged with the cluster of row i of the inner matrix.
    @param distanceMatrixComplete : matrix handed to getInnerMatrix(); the
        resulting inner matrix must be square.
    @param parameter : number of clusters when typeOfHierarchical is
        HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS, cut height when it is
        HIERARCHICAL_SIMPLE.
    @param typeOfHierarchical : one of the two constants above.
    @param method : linkage method forwarded to linkage(); the original
        author lists single, average, complete, weighted, centroid, median
        and ward.
    @return : (clusteringResults, clusters, linkageMatrix) where
        clusteringResults are the copied records with a 'CLUSTER' key (cluster
        id as a string), clusters maps each cluster id string to its number
        of records, and linkageMatrix is the value returned by linkage().
    @raise ValueError : if typeOfHierarchical is not a supported constant.
    @note : the inner matrix returned by getInnerMatrix() is modified in
        place while it is being made symmetric.
    '''
    clusteringResults = [dict(obj) for obj in metadataDataset]
    innerMatrix, _, _ = getInnerMatrix(distanceMatrixComplete)

    # squareform() only accepts a symmetric matrix. Rounding errors are
    # removed by copying the lower triangle over the upper one, and a
    # distance that is NaN on either side becomes 1 on BOTH sides (copying
    # a NaN across would leave the matrix asymmetric).
    for index, row in enumerate(innerMatrix):
        for column, upper in enumerate(row):
            lower = innerMatrix[column][index]
            if upper == lower:
                continue
            if math.isnan(upper) or math.isnan(lower):
                innerMatrix[index][column] = 1.
                innerMatrix[column][index] = 1.
            else:
                innerMatrix[index][column] = lower  # rounding error

    distArray = ssd.squareform(innerMatrix)
    linkageMatrix = linkage(distArray, method)

    if typeOfHierarchical == HIERARCHICAL_FIXED_NUMBER_OF_CLUSERS:
        cutree = hierarchy.cut_tree(linkageMatrix,
                                    n_clusters=[parameter, parameter])
    elif typeOfHierarchical == HIERARCHICAL_SIMPLE:
        cutree = hierarchy.cut_tree(linkageMatrix,
                                    height=[parameter, parameter])
    else:
        raise ValueError('Unsupported typeOfHierarchical: %r' %
                         (typeOfHierarchical,))

    # Only the first column of the cut is used: one cluster id per record.
    clusters = {}
    for position, assignment in enumerate(cutree.tolist()):
        label = str(assignment[0])
        clusteringResults[position]['CLUSTER'] = label
        clusters[label] = clusters.get(label, 0) + 1

    return clusteringResults, clusters, linkageMatrix
Пример #2
0
def reorganize_similarly(origin,dest):
    '''
    Rebuild `dest` so that its rows and columns follow the order of `origin`.

    Both arguments are decomposed with getInnerMatrix(); rows and columns are
    taken in ascending key order and matched between the two matrices by
    their label values.

    @param origin : matrix providing the row/column order; returned as is.
    @param dest : matrix whose values are re-arranged.
    @return : (origin, dest) where dest is a new matrix built with
        getCompleteMatrix() from the values of the original dest.
    @raise KeyError : if a row or column label of origin is absent from dest.
    '''
    _, rower, header = getInnerMatrix(origin)
    rower = [rower[r] for r in sorted(rower)]
    header = [header[r] for r in sorted(header)]

    innerMatrix_ref, rower_ref, header_ref = getInnerMatrix(dest)
    # label -> position of that label in dest (keys taken in sorted order)
    row_position = {rower_ref[r]: pos for pos, r in enumerate(sorted(rower_ref))}
    column_position = {header_ref[r]: pos for pos, r in enumerate(sorted(header_ref))}

    new_inner_matrix = [
        [innerMatrix_ref[row_position[rowVal]][column_position[headVal]]
         for headVal in header]
        for rowVal in rower
    ]
    dest = getCompleteMatrix(new_inner_matrix,
                             dict(enumerate(rower)),
                             dict(enumerate(header)))

    return origin, dest
Пример #3
0
def generateHeatMap(dataset,
                    destination,
                    color='RdYlGn',
                    vmin=None,
                    vmax=None,
                    organize=False,
                    title=None,
                    showvalues_text=False,
                    only_heatmap=True):
    '''
    Draw the matrix held in `dataset` as a heat map and save it to
    `destination`.

    @param dataset : matrix handed to getInnerMatrix().
    @param destination : target passed to plt.savefig().
    @param color : colormap name.
    @param vmin, vmax : colour scale bounds forwarded to pcolor().
    @param organize : when True the rows (and, for a square matrix, the
        columns) are re-ordered before drawing. A square matrix is clustered
        (average linkage on 1 - value, flat clusters cut at distance 0.2) and
        members of a cluster are placed next to each other; otherwise rows
        are sorted by the sum of their (1 - value) entries.
    @param title : figure title, only drawn when only_heatmap is False.
    @param showvalues_text : when True, show_values() is called on the plot.
    @param only_heatmap : when False a title, a caption and a colour bar are
        added around the heat map.
    @note : color = 'RdYlGn' or 'RdYlGn_r'
    @note : every open matplotlib figure is closed on exit (plt.close('all')),
        also when drawing or saving fails.
    '''

    def _ascii_labels(id_to_label):
        # Labels in ascending key order, decomposed (NFD) and reduced to
        # ASCII. Relies on the Python 2 `unicode` builtin.
        return [
            unicodedata.normalize('NFD',
                                  unicode(str(id_to_label[k]),
                                          'iso-8859-1')).encode('ascii',
                                                                'ignore')
            for k in sorted(id_to_label)
        ]

    #dataset = readCSV(source, delimiter=',')
    innerMatrix, rower, header = getInnerMatrix(dataset)
    rower = _ascii_labels(rower)
    header = _ascii_labels(header)

    nba = pd.DataFrame(innerMatrix, index=rower, columns=header, dtype=float)
    header_new = header[:]
    rower_new = rower[:]

    ##########################################################################
    if organize:
        # The values are used as similarities: 1 - value acts as a distance.
        matrix = [[1 - x for x in row] for row in nba.values]
        if len(matrix) == len(matrix[0]):
            # squareform() needs a symmetric matrix with a zero diagonal.
            # Rounding errors are removed by copying the lower triangle over
            # the upper one; a distance that is NaN on either side becomes 1
            # on BOTH sides so the matrix stays symmetric.
            for index, row in enumerate(matrix):
                for column, upper in enumerate(row):
                    if index == column:
                        matrix[index][column] = 0.
                        continue
                    lower = matrix[column][index]
                    if upper == lower:
                        continue
                    if math.isnan(upper) or math.isnan(lower):
                        matrix[index][column] = 1.
                        matrix[column][index] = 1.
                    else:
                        matrix[index][column] = lower  # rounding error

            distArray = ssd.squareform(matrix)
            linkageMatrix = linkage(distArray, 'average')
            labels = hierarchy.fcluster(linkageMatrix, 0.2, 'distance')

            # Walk all (i, j) pairs from the closest to the farthest. Inside
            # each cluster, members are listed in the order in which they
            # first show up in a pair made of two members of that cluster.
            pairs = sorted(((i, j, d)
                            for i, row in enumerate(matrix)
                            for j, d in enumerate(row)),
                           key=lambda p: p[2])
            members = {}
            seen = set()
            for i, j, _ in pairs:
                if labels[i] != labels[j]:
                    continue
                for k in (i, j):
                    if k not in seen:
                        seen.add(k)
                        members.setdefault(labels[k], []).append(k)

            # Clusters are emitted by ascending id so the layout is
            # reproducible from one run to the next.
            order = []
            for c in sorted(members):
                order.extend(members[c])

            rower_new = [rower[i] for i in order]
            header_new = [header[i] for i in order]
            nba = nba.reindex(index=rower_new, columns=header_new)
        else:
            # Not square: only the rows move, sorted by their total distance
            # (the sort is stable, so ties keep their original order).
            order = sorted(range(len(matrix)), key=lambda i: sum(matrix[i]))

            rower_new = [rower[i] for i in order]
            header_new = header[:]
            nba = nba.reindex(index=rower_new, columns=header_new)

    ##########################################################################

    fig, ax = plt.subplots()
    try:
        #fig.gca().set_position((.4, .4, .8, .8))
        masked_array = np.ma.array(nba, mask=np.isnan(nba))
        heatmap = ax.pcolor(masked_array,
                            cmap=plt.cm.get_cmap(name=color),
                            alpha=1,
                            vmax=vmax,
                            vmin=vmin)
        if showvalues_text:
            show_values(heatmap)
        fig = plt.gcf()

        if not only_heatmap:
            fig.subplots_adjust(bottom=0.2, top=0.87, right=1.)
        fig.set_size_inches(40, 40)  #40, 40

        ax.set_frame_on(False)
        # One tick in the middle of every cell.
        ax.set_yticks(np.arange(nba.shape[0]) + 0.5, minor=False)
        ax.set_xticks(np.arange(nba.shape[1]) + 0.5, minor=False)
        ax.invert_yaxis()
        ax.xaxis.tick_top()
        if not only_heatmap:
            fig.suptitle(title, fontsize=25, fontweight='bold')
        xlabels = header_new  #header[:]
        ylabels = rower_new  #rower

        ax.set_xticklabels(xlabels, minor=False, fontsize=60)
        ax.set_yticklabels(ylabels, minor=False, fontsize=60)

        plt.xticks(rotation=90)
        plt.yticks(rotation=0)
        #plt.colorbar(heatmap)
        ax.grid(False)

        ax = plt.gca()

        #ax.set_position((.1, .3, .8, .6))
        if not only_heatmap:
            plt.figtext(
                0.12,
                .1,
                'Details about :\n * the pattern \n * dossiers \n * compared MEPs',
                fontsize=40)

        # NOTE(review): tick1On / tick2On are set as plain attributes on the
        # tick objects; confirm the installed matplotlib still honours them.
        for t in ax.xaxis.get_major_ticks():
            t.tick1On = False
            t.tick2On = False
        for t in ax.yaxis.get_major_ticks():
            t.tick1On = False
            t.tick2On = False
        if not only_heatmap:
            plt.colorbar(heatmap)
        #fig.tight_layout()
        plt.tight_layout()
        plt.savefig(destination, dpi=100)
    finally:
        # Release the figure even when drawing or saving failed.
        fig.clf()
        plt.close('all')
Пример #4
0
def organize_matrix(pairwiseStatistics):
    '''
    Re-order a pattern matrix so that similar rows sit next to each other,
    and apply the same order to its reference matrix.

    A square pattern matrix is clustered (average linkage on 1 - value, flat
    clusters cut at distance 0.2) and both rows and columns are re-ordered
    cluster by cluster; otherwise only the rows move, sorted by the sum of
    their (1 - value) entries.

    @param pairwiseStatistics : sequence whose item 0 is the reference matrix
        and item 1 the pattern matrix, both in the form accepted by
        getInnerMatrix().
    @return : [new_mat_ref, new_mat_pattern], both built with
        getCompleteMatrix() using the new row and column order.
    @note : the reference matrix is aligned on its OWN labels; a label of
        the pattern matrix that is missing from it yields NaN values
        (pandas reindex).
    '''
    import pandas as pd
    import scipy.spatial.distance as ssd
    from scipy.cluster.hierarchy import linkage
    from scipy.cluster import hierarchy

    def _ascii_labels(id_to_label):
        # Labels in ascending key order, decomposed (NFD) and reduced to
        # ASCII. Relies on the Python 2 `unicode` builtin.
        return [
            unicodedata.normalize('NFD',
                                  unicode(str(id_to_label[k]),
                                          'iso-8859-1')).encode('ascii',
                                                                'ignore')
            for k in sorted(id_to_label)
        ]

    mat_ref = pairwiseStatistics[0]
    mat_pattern = pairwiseStatistics[1]

    innerMatrix, rower, header = getInnerMatrix(mat_pattern)
    rower = _ascii_labels(rower)
    header = _ascii_labels(header)

    nba = pd.DataFrame(innerMatrix, index=rower, columns=header, dtype=float)

    # The values are used as similarities: 1 - value acts as a distance.
    matrix = [[1 - x for x in row] for row in nba.values]

    if len(matrix) == len(matrix[0]):
        # squareform() needs a symmetric matrix with a zero diagonal.
        # Rounding errors are removed by copying the lower triangle over the
        # upper one; a distance that is NaN on either side becomes 1 on BOTH
        # sides so the matrix stays symmetric.
        for index, row in enumerate(matrix):
            for column, upper in enumerate(row):
                if index == column:
                    matrix[index][column] = 0.
                    continue
                lower = matrix[column][index]
                if upper == lower:
                    continue
                if math.isnan(upper) or math.isnan(lower):
                    matrix[index][column] = 1.
                    matrix[column][index] = 1.
                else:
                    matrix[index][column] = lower  # rounding error

        distArray = ssd.squareform(matrix)
        linkageMatrix = linkage(distArray, 'average')
        labels = hierarchy.fcluster(linkageMatrix, 0.2, 'distance')

        # Walk all (i, j) pairs from the closest to the farthest. Inside each
        # cluster, members are listed in the order in which they first show
        # up in a pair made of two members of that cluster.
        pairs = sorted(((i, j, d)
                        for i, row in enumerate(matrix)
                        for j, d in enumerate(row)),
                       key=lambda p: p[2])
        members = {}
        seen = set()
        for i, j, _ in pairs:
            if labels[i] != labels[j]:
                continue
            for k in (i, j):
                if k not in seen:
                    seen.add(k)
                    members.setdefault(labels[k], []).append(k)

        # Clusters are emitted by ascending id so the result is reproducible
        # from one run to the next.
        order = []
        for c in sorted(members):
            order.extend(members[c])

        rower_new = [rower[i] for i in order]
        header_new = [header[i] for i in order]
    else:
        # Not square: only the rows move, sorted by their total distance
        # (the sort is stable, so ties keep their original order).
        order = sorted(range(len(matrix)), key=lambda i: sum(matrix[i]))

        rower_new = [rower[i] for i in order]
        header_new = header[:]

    nba = nba.reindex(index=rower_new, columns=header_new)
    new_inner_matrix = nba.values.tolist()

    new_rower_map = dict(enumerate(nba.index))
    new_header_map = dict(enumerate(nba.columns))
    new_mat_pattern = getCompleteMatrix(new_inner_matrix, new_rower_map,
                                        new_header_map)

    # The reference matrix is labelled with its own row/column names and
    # then aligned on the order computed for the pattern matrix.
    innerMatrix2, rower2, header2 = getInnerMatrix(mat_ref)
    rower2 = _ascii_labels(rower2)
    header2 = _ascii_labels(header2)
    nba2 = pd.DataFrame(innerMatrix2,
                        index=rower2,
                        columns=header2,
                        dtype=float)

    nba2 = nba2.reindex(index=rower_new, columns=header_new)

    new_inner_matrix2 = nba2.values.tolist()
    new_mat_ref = getCompleteMatrix(new_inner_matrix2, new_rower_map,
                                    new_header_map)
    return [new_mat_ref, new_mat_pattern]