Exemplo n.º 1
0
def KM(domain, n_clusters):
    if domain == 'DietType':
        X = getDietTypeTFArray4DC()
    elif domain == 'ActType':
        X = getActTypeTFArray4DC()
    X = utilise.normArray(X)
    reduced_data = PCA(n_components=2).fit_transform(X)

    Inertia = []
    Labels = []
    
    # for n_clusters in range_n_clusters:
    for j in range(300):

        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)

        kmeans.fit(reduced_data)
        inertia = kmeans.inertia_
        Inertia.append(inertia)

        labels = kmeans.labels_
        Labels.append(labels)

    min = np.min(Inertia)
    for i in range(len(Inertia)):
        if Inertia[i] == min:
            inertia = Inertia[i] 
            labels = Labels[i] 
    print domain,n_clusters,inertia, labels
Exemplo n.º 2
0
def KM_nonslp(domain, n_clusters):
    if domain == 'DietType':
        X = dataGen4DietAct.genDietTypeTFArrayWithSlp()
    else:
        X = dataGen4DietAct.genActTypeTFArrayWithSlp()

    X = utilise.normArray(X)

    Inertia = []
    Labels = []

    for j in range(300):

        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)

        kmeans.fit(X)

        inertia = kmeans.inertia_
        Inertia.append(inertia)

        labels = kmeans.labels_
        Labels.append(labels)

    min = np.min(Inertia)

    for i in range(len(Inertia)):

        if Inertia[i] == min:
            inertia = Inertia[i]
            labels = Labels[i]

    print domain, n_clusters, inertia, labels
def sihouetteScoreArtificialData(metric):
    df, cols = artificialDataGenerator.artificialData()
    for domain in Domain:
        print df.columns
        if domain == 'DietType':
            df_temp = df[[
                'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP',
                'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP',
                'vegetables'
            ]]
        else:
            df_temp = df[[
                'entertainmentRelax', 'others', 'social', 'sport',
                'transportation1', 'transportation2', 'transportation3',
                'workStudy'
            ]]
        X = df_temp.as_matrix()
        X = utilise.normArray(X)

        range_n_clusters = [2, 3, 4, 5, 6]

        for n_clusters in range_n_clusters:
            clusterer = KMeans(n_clusters=n_clusters, n_init=300)
            clusterer.fit(X)
            cluster_labels = clusterer.labels_

            # The silhouette_score gives the average value for all the samples.
            # This gives a perspective into the density and separation of the formed clusters
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters,
                  'The average silhouette_score is :', silhouette_avg)
def sihouetteScore(metric):
    """Print average silhouette scores for K-means on PCA-reduced data.

    For every entry of the module-level ``Domain`` list the TF or TF-IDF
    array (chosen by ``metric``) is loaded from dataGen4DietAct,
    normalized with utilise.normArray() and projected onto two principal
    components.  K-means is fitted on the projection for 2 to 6 clusters
    and the average silhouette score is printed.

    metric -- 'TF' or 'TFIDF'.

    Returns nothing.
    """
    # Name of the dataGen4DietAct loader for each (metric, domain) pair.
    loader_names = {
        ('TF', 'DietType'): 'genDietTypeTFArray',
        ('TF', 'ActType'): 'genActTypeTFArray',
        ('TFIDF', 'DietType'): 'DietTypeTfidfArray',
        ('TFIDF', 'ActType'): 'ActTypeTfidfArray',
    }
    for domain in Domain:
        loader_name = loader_names.get((metric, domain))
        if loader_name is not None:
            X = getattr(dataGen4DietAct, loader_name)()
        X = utilise.normArray(X)
        projected = PCA(n_components=2).fit_transform(X)

        for n_clusters in (2, 3, 4, 5, 6):
            model = KMeans(n_clusters=n_clusters, n_init=300)
            model.fit(projected)
            cluster_labels = model.labels_

            # NOTE(review): the labels come from the 2-D projection but the
            # silhouette is evaluated on the full normalized array X --
            # confirm this is intended.
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters,
                  'The average silhouette_score is :', silhouette_avg)
Exemplo n.º 5
0
def visTFIDFMatrix():
    """Save matshow images of the normalized TF-IDF matrices.

    Six figures are written under 'visTForTFIDFMatrix/': activity items,
    diet items, their combination, diet types, activity types and the
    combination of the two type matrices.  Returns nothing.
    """
    def _save(matrix, name):
        # One figure per matrix; the title doubles as the file name.
        plt.figure()
        plt.matshow(matrix)
        plt.colorbar()
        plt.title(name)
        plt.savefig('visTForTFIDFMatrix/' + name)

    act_items = utilise.normArray(dataGen4DietAct.ActItemTfidfArray())
    _save(act_items, 'actTFIDFMatrix')

    diet_items = utilise.normArray(dataGen4DietAct.DietItemTfidfArray())
    _save(diet_items, 'dietTFIDFMatrix')

    _save(utilise.genCombiArray(act_items, diet_items), 'actDietTFIDFMatrix')

    diet_types = utilise.normArray(dataGen4DietAct.DietTypeTfidfArray())
    _save(diet_types, 'dietTypeTFIDFMatrix')

    act_types = utilise.normArray(dataGen4DietAct.ActTypeTfidfArray())
    _save(act_types, 'actTypeTFIDFMatrix')

    _save(utilise.genCombiArray(act_types, diet_types),
          'actDietTypeTFIDFMatrix')
Exemplo n.º 6
0
def visTFMatrix():
    """Save matshow images of the normalized term-frequency matrices.

    Six figures are written under 'visTForTFIDFMatrix/': activity items,
    diet items, their combination, diet types, activity types and the
    combination of the two type matrices.  Returns nothing.
    """
    def _save(matrix, name):
        # One figure per matrix; the title doubles as the file name.
        plt.figure()
        plt.matshow(matrix)
        plt.colorbar()
        plt.title(name)
        plt.savefig('visTForTFIDFMatrix/' + name)

    act_items = utilise.normArray(dataGen4DietAct.genActItemTFArray())
    _save(act_items, 'actTFMatrix')

    diet_items = utilise.normArray(dataGen4DietAct.genDietItemTFArray())
    _save(diet_items, 'dietTFMatrix')

    _save(utilise.genCombiArray(act_items, diet_items), 'actDietTFMatrix')

    diet_types = utilise.normArray(dataGen4DietAct.genDietTypeTFArray())
    _save(diet_types, 'dietTypeTFMatrix')

    act_types = utilise.normArray(dataGen4DietAct.genActTypeTFArray())
    _save(act_types, 'actTypeTFMatrix')

    _save(utilise.genCombiArray(act_types, diet_types), 'actDietTypeTFMatrix')
Exemplo n.º 7
0
def KM_AtificialData():
    df, cols = artificialDataGenerator.artificialData()
    for domain in Domain:
        print df.columns
        if domain == 'DietType':
            df_temp = df[[
                'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP',
                'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP',
                'vegetables'
            ]]
        else:
            df_temp = df[[
                'entertainmentRelax', 'others', 'social', 'sport',
                'transportation1', 'transportation2', 'transportation3',
                'workStudy'
            ]]
        X = df_temp.as_matrix()
        X = utilise.normArray(X)

        range_n_clusters = [2, 3, 4, 5, 6]

        for n_clusters in range_n_clusters:
            kmeans = KMeans(n_clusters=n_clusters, n_init=3000)
            kmeans.fit(X)
            labels = kmeans.labels_
            inertia = kmeans.inertia_

            plt.figure()
            reduced_data = PCA(n_components=2).fit_transform(X)
            N = np.max(labels) + 1
            for k in range(N):
                class_members = labels == k
                if k == 0:
                    for x in reduced_data[class_members]:
                        plt.plot(x[0], x[1], 'go', markersize=5)
                if k == 1:
                    for x in reduced_data[class_members]:
                        plt.plot(x[0], x[1], 'ro', markersize=5)
                if k == 2:
                    for x in reduced_data[class_members]:
                        plt.plot(x[0], x[1], 'bo', markersize=5)
                if k == 3:
                    for x in reduced_data[class_members]:
                        plt.plot(x[0], x[1], 'yo', markersize=5)
#            for i in range(reduced_data.shape[0]):
#                plt.text(reduced_data[i, 0], reduced_data[i, 1],i)
            plt.title('K-means clustering (PCA-reduced data)')
            plt.savefig('visClustering' + domain +
                        'Pattern/KMeans_TF_artificial_' + str(n_clusters))

            print domain, n_clusters, inertia, labels
def sihouetteScore4DC(metric):
    """Print average silhouette scores of K-means on the validation4DC arrays.

    For every entry of the module-level ``Domain`` list the array from
    validation4DC is normalized with utilise.normArray() and clustered
    with K-means for 2 to 6 clusters; the average silhouette score of
    each clustering is printed.

    metric -- only echoed in the printed line; it does not affect the
              computation.

    Returns nothing.
    """
    for domain in Domain:
        if domain == 'ActType':
            X = validation4DC.getActTypeTFArray4DC()
        elif domain == 'DietType':
            X = validation4DC.getDietTypeTFArray4DC()
        X = utilise.normArray(X)

        for n_clusters in (2, 3, 4, 5, 6):
            model = KMeans(n_clusters=n_clusters, n_init=300)
            model.fit(X)

            # silhouette_score gives the average value over all samples.
            silhouette_avg = silhouette_score(X, model.labels_)
            print(metric, domain, 'For n_clusters =', n_clusters,
                  'The average silhouette_score is :', silhouette_avg)
def singleSubjectDailyArray(domain, subjectID):
    '''
	build daily item TFIDF normalization array 
	'''
    if domain == 'ActItem':
        item_dict = dataGen4DietAct.genActItemDict()
    elif domain == 'DietItem':
        item_dict = dataGen4DietAct.genDietItemDict()

    duration = dietActInfoRetrv.getDuration(subjectID)
    x = duration
    n = len(item_dict)
    dims = (x, n)
    array = np.zeros(dims)

    if domain == 'ActItem':
        for i in range(duration):
            ItemIndex = buildItemIndex.build_daily_single_activity_index(
                subjectID, i + 1)
            for key in item_dict:
                if "'" + item_dict[key] + "'" in ItemIndex:
                    array[i, key] = ItemIndex["'" + item_dict[key] + "'"]
    if domain == 'DietItem':
        for i in range(duration):
            ItemIndex = buildItemIndex.build_daily_single_diet_index(
                subjectID, i + 1)
            for key in item_dict:
                if "'" + item_dict[key] + "'" in ItemIndex:
                    array[i, key] = ItemIndex["'" + item_dict[key] + "'"]

    transformer = TfidfTransformer(norm=None)
    tfidf = transformer.fit_transform(array)
    aa = tfidf.toarray()
    tfidfNorm = utilise.normArray(aa)
    print tfidfNorm.shape
    return tfidfNorm
Exemplo n.º 10
0
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 16 20:12:45 2016

@author: jingjing
"""

from sklearn.cluster import DBSCAN
import utilise

Domain = ['DietType', 'ActType']
for domain in Domain:

    if domain == 'DietType':
        X = utilise.genDietTypeTFArray()
    elif domain == 'ActType':
        X = utilise.genActTypeTFArray()
    X = utilise.normArray(X)

    db = DBSCAN(0.8, 1).fit(X)
    labels = db.labels_
    print db.components_
    print labels
def buildSubAveInfo():
    """Write per-subject summary data to 'SubAveInfo.xls'.

    Three sheets are produced:

    * 'AveInfo' -- one row per entry of ``sleep_list`` with the activity
      and diet group, sleep hours, median heart-rate values and the values
      returned by slpInfoRetrv.getDemoGInfo().
    * 'DietTF'  -- one row per entry of ``available_list`` with the diet
      group and the normalized diet-type term frequencies.
    * 'ActTF'   -- the same for activity types.

    Group membership comes from dietActInfoRetrv.getGroups() applied to
    the module-level ``labelsActType`` / ``labelsDietType``.  Returns
    nothing.
    """
    def _write_group(sheet, row, col, groups, subject):
        # Record the first group whose member collection holds the subject.
        for key in groups:
            if subject in groups[key]:
                sheet.write(row, col, key)
                break

    def _write_tf_sheet(sheet_name, group_title, groups, type_dict, tf_array):
        # One sheet of normalized term frequencies, a row per subject.
        sheet = workbookW.add_sheet(sheet_name)
        item_names = utilise.itemDict2list(type_dict())
        X = utilise.normArray(tf_array())

        sheet.write(0, 0, 'SubjId')
        sheet.write(0, 1, group_title)
        for col, name in enumerate(item_names, 2):
            sheet.write(0, col, name)

        for index, subject in enumerate(available_list):
            row = index + 1
            sheet.write(row, 0, subject)
            _write_group(sheet, row, 1, groups, subject)
            for i in range(len(item_names)):
                sheet.write(row, i + 2, X[index][i])

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('AveInfo')

    groupAct = dietActInfoRetrv.getGroups(labelsActType)
    groupDiet = dietActInfoRetrv.getGroups(labelsDietType)

    (Age, Gender, Height, Weight, BMI, FatFree, FatMass, PercFat,
     Vo2max) = slpInfoRetrv.getDemoGInfo()
    # Per-subject series in the order of columns 3..15 of the sheet.
    measures = [
        slpInfoRetrv.getSlpHours(),
        slpInfoRetrv.getMedianHR(),
        slpInfoRetrv.getMedianHRBefore(),
        slpInfoRetrv.getMedianHRAfter(),
        Age, Gender, Height, Weight, BMI, FatFree, FatMass, PercFat, Vo2max,
    ]

    titles = [
        'SubjId', 'ActGroup', 'DietGroup', 'HoursSleep', 'MedianHR',
        'MedianHRBefore', 'MedianHRAfter', 'age', 'gender', 'height', 'weight',
        'BMI', 'FatFreeMass', 'FatMass', 'PercFat', 'vo2max'
    ]
    for col, title in enumerate(titles):
        ws.write(0, col, title)

    for index, subject in enumerate(sleep_list):
        row = index + 1
        ws.write(row, 0, subject)
        _write_group(ws, row, 1, groupAct, subject)
        _write_group(ws, row, 2, groupDiet, subject)
        for col, series in enumerate(measures, 3):
            ws.write(row, col, series[index])

    _write_tf_sheet('DietTF', 'DietGroup', groupDiet,
                    dataGen4DietAct.genDietTypeDict,
                    dataGen4DietAct.genDietTypeTFArray)
    _write_tf_sheet('ActTF', 'ActGroup', groupAct,
                    dataGen4DietAct.genActTypeDict,
                    dataGen4DietAct.genActTypeTFArray)

    workbookW.save('SubAveInfo.xls')
Exemplo n.º 12
0
def KM(domain, n_clusters):
    #    if domain == 'DietType':
    #        X = dataGen4DietAct.genDietTypeTFArray()
    #    elif domain == 'ActType':
    #        X = dataGen4DietAct.genActTypeTFArray()
    #    X = utilise.normArray(X)

    # if domain == 'DietType':
    # Similarity_dict = utilise.SimilarityDict(domain,'TFEclud')
    # elif domain == 'ActType':
    # Similarity_dict = utilise.SimilarityDict(domain,'TFEclud')
    # X = visSimilarityMat.similarityDict2array(Similarity_dict,0)

    if domain == 'DietType':
        X = validation4DC.getDietTypeTFArray4DC()
    elif domain == 'ActType':
        X = validation4DC.getActTypeTFArray4DC()
    X = utilise.normArray(X)

    # print X
    # print X.shape

    Inertia = []
    Labels = []
    # range_n_clusters = [2, 3, 4, 5, 6]
    # range_n_clusters = [4]

    # for n_clusters in range_n_clusters:
    for j in range(300):

        kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)

        # kmeans.fit(reduced_data)
        kmeans.fit(X)
        inertia = kmeans.inertia_
        Inertia.append(inertia)
        # print domain,inertia
        labels = kmeans.labels_
        Labels.append(labels)
        # print labels

    min = np.min(Inertia)
    for i in range(len(Inertia)):
        if Inertia[i] == min:
            inertia = Inertia[i]
            labels = Labels[i]

    plt.figure()
    reduced_data = PCA(n_components=2).fit_transform(X)
    N = np.max(labels) + 1
    for k in range(N):
        class_members = labels == k
        if k == 0:
            for x in reduced_data[class_members]:
                plt.plot(x[0], x[1], 'go', markersize=5)
        if k == 1:
            for x in reduced_data[class_members]:
                plt.plot(x[0], x[1], 'ro', markersize=5)
        if k == 2:
            for x in reduced_data[class_members]:
                plt.plot(x[0], x[1], 'bo', markersize=5)
        if k == 3:
            for x in reduced_data[class_members]:
                plt.plot(x[0], x[1], 'yo', markersize=5)
    for i in range(reduced_data.shape[0]):
        plt.text(reduced_data[i, 0], reduced_data[i, 1], i)
    plt.title('K-means clustering (PCA-reduced data)')
    plt.savefig('visClustering' + domain + 'Pattern/KMeans_TF_' +
                str(n_clusters))

    # a,b = kMeans(X,2)
    # print b[:,0].shape
    # print a,b[:,0].ravel()
    # print sum(b[:,1].ravel())

    print domain, n_clusters, inertia, labels
def clusteringKmeansLabelsNewSubs():
    
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0
    
    df = newDataProcess.newSubInfo() 
    for domain in Domain:
        print df.columns 
        if domain == 'DietType':
            df_temp = df[['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP','meatP','seafood','snack','starchyP','vegetables']]
            row_labels = df_temp.columns 
            X = df_temp.as_matrix()
            X = utilise.normArray(X)
            kmeans = KMeans(n_clusters=2, n_init = 3000)
            kmeans.fit(X)
            labels = kmeans.labels_ 
        else:
            df_temp = df[['leisure','social','sport','walk','car','bike','workStudy']]
            row_labels = df_temp.columns             
            X = df_temp.as_matrix()
            X = utilise.normArray(X)
            kmeans = KMeans(n_clusters=3, n_init = 3000)
            kmeans.fit(X)
            labels = kmeans.labels_ 
        
        # write the lables to excel file  
        col = 0
        for label in row_labels:
            ws.write(rowW,col,label)
            col += 1 
        rowW += 1 
        
        # print type(labels)
        plt.figure()
        
        n_clusters = np.max(labels) + 1
        
        for k in range(n_clusters):
            class_members = labels == k
            group = [] 
            for x in X[class_members]:
                group.append(x)
            group = np.array(group)
            
            meanVec = np.mean(group,axis=0)
            meanVec.tolist()
            stdVec = np.std(group,axis=0)
            stdVec.tolist() 
            print stdVec
            
            # write the mean vector of each group to excel file 
            col = 0
            for value in meanVec:
                ws.write(rowW,col,value)
                col += 1 
            rowW += 1 
            # print meanVec 
            
            firstMax = np.max(meanVec)
            # print firstMax
            tempVec = np.copy(meanVec)
            for j in range(X.shape[1]):
                if tempVec[j] == firstMax:
                    tempVec[j] = 0
            secondMax = np.max(tempVec)
            # print secondMax
            tempVec2 = np.copy(tempVec)
            for j in range(X.shape[1]):
                if tempVec2[j]==secondMax:
                    tempVec2[j] = 0
            thirdMax = np.max(tempVec2)
            # print thirdMaxO

            
            x = range(X.shape[1])
            plt.plot(x,meanVec)
#            plt.errorbar(x,meanVec,yerr=stdVec)
            # print meanVec
            for j in range(X.shape[1]):
                # if meanVec[j] == firstMax:
                # if meanVec[j] == firstMax or meanVec[j] == secondMax:
                if meanVec[j] == firstMax or meanVec[j] == secondMax or meanVec[j] == thirdMax:
                    ws.write(rowW,0,k)
                    ws.write(rowW,1,domain)
                    ws.write(rowW,2,row_labels[j])
                    ws.write(rowW,3,meanVec[j])
                    rowW += 1 
                    print k,domain,n_clusters,meanVec[j],row_labels[j]
                plt.text(x[j],meanVec[j],row_labels[j])

        # print row_labels
        # plt.xlabel(row_labels)
        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_NewDataSubs_'+str(n_clusters)+'_groupFreq')
    
    workbookW.save('tempLabels.xls')
def bestLabel(labelsDietType,labelsActType):

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0

    for domain in Domain:
        if domain == 'DietType':
            labels = utilise.string2array(labelsDietType) 
            row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
            X = dataGen4DietAct.genDietTypeTFArray()
        elif domain == 'ActType':
            labels = utilise.string2array(labelsActType)
            row_labels = utilise.itemDict2list(dataGen4DietAct.genActTypeDict())
            X = dataGen4DietAct.genActTypeTFArray()
        X = utilise.normArray(X)
        
        
        # write the lables to excel file  
        col = 0
        for label in row_labels:
            ws.write(rowW,col,label)
            col += 1 
        rowW += 1 
        
        # print type(labels)
        plt.figure()
        
        n_clusters = np.max(labels) + 1 
        
        for k in range(n_clusters):
            class_members = labels == k
            group = [] 
            for x in X[class_members]:
                group.append(x)
            group = np.array(group)
            
            meanVec = np.mean(group,axis=0)
            meanVec.tolist()
            stdVec = np.std(group,axis=0)
            stdVec.tolist() 
            
            # write the mean vector of each group to excel file 
            col = 0
            for value in meanVec:
                ws.write(rowW,col,value)
                col += 1 
            rowW += 1 
            # print meanVec 
            
            # we don't have to do normalization here, as the input X has already been normalized 
            # totalSum = np.sum(meanVec[0])
            # print totalSum
            # meanVec = meanVec/totalSum
            
            # # normalize the meanVec 
            # firstMax = np.max(meanVec)
            # meanVec = meanVec/firstMax
            
            firstMax = np.max(meanVec)
            # print firstMax
            tempVec = np.copy(meanVec)
            for j in range(X.shape[1]):
                if tempVec[j] == firstMax:
                    tempVec[j] = 0
            secondMax = np.max(tempVec)
            # print secondMax
            tempVec2 = np.copy(tempVec)
            for j in range(X.shape[1]):
                if tempVec2[j]==secondMax:
                    tempVec2[j] = 0
            thirdMax = np.max(tempVec2)
            # print thirdMax

            
            x = range(X.shape[1])
            plt.plot(x,meanVec)
            # print meanVec
            for j in range(X.shape[1]):
                # if meanVec[j] == firstMax:
                # if meanVec[j] == firstMax or meanVec[j] == secondMax:
                if meanVec[j] == firstMax or meanVec[j] == secondMax or meanVec[j] == thirdMax:
                    print k,domain,n_clusters,meanVec[j],row_labels[j]
                    plt.text(x[j],meanVec[j],row_labels[j])

        # print row_labels
        # plt.xlabel(row_labels)
        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_'+str(n_clusters)+'_groupFreq')
    
    workbookW.save('tempLabels.xls')
def HC(domain, para):
    if para in Metric:
        if para == 'TF':
            if domain == 'DietItem':
                X = dataGen4DietAct.genDietItemTFArray()
            elif domain == 'ActItem':
                X = dataGen4DietAct.genActItemTFArray()
            elif domain == 'DietType':
                X = dataGen4DietAct.genDietTypeTFArray()
            elif domain == 'ActType':
                X = dataGen4DietAct.genActTypeTFArray()
        elif para == 'TFIDF':
            if domain == 'DietItem':
                X = dataGen4DietAct.DietItemTfidfArray()
            elif domain == 'ActItem':
                X = dataGen4DietAct.ActItemTfidfArray()
            elif domain == 'DietType':
                X = dataGen4DietAct.DietTypeTfidfArray()
            elif domain == 'ActType':
                X = dataGen4DietAct.ActTypeTfidfArray()
        X = utilise.normArray(X)

    if para in Sim:
        Similarity_dict = {}
        if domain == 'DietItem':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'ActItem':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'DietType':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        elif domain == 'ActType':
            Similarity_dict = utilise.SimilarityDict(domain, para)
        X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)

    # method can be ward, complete, average
    method = 'ward'
    row_method = method
    row_metric = 'euclidean'
    column_method = method
    column_metric = 'euclidean'

    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.pdist.html
    # d1 = ssd.pdist(X,'cosine')
    d1 = ssd.pdist(X)
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform
    D1 = ssd.squareform(d1)  # full matrix
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    Y1 = sch.linkage(D1, method=row_method, metric=row_metric)
    row_idxing = sch.leaves_list(Y1)

    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.pdist.html
    d2 = ssd.pdist(X.T)
    # http://docs.scipy.org/doc/scipy-0.16.0/reference/generated/scipy.spatial.distance.squareform.html#scipy.spatial.distance.squareform
    D2 = ssd.squareform(d2)
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage
    Y2 = sch.linkage(D2, method=column_method, metric=column_metric)
    col_idxing = sch.leaves_list(Y2)

    heatmap_array = X[:, col_idxing][
        row_idxing, :]  #a numpy.ndarray or numpy.matrix, for this example, let's say mxn array
    top_dendrogram = Y2  #a (n-1) x 4 array
    side_dendrogram = Y1  #a (m-1) x 4 array

    row_labels = range(X.shape[0])
    if para in Sim:
        col_labels = range(X.shape[1])
    if para in Metric:
        if domain == 'DietItem':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genDietItemDict())
        elif domain == 'ActItem':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genActItemDict())
        elif domain == 'DietType':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genDietTypeDict())
        elif domain == 'ActType':
            col_labels = utilise.itemDict2list(
                dataGen4DietAct.genActTypeDict())
    col_idxing = list(col_idxing)
    row_idxing = list(row_idxing)
    print col_idxing

    new_row_labels = []
    new_col_labels = []
    for i in range(len(row_idxing)):
        new_row_labels.append(str(row_labels[row_idxing[i]]))
    for j in range(len(col_idxing)):
        new_col_labels.append(str(col_labels[col_idxing[j]]))

    heatmap = pdh.DendroHeatMap(heat_map_data=heatmap_array,
                                left_dendrogram=side_dendrogram,
                                top_dendrogram=top_dendrogram)
    heatmap.title = 'HC_' + domain + '_' + para + '_' + method
    heatmap.row_labels = new_row_labels
    heatmap.col_labels = new_col_labels

    # heatmap.show()
    heatmap.export('VisClustering' + domain + 'Pattern/Hierarchy_' + para +
                   '_' + method + '.png')