def buildSingleDietExcel(subjectID):
    """Write one subject's per-period diet-type counts to an .xls workbook.

    Sheet index 3 of ``subject_template_<subjectID>.xlsx`` is scanned from
    row index 8 onward; every row whose first cell is non-empty counts as
    one day (numbered from 1).  For each day,
    ``buildTypeIndex.build_daily_single_diet_index_with_time4DC`` is indexed
    by the period names P1..P6, and the counts of the known diet types are
    copied into a zero-initialised table.

    Each output row holds: subject id, the first-cell value, the stringified
    diet-type keys, then for every period its name followed by the
    stringified counts.  The result is saved as
    ``diet/dietTable_<subjectID>_withFreqP.xls``.
    """
    periods = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6']

    template = xlrd.open_workbook('subject_template_' + subjectID + '.xlsx')
    source_sheet = template.sheet_by_index(3)
    out_book = xlwt.Workbook()
    out_sheet = out_book.add_sheet('sheet1')
    diet_types = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())

    out_row = 0
    day_no = 0
    for src_row in range(8, source_sheet.nrows):
        day_cell = source_sheet.cell_value(src_row, 0)
        if not day_cell:
            continue
        day_no += 1

        # One zeroed {diet type: count} table per period.
        counts = {}
        for period in periods:
            counts[period] = dict.fromkeys(diet_types, 0)

        daily = buildTypeIndex.build_daily_single_diet_index_with_time4DC(
            subjectID, day_no)
        for period in counts:
            observed = daily[period]
            for diet_type in observed:
                # Types outside the known label list are ignored.
                if diet_type in counts[period]:
                    counts[period][diet_type] = observed[diet_type]

        out_sheet.write(out_row, 0, subjectID)
        out_sheet.write(out_row, 1, day_cell)
        out_sheet.write(out_row, 2, str(counts['P1'].keys()))
        # Columns 3/4 hold P1, 5/6 hold P2, ... 13/14 hold P6.
        for offset, period in enumerate(periods):
            out_sheet.write(out_row, 3 + 2 * offset, period)
            out_sheet.write(out_row, 4 + 2 * offset,
                            str(counts[period].values()))
        out_row += 1

    out_book.save('diet/dietTable_' + subjectID + '_withFreqP.xls')
def buildSingleDietExcel(subjectID):
    """Write one subject's daily diet-type totals to an .xls workbook.

    Sheet index 3 of ``subject_template_<subjectID>.xlsx`` is scanned from
    row index 8 onward; every row whose first cell is non-empty counts as
    one day (numbered from 1).  For each day the counts returned by
    ``buildTypeIndex.build_daily_single_diet_index_with_time4DC`` are summed
    over all of its top-level keys, per diet type.

    Each output row holds: subject id, the first-cell value, then one total
    per diet-type label, in label order.  The result is saved as
    ``diet/dietTable_<subjectID>_withFreq4DC.xls``.
    """
    template = xlrd.open_workbook('subject_template_' + subjectID + '.xlsx')
    sheet = template.sheet_by_index(3)
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())

    rowW = 0
    index = 0
    for rowR in range(8, sheet.nrows):
        day_cell = sheet.cell_value(rowR, 0)
        if not day_cell:
            continue
        index += 1

        totals = dict.fromkeys(row_labels, 0)
        by_period = buildTypeIndex.build_daily_single_diet_index_with_time4DC(
            subjectID, index)
        for period in by_period:
            period_counts = by_period[period]
            for diet_type in period_counts:
                # Skip types that are not in the label list instead of
                # raising KeyError, as the other diet-table builders do.
                if diet_type in totals:
                    totals[diet_type] += period_counts[diet_type]

        ws.write(rowW, 0, subjectID)
        ws.write(rowW, 1, day_cell)
        for offset, label in enumerate(row_labels):
            ws.write(rowW, 2 + offset, totals[label])
        rowW += 1

    workbookW.save('diet/dietTable_' + subjectID + '_withFreq4DC.xls')
def getDietTypeTFArray4DC():
    """Return the subjects-by-diet-types frequency matrix.

    Row ``i`` belongs to the ``i``-th entry of ``available_list``; the
    columns are the keys of ``dataGen4DietAct.genDietTypeDict()``.  Each
    cell accumulates, over every day of the subject's duration and every
    top-level key of the daily index, the count recorded for that diet
    type.
    """
    type_dict = dataGen4DietAct.genDietTypeDict()
    tf = np.zeros((len(available_list), len(type_dict)))

    for row, subject in enumerate(available_list):
        n_days = dietActInfoRetrv.getDuration(subject)
        for day in range(1, n_days + 1):
            by_period = buildTypeIndex.build_daily_single_diet_index_with_time4DC(
                subject, day)
            for period in by_period:
                period_counts = by_period[period]
                for col in type_dict:
                    type_name = type_dict[col]
                    if type_name in period_counts:
                        tf[row, col] += period_counts[type_name]
    return tf
def buildSingleDietExcel(subjectID):
    """Write one subject's daily diet-type frequencies to an .xls workbook.

    Sheet index 3 of ``subject_template_<subjectID>.xlsx`` is scanned from
    row index 8 onward; every row whose first cell is non-empty counts as
    one day (numbered from 1).  The counts for day ``k`` are read from
    ``diet/dietTypeFreq/dietType_frequency_<subjectID>_<k>.txt``, where the
    first token of a line is the diet type and the second its integer count.

    Each output row holds: subject id, the first-cell value, the stringified
    diet-type keys and the stringified counts.  The result is saved as
    ``diet/dietTable_<subjectID>_withFreq.xls``.

    Raises:
        IOError: if a per-day frequency file is missing.
        ValueError: if the second token of a matching line is not an integer.
    """
    template = xlrd.open_workbook('subject_template_' + subjectID + '.xlsx')
    sheet = template.sheet_by_index(3)
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())

    rowW = 0
    index = 0
    for rowR in range(8, sheet.nrows):
        day_cell = sheet.cell_value(rowR, 0)
        if not day_cell:
            continue
        index += 1

        dd = dict.fromkeys(row_labels, 0)
        freq_path = ('diet/dietTypeFreq/dietType_frequency_' + subjectID +
                     '_' + str(index) + '.txt')
        # The context manager closes the file even if parsing fails.
        with open(freq_path, 'r') as freq_file:
            for line in freq_file:
                words = wordpunct_tokenize(line.strip('\n'))
                # Blank or one-token lines carry no (type, count) pair.
                if len(words) < 2:
                    continue
                if words[0] in dd:
                    dd[words[0]] = int(words[1])

        ws.write(rowW, 0, subjectID)
        ws.write(rowW, 1, day_cell)
        ws.write(rowW, 2, str(dd.keys()))
        ws.write(rowW, 3, str(dd.values()))
        rowW += 1

    workbookW.save('diet/dietTable_' + subjectID + '_withFreq.xls')
def buildDietWithSleepExcel():
    '''
    Join the per-day diet table with the sleep data matrix and save the
    merged rows to 'diet/dietTableWithSleep_withFreq4DC.xls'.

    Diet rows are read from sheet 0 of 'diet/dietTable_withFreq4DC.xls'
    (column 0: subject id, column 1: date string, columns 2-13: values) and
    sleep rows from sheet 0 of 'allSubjectsSleepDatamatrix.xls' (column 0:
    numeric subject id, column 1: date string, columns 5-22: values).
    Every (diet row, sleep row) pair with equal subject id and equal date
    string yields one output row: subject, date, 12 diet columns, 18 sleep
    columns.  Dates are dot-separated strings with three fields.
    '''
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    row_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
    # NOTE(review): 19 trailing titles are declared here but only 18 sleep
    # columns (14..31) are filled below -- confirm the intended layout.
    titles = ['SubjId', 'Day'] + row_labels + [
        'Morningness', 'Eveningness', 'Lark', 'Owl', 'HoursSleep',
        'SleepMoveCount', 'SleepQuality', 'MedianHR', 'MedianBefore',
        'MedianHRAfter', 'age', 'gender', 'height', 'weight', 'BMI',
        'FatFreeMass', 'FatMass', 'PercFat', 'vo2max'
    ]
    # Header row.
    for i in range(len(titles)):
        ws.write(0, i, titles[i])
    rowW = 1  # next output row; row 0 holds the titles
    file_location1 = 'diet/dietTable_withFreq4DC.xls'
    workbookR1 = xlrd.open_workbook(file_location1)
    sheet1 = workbookR1.sheet_by_index(0)
    file_location2 = 'allSubjectsSleepDatamatrix.xls'
    workbookR2 = xlrd.open_workbook(file_location2)
    sheet2 = workbookR2.sheet_by_index(0)
    for rowRDiet in range(0, sheet1.nrows):
        # Sleep sheet row 0 is skipped (presumably a header -- TODO confirm).
        for rowRSlp in range(1, sheet2.nrows):
            # The sleep sheet stores the subject id as a number; rebuild the
            # string form with a leading '0' to compare with the diet sheet.
            sub = unicode(int(sheet2.cell_value(rowRSlp, 0)))
            sub = '0' + sub
            if sheet1.cell_value(rowRDiet, 0) == sub:
                if sheet1.cell_value(rowRDiet, 1) == sheet2.cell_value(rowRSlp, 1):
                    if rowRSlp < sheet2.nrows - 1:
                        if sheet2.cell_value(rowRSlp, 1) == sheet2.cell_value(
                                rowRSlp + 1, 1):
                            # Two consecutive sleep rows share this date: the
                            # first one is re-dated by decrementing the middle
                            # date field and paired with the previous diet row.
                            # NOTE(review): the decrement can yield 0; no
                            # roll-over handling is visible here -- confirm.
                            day = sheet2.cell_value(rowRSlp, 1)
                            temp = int(day.split('.')[1]) - 1
                            day = day.split('.')[0] + '.' + str(
                                temp) + '.' + day.split('.')[2]
                            if rowRDiet >= 1:
                                # Normalise the previous diet row's date the
                                # same way (int() drops any leading zero).
                                dd = sheet1.cell_value(rowRDiet - 1, 1)
                                temp = int(dd.split('.')[1])
                                dd = dd.split('.')[0] + '.' + str(
                                    temp) + '.' + dd.split('.')[2]
                                if dd == day:
                                    for i in range(2, 14):
                                        ws.write(
                                            rowW, i,
                                            sheet1.cell_value(rowRDiet - 1, i))
                                else:
                                    # Previous diet row is not the re-dated
                                    # day: stop scanning sleep rows for this
                                    # diet row.
                                    break
                            else:
                                # No previous diet row exists.
                                break
                        else:
                            # Ordinary match: normalise the middle date field
                            # and copy this diet row's values.
                            day = sheet2.cell_value(rowRSlp, 1)
                            temp = int(day.split('.')[1])
                            day = day.split('.')[0] + '.' + str(
                                temp) + '.' + day.split('.')[2]
                            for i in range(2, 14):
                                ws.write(rowW, i,
                                         sheet1.cell_value(rowRDiet, i))
                    else:
                        # Last sleep row: there is no following row to compare
                        # with, so treat it as an ordinary match.
                        day = sheet2.cell_value(rowRSlp, 1)
                        temp = int(day.split('.')[1])
                        day = day.split('.')[0] + '.' + str(
                            temp) + '.' + day.split('.')[2]
                        for i in range(2, 14):
                            ws.write(rowW, i, sheet1.cell_value(rowRDiet, i))
                    ws.write(rowW, 0, sub)
                    ws.write(rowW, 1, day)
                    # Output columns 14..31 take sleep-sheet columns 5..22.
                    for i in range(14, 32):
                        ws.write(rowW, i, sheet2.cell_value(rowRSlp, i - 9))
                    rowW += 1
    workbookW.save('diet/dietTableWithSleep_withFreq4DC.xls')
def buildSubAveInfo():
    """Export per-subject summary tables to ``SubAveInfo.xls``.

    Three sheets are written:

    * ``AveInfo`` -- one row per entry of ``sleep_list``: subject id,
      activity group, diet group, then the sleep / heart-rate / demographic
      values returned by ``slpInfoRetrv``.
    * ``DietTF`` -- one row per entry of ``available_list``: subject id,
      diet group, then the normalised diet-type TF values.
    * ``ActTF`` -- same layout for activity types and activity groups.

    A group cell is left empty when the subject is in none of the groups.
    """

    def _write_group(sheet, row, col, subject, groups):
        # Write the first group key whose members contain the subject.
        for group_key in groups:
            if subject in groups[group_key]:
                sheet.write(row, col, group_key)
                break

    def _fill_tf_sheet(sheet, group_title, labels, matrix, groups):
        # Header row, then one row of TF values per available subject.
        sheet.write(0, 0, 'SubjId')
        sheet.write(0, 1, group_title)
        for offset, label in enumerate(labels):
            sheet.write(0, offset + 2, label)
        for position, subject in enumerate(available_list):
            row = position + 1
            sheet.write(row, 0, subject)
            _write_group(sheet, row, 1, subject, groups)
            for offset in range(len(labels)):
                sheet.write(row, offset + 2, matrix[position][offset])

    workbookW = xlwt.Workbook()
    summary = workbookW.add_sheet('AveInfo')
    groupAct = dietActInfoRetrv.getGroups(labelsActType)
    groupDiet = dietActInfoRetrv.getGroups(labelsDietType)
    (Age, Gender, Height, Weight, BMI,
     FatFree, FatMass, PercFat, Vo2max) = slpInfoRetrv.getDemoGInfo()
    SlpHours = slpInfoRetrv.getSlpHours()
    MedianHR = slpInfoRetrv.getMedianHR()
    MedianHRBefore = slpInfoRetrv.getMedianHRBefore()
    MedianHRAfter = slpInfoRetrv.getMedianHRAfter()

    titles = [
        'SubjId', 'ActGroup', 'DietGroup', 'HoursSleep', 'MedianHR',
        'MedianHRBefore', 'MedianHRAfter', 'age', 'gender', 'height',
        'weight', 'BMI', 'FatFreeMass', 'FatMass', 'PercFat', 'vo2max'
    ]
    for col, title in enumerate(titles):
        summary.write(0, col, title)

    # Per-subject sequences in the order of title columns 3..15.
    measures = [
        SlpHours, MedianHR, MedianHRBefore, MedianHRAfter, Age, Gender,
        Height, Weight, BMI, FatFree, FatMass, PercFat, Vo2max
    ]
    for position, subject in enumerate(sleep_list):
        row = position + 1
        summary.write(row, 0, subject)
        _write_group(summary, row, 1, subject, groupAct)
        _write_group(summary, row, 2, subject, groupDiet)
        for col, series in enumerate(measures, 3):
            summary.write(row, col, series[position])

    diet_sheet = workbookW.add_sheet('DietTF')
    _fill_tf_sheet(
        diet_sheet, 'DietGroup',
        utilise.itemDict2list(dataGen4DietAct.genDietTypeDict()),
        utilise.normArray(dataGen4DietAct.genDietTypeTFArray()),
        groupDiet)

    act_sheet = workbookW.add_sheet('ActTF')
    _fill_tf_sheet(
        act_sheet, 'ActGroup',
        utilise.itemDict2list(dataGen4DietAct.genActTypeDict()),
        utilise.normArray(dataGen4DietAct.genActTypeTFArray()),
        groupAct)

    workbookW.save('SubAveInfo.xls')
def singleSubjectDailyArray(domain, subjectID):
    """Build the daily term-frequency array for one subject.

    Args:
        domain: 'DietType', 'ActType', 'DietItem' or 'ActItem'; selects the
            item dictionary and the daily index builder.
        subjectID: subject identifier passed through to the index builders.

    Returns:
        A ``(duration, len(item_dict))`` float array.  Row ``i`` covers day
        ``i + 1``; the column is the item dictionary's key and the cell is
        the frequency recorded for that item on that day (0 when absent).
        The values are raw frequencies; no TF-IDF weighting or
        normalisation is applied.

    Raises:
        ValueError: if ``domain`` is not one of the four supported values.
    """
    # Resolve both domain-specific callables lazily so that only the modules
    # needed for the requested domain are touched.
    if domain == 'DietType':
        item_dict = dataGen4DietAct.genDietTypeDict()
        build_index = buildTypeIndex.build_daily_single_diet_index
    elif domain == 'ActType':
        item_dict = dataGen4DietAct.genActTypeDict()
        build_index = buildTypeIndex.build_daily_single_activity_index
    elif domain == 'DietItem':
        # Previously no dictionary was set for the item domains, so these
        # branches failed before producing an array.
        item_dict = dataGen4DietAct.genDietItemDict()
        build_index = buildItemIndex.build_daily_single_diet_index
    elif domain == 'ActItem':
        item_dict = dataGen4DietAct.genActItemDict()
        build_index = buildItemIndex.build_daily_single_activity_index
    else:
        raise ValueError('unknown domain: %r' % (domain,))

    duration = dietActInfoRetrv.getDuration(subjectID)
    array = np.zeros((duration, len(item_dict)))
    for day in range(duration):
        daily_index = build_index(subjectID, day + 1)  # days are 1-based
        for col in item_dict:
            name = item_dict[col]
            if name in daily_index:
                array[day, col] = daily_index[name]
    return array
def _saveDailyStackedBar(sub, daily_tf, labels, title_prefix, out_dir):
    """Plot one subject's daily TF array as a stacked bar chart and save it."""
    df = pd.DataFrame(daily_tf, columns=labels)
    ax = df.plot.bar(colormap=plt.cm.Paired, stacked=True)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
    plt.title(title_prefix + sub)
    plt.xlabel('days')
    plt.ylabel('frequency per day')
    ax.set_xticklabels(dietActInfoRetrv.getDaysList(sub))
    plt.savefig(out_dir + '/' + title_prefix + sub, bbox_inches='tight')
    # One figure is created per subject; release it once it is on disk so
    # figures do not pile up across the whole subject list.
    plt.close(ax.get_figure())


def visDailyPatternStack():
    """Save stacked bar charts of daily activity and diet type frequencies.

    For every subject in ``available_list`` two images are written:
    ``visDailyActTypePattStack/DailyActivityPattern_<sub>`` and
    ``visDailyDietTypePattStack/DailyDietPattern_<sub>``.  Each chart has one
    bar per day (tick labels from ``dietActInfoRetrv.getDaysList``) stacked
    by type.  All activity charts are produced before the diet charts.
    """
    act_labels = utilise.itemDict2list(dataGen4DietAct.genActTypeDict())
    for sub in available_list:
        _saveDailyStackedBar(
            sub, dataGen4DietAct.genDailySingleActTypeTFArray(sub),
            act_labels, 'DailyActivityPattern_', 'visDailyActTypePattStack')

    diet_labels = utilise.itemDict2list(dataGen4DietAct.genDietTypeDict())
    for sub in available_list:
        _saveDailyStackedBar(
            sub, dataGen4DietAct.genDailySingleDietTypeTFArray(sub),
            diet_labels, 'DailyDietPattern_', 'visDailyDietTypePattStack')
def bestLabel(labelsDietType, labelsActType):
    """Export and plot the mean TF vector of every cluster.

    For each entry of the module-level ``Domain`` that is 'DietType' or
    'ActType', the normalised TF array is split by cluster label and the
    mean vector of each cluster is

    * written as one row of ``tempLabels.xls`` (below a row holding the type
      names), and
    * drawn as one line of the figure saved to
      ``visClustering<domain>Pattern/KMeans__TF_<n_clusters>_groupFreq``,
      with the types holding the three largest distinct mean values
      annotated on the plot and printed to stdout.

    Args:
        labelsDietType: cluster labels for the diet-type domain, in the form
            accepted by ``utilise.string2array``.
        labelsActType: cluster labels for the activity-type domain, same form.
    """
    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0
    for domain in Domain:
        if domain == 'DietType':
            labels = utilise.string2array(labelsDietType)
            row_labels = utilise.itemDict2list(
                dataGen4DietAct.genDietTypeDict())
            X = dataGen4DietAct.genDietTypeTFArray()
        elif domain == 'ActType':
            labels = utilise.string2array(labelsActType)
            row_labels = utilise.itemDict2list(
                dataGen4DietAct.genActTypeDict())
            X = dataGen4DietAct.genActTypeTFArray()
        else:
            # No labels are supplied for other domains; skip them rather
            # than silently re-using the previous domain's data.
            continue
        X = utilise.normArray(X)

        # Header row with the type names.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        fig = plt.figure()
        n_clusters = np.max(labels) + 1
        columns = range(X.shape[1])
        for k in range(n_clusters):
            members = X[labels == k]
            if len(members) == 0:
                # Label value unused: keep its spreadsheet row empty so the
                # row offset still identifies the cluster.
                rowW += 1
                continue
            meanVec = np.mean(members, axis=0)

            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Three largest distinct values.  X is already normalised and
            # non-negative, so zeroing a found maximum removes it from the
            # next search.
            firstMax = np.max(meanVec)
            rest = np.where(meanVec == firstMax, 0, meanVec)
            secondMax = np.max(rest)
            rest = np.where(rest == secondMax, 0, rest)
            thirdMax = np.max(rest)

            plt.plot(columns, meanVec)
            for j in columns:
                value = meanVec[j]
                if (value == firstMax or value == secondMax
                        or value == thirdMax):
                    # Single pre-formatted string: prints the same text
                    # whether print is a statement or a function.
                    print('%s %s %s %s %s' % (k, domain, n_clusters, value,
                                              row_labels[j]))
                    plt.text(j, value, row_labels[j])

        plt.title(domain + '_TF_KMeans_' + str(n_clusters))
        plt.savefig('visClustering' + domain + 'Pattern/KMeans__TF_' +
                    str(n_clusters) + '_groupFreq')
        plt.close(fig)  # the figure is on disk; free it
    workbookW.save('tempLabels.xls')
def HC(domain, para):
    """Cluster a feature matrix hierarchically and export a dendrogram heat map.

    Rows and columns are clustered independently with Ward linkage on
    Euclidean distances, the matrix is reordered by the dendrogram leaf
    order, and the result is exported to
    ``VisClustering<domain>Pattern/Hierarchy_<para>_ward.png``.

    Args:
        domain: 'DietItem', 'ActItem', 'DietType' or 'ActType'.
        para: a member of the module-level ``Metric`` collection ('TF' or
            'TFIDF': the normalised TF / TF-IDF array is clustered and the
            columns are labelled with item names) or of ``Sim`` (the matrix
            built from ``utilise.SimilarityDict`` is clustered and columns
            are labelled by index).

    Raises:
        ValueError: if no matrix can be built for this domain/para pair.
    """
    # Names of the dataGen4DietAct generators, per domain.
    tf_builders = {
        'DietItem': 'genDietItemTFArray',
        'ActItem': 'genActItemTFArray',
        'DietType': 'genDietTypeTFArray',
        'ActType': 'genActTypeTFArray',
    }
    tfidf_builders = {
        'DietItem': 'DietItemTfidfArray',
        'ActItem': 'ActItemTfidfArray',
        'DietType': 'DietTypeTfidfArray',
        'ActType': 'ActTypeTfidfArray',
    }
    label_builders = {
        'DietItem': 'genDietItemDict',
        'ActItem': 'genActItemDict',
        'DietType': 'genDietTypeDict',
        'ActType': 'genActTypeDict',
    }

    X = None
    if para in Metric:
        if para == 'TF':
            builder = tf_builders.get(domain)
        elif para == 'TFIDF':
            builder = tfidf_builders.get(domain)
        else:
            builder = None
        if builder is None:
            raise ValueError('unsupported domain/para: %r, %r'
                             % (domain, para))
        X = utilise.normArray(getattr(dataGen4DietAct, builder)())
    if para in Sim:
        if domain in label_builders:
            Similarity_dict = utilise.SimilarityDict(domain, para)
        else:
            Similarity_dict = {}
        X = visSimilarityMat.similarityDict2array(Similarity_dict, 0)
    if X is None:
        raise ValueError('unsupported domain/para: %r, %r' % (domain, para))

    # method can be ward, complete, average
    method = 'ward'
    metric = 'euclidean'

    # linkage() expects the condensed distance vector produced by pdist().
    # A square-form matrix would be read as a set of observation vectors
    # and clustered on the wrong distances.
    Y1 = sch.linkage(ssd.pdist(X, metric), method=method)
    row_idxing = list(sch.leaves_list(Y1))
    Y2 = sch.linkage(ssd.pdist(X.T, metric), method=method)
    col_idxing = list(sch.leaves_list(Y2))

    # Reorder columns, then rows, to follow the dendrogram leaves.
    heatmap_array = X[:, col_idxing][row_idxing, :]

    row_labels = range(X.shape[0])
    if para in Sim:
        col_labels = range(X.shape[1])
    if para in Metric:
        col_labels = utilise.itemDict2list(
            getattr(dataGen4DietAct, label_builders[domain])())

    print(col_idxing)
    new_row_labels = [str(row_labels[i]) for i in row_idxing]
    new_col_labels = [str(col_labels[j]) for j in col_idxing]

    heatmap = pdh.DendroHeatMap(heat_map_data=heatmap_array,
                                left_dendrogram=Y1,   # (m-1) x 4 linkage
                                top_dendrogram=Y2)    # (n-1) x 4 linkage
    heatmap.title = 'HC_' + domain + '_' + para + '_' + method
    heatmap.row_labels = new_row_labels
    heatmap.col_labels = new_col_labels
    heatmap.export('VisClustering' + domain + 'Pattern/Hierarchy_' + para +
                   '_' + method + '.png')