def genderAnalysis():
    """Print bike usage split by gender, then work/study totals split by bike usage.

    Each half fetches its own data from
    ``artificialDataGenerator.artificialData()``. The second half used to be a
    separate ``bikeAnalysis`` function; its ``def`` line is commented out, so
    it now runs as part of this function.
    """
    # Part 1: 'transportation3' (bike) totals for gender == 1 (reported as men)
    # and gender == 0 (reported as women).
    frame, _ = artificialDataGenerator.artificialData()
    gender_reports = (
        (frame['gender'] == 1,
         'the total appearance of bike in men is ',
         'the number of men in the dataset is '),
        (frame['gender'] == 0,
         'the total appearance of bike in women is ',
         'the number of women in the dataset is '),
    )
    for mask, total_message, count_message in gender_reports:
        group = frame[mask]
        print(total_message)
        print(sum(group['transportation3']))
        print(count_message)
        print(group.shape[0])

    # Part 2 (formerly bikeAnalysis): 'workStudy' totals for rows with at
    # least one bike entry versus rows with none, on freshly fetched data.
    frame, _ = artificialDataGenerator.artificialData()
    bike_reports = (
        (frame['transportation3'] >= 1,
         'the total appearance of work/study in people who bike is ',
         'the number of people who bike in the dataset is '),
        (frame['transportation3'] == 0,
         'the total appearance of work/study in people who not bike is ',
         'the number of people who not bike in the dataset is '),
    )
    for mask, total_message, count_message in bike_reports:
        group = frame[mask]
        print(total_message)
        print(sum(group['workStudy']))
        print(count_message)
        print(group.shape[0])
def sihouetteScoreArtificialData(metric):
    """Print the average silhouette score of KMeans clusterings of the artificial data.

    For every entry of ``Domain`` the matching feature columns are selected,
    normalised with ``utilise.normArray`` and clustered with KMeans for
    k = 2..6; the average silhouette score of each clustering is printed.

    Args:
        metric: Only echoed in the printed output. It is not forwarded to
            ``silhouette_score``, which is called with its default settings.
    """
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP',
        'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP',
        'vegetables'
    ]
    activity_columns = [
        'entertainmentRelax', 'others', 'social', 'sport',
        'transportation1', 'transportation2', 'transportation3',
        'workStudy'
    ]

    frame, _ = artificialDataGenerator.artificialData()
    for domain in Domain:
        print(frame.columns)
        # Any domain other than 'DietType' falls back to the activity columns.
        selected = diet_columns if domain == 'DietType' else activity_columns
        X = utilise.normArray(frame[selected].as_matrix())

        for n_clusters in (2, 3, 4, 5, 6):
            model = KMeans(n_clusters=n_clusters, n_init=300)
            model.fit(X)
            cluster_labels = model.labels_

            # silhouette_score averages the per-sample silhouette values,
            # summarising how dense and well separated the clusters are.
            silhouette_avg = silhouette_score(X, cluster_labels)
            print(metric, domain, 'For n_clusters =', n_clusters,
                  'The average silhouette_score is :', silhouette_avg)
def artificialData():
    """Compute, per feature, the share of its total that falls in label 0 and label 1.

    The artificial data is reduced to a fixed set of columns, which are then
    renamed to short names. For every column except 'label', the positive
    values are summed separately for label 0 and label 1 and divided by the
    column's overall sum.

    Returns:
        A 4-tuple of dicts keyed by the short feature name:
        ``dd`` maps to ``[share_label_0, share_label_1]``, ``dd_low`` to the
        label-0 share, ``dd_high`` to the label-1 share and ``dd_diff`` to the
        difference between the larger and the smaller share.
    """
    source_columns = [
        'alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
        'seafood', 'snack', 'starchyP', 'vegetables', 'entertainmentRelax',
        'social', 'sport', 'transportation1', 'transportation2',
        'transportation3', 'workStudy', 'gender', 'label'
    ]
    # Short names, position for position with source_columns.
    short_names = [
        'alcohol', 'cafe', 'dairy', 'egg', 'fruit', 'grain', 'meat', 'seafood',
        'snack', 'starchy', 'vegetables', 'leisure', 'social', 'sport', 'walk',
        'car', 'bike', 'workStudy', 'gender', 'label'
    ]

    frame = artificialDataGenerator.artificialData()
    print(frame.columns)
    frame = frame[source_columns]
    frame.columns = short_names

    dd, dd_low, dd_high, dd_diff = {}, {}, {}, {}
    for feature in frame.columns:
        if feature == 'label':
            continue

        # Only rows where the feature is present contribute to the numerators.
        present = frame[frame[feature] > 0]
        # NOTE(review): a column whose values sum to zero makes this a
        # division by zero; the original code has the same behaviour.
        column_total = sum(frame[feature])
        share_label_0 = float(sum(present[present['label'] == 0][feature])) / column_total
        share_label_1 = float(sum(present[present['label'] == 1][feature])) / column_total

        dd[feature] = [share_label_0, share_label_1]
        dd_low[feature] = dd[feature][0]
        dd_high[feature] = dd[feature][1]
        dd_diff[feature] = max(dd[feature]) - min(dd[feature])

    return dd, dd_low, dd_high, dd_diff
def walkAnalysis():
    """Print caffeine totals and group sizes for rows with and without walking.

    Rows are split on 'transportation1' (>= 1 versus == 0); for each group the
    sum of 'caffeineD' and the number of rows are printed.
    """
    frame, _ = artificialDataGenerator.artificialData()
    reports = (
        (frame['transportation1'] >= 1,
         'the total appearance of cafe in people who walk is ',
         'the number of people who walk in the dataset is '),
        (frame['transportation1'] == 0,
         'the total appearance of cafe in people who not walk is ',
         'the number of people who not walk in the dataset is '),
    )
    for mask, total_message, count_message in reports:
        group = frame[mask]
        print(total_message)
        print(sum(group['caffeineD']))
        print(count_message)
        print(group.shape[0])
# Example no. 5 (separator left over from the source listing, not code)
# 0
def KM_AtificialData():
    """Cluster the artificial data with KMeans and save PCA scatter plots.

    For every entry of ``Domain`` the matching feature columns are selected
    and normalised with ``utilise.normArray``. KMeans is then fitted for
    k = 2..6, and for each k a 2-D PCA scatter plot coloured by cluster is
    saved under ``visClustering<domain>Pattern/KMeans_TF_artificial_<k>``.
    The domain, k, inertia and label array are printed after each fit.

    Every cluster gets its own marker colour. Previously only cluster
    indices 0-3 were drawn, so clusters 4 and 5 were missing from the plots
    for k = 5 and k = 6.
    """
    diet_columns = [
        'alcoholD', 'caffeineD', 'compositeP', 'dairyP', 'eggP',
        'fruitP', 'grainP', 'meatP', 'seafood', 'snack', 'starchyP',
        'vegetables'
    ]
    activity_columns = [
        'entertainmentRelax', 'others', 'social', 'sport',
        'transportation1', 'transportation2', 'transportation3',
        'workStudy'
    ]
    # One matplotlib format string per cluster index. The first four match
    # the colours used before; the list covers the largest k tried below.
    marker_styles = ['go', 'ro', 'bo', 'yo', 'co', 'mo']

    df, cols = artificialDataGenerator.artificialData()
    for domain in Domain:
        print(df.columns)
        # Any domain other than 'DietType' falls back to the activity columns.
        columns = diet_columns if domain == 'DietType' else activity_columns
        X = utilise.normArray(df[columns].as_matrix())

        # The 2-D projection depends only on X, not on k, so compute it once
        # per domain instead of once per clustering.
        reduced_data = PCA(n_components=2).fit_transform(X)

        range_n_clusters = [2, 3, 4, 5, 6]
        for n_clusters in range_n_clusters:
            kmeans = KMeans(n_clusters=n_clusters, n_init=3000)
            kmeans.fit(X)
            labels = kmeans.labels_
            inertia = kmeans.inertia_

            plt.figure()
            for k in range(np.max(labels) + 1):
                members = reduced_data[labels == k]
                # Cycle through the styles so every cluster index is drawn.
                style = marker_styles[k % len(marker_styles)]
                # Marker-only format string: one call draws the whole cluster.
                plt.plot(members[:, 0], members[:, 1], style, markersize=5)
            plt.title('K-means clustering (PCA-reduced data)')
            plt.savefig('visClustering' + domain +
                        'Pattern/KMeans_TF_artificial_' + str(n_clusters))

            print('%s %s %s %s' % (domain, n_clusters, inertia, labels))
# Example no. 6 (separator left over from the source listing, not code)
# 0
def genArtificialActDietTypeDataSet():
    """Turn each row of the artificial data into a tuple of its positive column names.

    The columns 'compositeP', 'sleepTime', 'label', 'gender' and 'others' are
    never included. The number of rows is printed before returning.

    Returns:
        A tuple with one inner tuple per row, holding the names of the
        remaining columns whose value in that row is greater than zero.
    """
    skipped = ('compositeP', 'sleepTime', 'label', 'gender', 'others')

    newDF = artificialDataGenerator.artificialData()
    item_columns = [name for name in newDF.columns if name not in skipped]

    dataset = tuple(
        tuple(name for name in item_columns if newDF.ix[row, name] > 0)
        for row in range(newDF.shape[0])
    )
    print(len(dataset))
    return dataset
# Example no. 7 (separator left over from the source listing, not code)
# 0
def genArtificialActDietTypeDataSetForMoreSleep():
    """Build the positive-column tuples for the rows of the artificial data with label 1.

    Rows with ``label == 1`` are kept (presumably the "more sleep" group, going
    by the function name) and re-indexed from zero. The columns 'compositeP',
    'sleepTime', 'label', 'gender' and 'others' are never included. The number
    of rows is printed before returning.

    Returns:
        A tuple with one inner tuple per kept row, holding the names of the
        remaining columns whose value in that row is greater than zero.
    """
    skipped = ('compositeP', 'sleepTime', 'label', 'gender', 'others')

    full_frame = artificialDataGenerator.artificialData()
    selected = full_frame[full_frame['label'] == 1]
    # Re-number the rows so the 0..n-1 lookups below address the kept rows.
    selected.index = range(selected.shape[0])

    item_columns = [name for name in selected.columns if name not in skipped]
    dataset = tuple(
        tuple(name for name in item_columns if selected.ix[row, name] > 0)
        for row in range(selected.shape[0])
    )
    print(len(dataset))
    return dataset
def visdiff():
    """Save one KDE comparison plot per feature: new vs. original vs. surrogate data.

    For every column of ``newFeatureFrame()`` a figure is drawn with the
    density of that column, the density of the matching column in the original
    data and the density of the matching column in the artificial (surrogate)
    data. Each figure is saved as ``distribution/<column>_diff``, using the
    original/surrogate column name.
    """
    # Columns of the new frame that go by a different name in the other two.
    source_names = {
        'walk': 'transportation1',
        'car': 'transportation2',
        'bike': 'transportation3',
        'leisure': 'entertainmentRelax',
    }

    surrogate_frame, _ = artificialDataGenerator.artificialData()
    original_frame, _ = artificialDataGenerator.originalData()
    new_frame = newFeatureFrame()

    for new_name in new_frame.columns:
        plt.figure()
        new_frame[new_name].plot.kde(label='new')

        column = source_names.get(new_name, new_name)
        original_frame[column].plot.kde(label='original')
        surrogate_frame[column].plot.kde(label='surrogate')

        # Place the legend outside the axes, to the right of the plot.
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
        plt.title(column)
        plt.savefig('distribution/' + column + '_diff')
# Cross-validate an L1-penalised logistic regression on a small feature
# subset of `df`, with every feature value capped at 1.
temp_df = df[['alcoholD', 'eggP', 'seafood', 'gender', 'bikeWork', 'walkCar']]
# Alternative feature set tried earlier:
#temp_df = df[['alcoholD','eggP','seafood','gender']]

# Cap values above 1 at 1 in a single vectorised step. This replaces a
# per-cell loop that used chained indexing plus the deprecated `set_value`
# and only worked when the frame's index was 0..n-1.
temp_df = temp_df.clip(upper=1)

dataset = temp_df.as_matrix()
labels = list(df['label'])
clf = LogisticRegression(penalty='l1', C=0.5)
scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
accuracy = scores.mean()
print(accuracy)
# Artificial data test: mean 5-fold cross-validation score of an L1-penalised
# logistic regression on the diet, activity and gender columns.
df = artificialDataGenerator.artificialData()
temp_df = df[
    # diet columns
    ['alcoholD', 'caffeineD', 'dairyP', 'eggP', 'fruitP', 'grainP', 'meatP',
     'seafood', 'snack', 'starchyP', 'vegetables']
    # activity columns
    + ['entertainmentRelax', 'social', 'sport', 'transportation1',
       'transportation2', 'transportation3', 'workStudy']
    + ['gender']
]
dataset = temp_df.as_matrix()
labels = list(df['label'])
clf = LogisticRegression(penalty='l1', C=0.5)
scores = cross_validation.cross_val_score(clf, dataset, labels, cv=5)
accuracy = scores.mean()
print(accuracy)
'''
artificial data test pattern features 
'''
def clusteringKmeansLabelsArtificialDays():
    """Cluster the artificial data per domain and export each cluster's mean profile.

    For every entry of ``Domain`` the matching feature columns are normalised
    with ``utilise.normArray`` and clustered with KMeans (2 clusters for
    'DietType', 3 for anything else). Per domain the function:

    * writes a header row with the column names to the spreadsheet,
    * writes each cluster's mean vector as one row, followed by one row
      (cluster index, domain, column name, mean) for every feature whose mean
      equals one of the cluster's three largest values,
    * plots every cluster's mean vector in one figure, annotated with the
      feature names, and saves it under
      ``visClustering<domain>Pattern/KMeans__TF_ArtificialDays_<k>_groupFreq``.

    The spreadsheet is finally saved as ``tempLabels.xls``.
    """
    diet_columns = ['alcoholD','caffeineD','dairyP','eggP','fruitP','grainP','meatP','seafood','snack','starchyP','vegetables']
    activity_columns = ['entertainmentRelax','social','sport','transportation1','transportation2','transportation3','workStudy']
    # Plot annotations shown instead of the raw transportation column names.
    display_names = {
        'transportation1': 'walk',
        'transportation2': 'car',
        'transportation3': 'bike',
    }

    workbookW = xlwt.Workbook()
    ws = workbookW.add_sheet('sheet1')
    rowW = 0  # next free spreadsheet row, shared across all domains

    df = artificialDataGenerator.artificialData()
    for domain in Domain:
        print(df.columns)
        if domain == 'DietType':
            columns, k_for_domain = diet_columns, 2
        else:
            columns, k_for_domain = activity_columns, 3

        df_temp = df[columns]
        row_labels = df_temp.columns
        X = utilise.normArray(df_temp.as_matrix())
        kmeans = KMeans(n_clusters=k_for_domain, n_init=3000)
        kmeans.fit(X)
        labels = kmeans.labels_

        # Header row: the feature names of this domain.
        for col, label in enumerate(row_labels):
            ws.write(rowW, col, label)
        rowW += 1

        plt.figure()

        n_clusters = np.max(labels) + 1
        positions = range(X.shape[1])

        for k in range(n_clusters):
            group = np.array(list(X[labels == k]))

            meanVec = np.mean(group, axis=0)
            stdVec = np.std(group, axis=0)
            print(stdVec)

            # One spreadsheet row holding the mean vector of this cluster.
            for col, value in enumerate(meanVec):
                ws.write(rowW, col, value)
            rowW += 1

            # Find the three largest values by repeatedly taking the maximum
            # and zeroing every entry equal to it.
            firstMax = np.max(meanVec)
            tempVec = np.copy(meanVec)
            tempVec[tempVec == firstMax] = 0
            secondMax = np.max(tempVec)
            tempVec2 = np.copy(tempVec)
            tempVec2[tempVec2 == secondMax] = 0
            thirdMax = np.max(tempVec2)
            top_values = (firstMax, secondMax, thirdMax)

            plt.plot(positions, meanVec)
            for j in positions:
                feature = row_labels[j]
                if meanVec[j] in top_values:
                    ws.write(rowW, 0, k)
                    ws.write(rowW, 1, domain)
                    ws.write(rowW, 2, feature)
                    ws.write(rowW, 3, meanVec[j])
                    rowW += 1
                    print('%s %s %s %s %s' % (k, domain, n_clusters, meanVec[j], feature))
                plt.text(j, meanVec[j], display_names.get(feature, feature))

        plt.title(domain+'_TF_KMeans_'+str(n_clusters))
        plt.savefig('visClustering'+domain+'Pattern/KMeans__TF_ArtificialDays_'+str(n_clusters)+'_groupFreq')

    workbookW.save('tempLabels.xls')