def __init__(self, X, categorical_features):
    """Set up the encoder for dense output and fit it on ``X`` right away.

    Args:
        X: Data handed straight to ``self.fit``.
        categorical_features: Forwarded unchanged to
            ``OneHotEncoder.__init__`` under the same keyword.

    NOTE(review): the enclosing class is presumably a ``OneHotEncoder``
    subclass -- its header is not visible from here, confirm.
    """
    # Dense output is requested explicitly via ``sparse=False``.
    OneHotEncoder.__init__(
        self,
        categorical_features=categorical_features,
        sparse=False,
    )
    # Fitting happens at construction time, so the instance is usable
    # immediately after it is created.
    self.fit(X)
def preProcessData(trainFeatureMatrix, testFeatureMatrix):
    """One-hot encode selected columns, then drop low-variance features.

    The encoder is fitted on the row-wise concatenation of both matrices and
    then applied to each matrix separately.  A ``VarianceThreshold`` selector
    with default settings is fitted on the encoded training matrix only and
    applied to both matrices, so both results keep the same columns.

    Args:
        trainFeatureMatrix: 2-D array of training samples.
        testFeatureMatrix: 2-D array of test samples with the same number of
            columns as ``trainFeatureMatrix`` (required by the concatenation).

    Returns:
        A ``(trainFeatureMatrix, testFeatureMatrix)`` tuple of dense arrays
        after encoding and feature selection.

    NOTE(review): the hard-coded column indices go up to 51, so the inputs
    presumably need at least 52 columns -- confirm against the caller.
    NOTE(review): the ``categorical_features`` argument only exists in older
    scikit-learn releases -- confirm the pinned version before upgrading.
    """
    # Function-scope import kept from the original implementation.
    from sklearn.preprocessing import OneHotEncoder

    # Columns that get one-hot encoded.
    # NOTE(review): judging by the names, the first list holds nominal
    # attributes and the second holds numeric columns that are treated as
    # categories -- confirm against the dataset description.
    categoricalAttriIndexList = [
        0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 44, 45, 46,
    ]
    cateNumericIndexList = [
        1, 6, 15, 16, 18, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 49, 50, 51,
    ]
    oneHotColumns = categoricalAttriIndexList + cateNumericIndexList

    # The encoder is fitted on train and test rows together.
    # NOTE(review): presumably so that values occurring only in the test set
    # are still known to the encoder -- confirm this is intended.
    combinedMatrix = np.concatenate(
        (trainFeatureMatrix, testFeatureMatrix), axis=0
    )

    # Pass the configuration to the constructor instead of re-invoking
    # ``__init__`` on an already constructed instance.
    enc = OneHotEncoder(categorical_features=oneHotColumns)
    enc.fit(combinedMatrix)
    trainFeatureMatrix = enc.transform(trainFeatureMatrix).toarray()
    testFeatureMatrix = enc.transform(testFeatureMatrix).toarray()
    # Single parenthesised expression: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('%s %s %s' % ('old feature num is ',
                        trainFeatureMatrix.shape[1],
                        testFeatureMatrix.shape[1]))

    # The selector is fitted on the training matrix only; the same column
    # mask is then applied to the test matrix.
    sel = VarianceThreshold()
    sel.fit(trainFeatureMatrix)
    trainFeatureMatrix = sel.transform(trainFeatureMatrix)
    testFeatureMatrix = sel.transform(testFeatureMatrix)
    print('%s %s %s' % ('new feature num is ',
                        trainFeatureMatrix.shape[1],
                        testFeatureMatrix.shape[1]))

    return trainFeatureMatrix, testFeatureMatrix