FeatureHashing.py
import time
import hashlib
from collections import defaultdict
from math import exp, log
from pyspark import SparkContext
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
# Read the raw click-through data into a Spark RDD of CSV lines.
def loadData():
    sc = SparkContext(appName="FeatureHashing")
    rawData = (sc.textFile('/gpfs/courses/cse603/students/CTR/train_80M')
               # Alternative inputs used during development:
               # sc.textFile('/user/meethilv/PDP/sample_train')
               # sc.textFile('/scratch/CTR/train1')
               # .map(lambda x: x.replace(',', '==='))
               )
    return rawData
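# Usage sketch (the row contents below are placeholders, not real records):
#   raw_Data = loadData()
#   raw_Data.take(1)  ->  ['<id>,<click>,<feature_0>,<feature_1>,...']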
'''Now reduce dimensionality with hashing. The hash function takes (index, value)
tuples and maps each one to a number x less than the total number of buckets,
which is roughly 1/1000 of the total number of feature categories (about 33M).
Index x of the sparse feature vector is then updated.'''
def hashFunction(numBuckets, rawFeats, printMapping=False):
    mapping = {}
    for ind, category in rawFeats:
        featureString = category + str(ind)
        # md5 gives a stable hash across processes; mod numBuckets picks the bucket.
        mapping[featureString] = int(int(hashlib.md5(featureString).hexdigest(), 16) % numBuckets)
    if printMapping:
        print mapping
    # Features that collide accumulate in the same bucket.
    sparseFeatures = defaultdict(float)
    for bucket in mapping.values():
        sparseFeatures[bucket] += 1.0
    return dict(sparseFeatures)
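# Illustrative example (inputs invented for this sketch, not from the training data):
# with numBuckets = 4, hashFunction(4, [(0, 'mouse'), (1, 'black')]) builds the
# strings 'mouse0' and 'black1', hashes each through md5 mod 4, and returns a dict
# such as {2: 1.0, 3: 1.0}; if both strings land in the same bucket, the counts
# add up instead, e.g. {3: 2.0}.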
# Take all features of a row and build a list of (index, feature) tuples.
# Column 1 holds the click label (column 0 is skipped), so features start at column 2.
def parsePoint(row):
    features = row.split(',')[2:]
    return list(enumerate(features))
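# Example (hypothetical row in the expected <id>,<label>,<features...> layout):
#   parsePoint('1000009418151094273,0,1fbe01fe,f3845767,28905ebd')
#   -> [(0, '1fbe01fe'), (1, 'f3845767'), (2, '28905ebd')]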
# Pass the (index, feature) tuples through the hash function to get the hashed
# features, then pair the label (click / no-click) with a SparseVector of them.
def parseHashPoint(point, numBuckets):
    features = parsePoint(point)
    # Debug output (runs on the executors, so it only appears in worker logs):
    # print "len(features) = " + str(len(features))
    points_list = point.split(',')
    label = points_list[1]
    hashedFeatures = hashFunction(numBuckets, features, printMapping=False)
    return LabeledPoint(label, SparseVector(numBuckets, hashedFeatures))
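# Sketch of the end-to-end parse (hypothetical row and bucket indices):
#   parseHashPoint('id123,1,1fbe01fe,f3845767', 32768)
#   -> LabeledPoint(1.0, SparseVector(32768, {bucket_a: 1.0, bucket_b: 1.0}))
# where bucket_a and bucket_b are whatever md5 assigns to '1fbe01fe0' and 'f38457671'.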
def computeLogLoss(p, y):
    # Tiny offset (10e-12 == 1e-11) to clamp p away from 0 and 1,
    # so the log below is always finite.
    epsilon = 10e-12
    if p == 0:
        p += epsilon
    elif p == 1:
        p -= epsilon
    if y == 1:
        return -log(p)
    elif y == 0:
        return -log(1 - p)
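# A quick worked example (not in the original script) of the log-loss formula
# logloss(p, y) = -y*log(p) - (1-y)*log(1-p) that computeLogLoss implements:
#   computeLogLoss(0.5, 1) -> -log(0.5) ~= 0.693
#   computeLogLoss(0.9, 1) -> -log(0.9) ~= 0.105
#   computeLogLoss(0.9, 0) -> -log(0.1) ~= 2.303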
def splitData(rawData):
    weights = [0.8, 0.2]
    seed = 234
    rawTrainData, rawValidationData = rawData.randomSplit(weights, seed)
    # Caching would speed up reuse of the splits; disabled to check behaviour without it.
    # rawTrainData.cache()
    # rawValidationData.cache()
    # print rawTrainData.count()
    # print rawValidationData.count()
    return rawTrainData, rawValidationData
def getLogisticRegressionModel(Train_Data):
    numIters = 10
    stepSize = 10.
    regParam = 1e-6
    regType = 'l2'
    includeIntercept = True
    return LogisticRegressionWithSGD.train(data=Train_Data,
                                           iterations=numIters,
                                           miniBatchFraction=0.1,
                                           step=stepSize,
                                           regParam=regParam,
                                           regType=regType,
                                           intercept=includeIntercept)
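# Usage sketch (hyperparameters as set above):
#   model = getLogisticRegressionModel(hashTrainData)
# model.weights is then a vector whose length equals the feature dimension
# (numBucketsCTR below), and model.intercept is a single float.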
def getPrediction(x, w, intercept):
    rawPrediction = x.dot(w) + intercept
    # Bound the raw margin so the sigmoid below stays numerically well-behaved.
    rawPrediction = min(rawPrediction, 20)
    rawPrediction = max(rawPrediction, -20)
    return 1.0 / (1.0 + exp(-rawPrediction))
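# Why the clamp matters: exp(20) ~= 4.85e8, so without it the sigmoid could
# saturate to an exact 0 or 1. With the bounds, predictions stay inside roughly
# [2.1e-9, 1 - 2.1e-9], which computeLogLoss can handle without blowing up.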
# Mean log loss of the model's predictions over a data set.
def evaluateResults(model, data):
    return (data.map(lambda x: computeLogLoss(getPrediction(x.features, model.weights, model.intercept), x.label))
                .sum() / data.count())
if __name__ == "__main__":
    start = time.time()
    raw_Data = loadData()
    end = time.time()
    print "Time for loadData = " + str(end - start)

    start = time.time()
    raw_Train_Data, raw_Validation_Data = splitData(raw_Data)
    end = time.time()
    print "Time for splitData = " + str(end - start)

    print "############## feature hashing ##############"
    start = time.time()
    # Number of features after one-hot encoding (OHE) = 233,286
    # 2 ** 15 = 32,768
    # 2 ** 17 = 131,072
    # 2 ** 18 = 262,144
    numBucketsCTR = 2 ** 15
    hashTrainData = raw_Train_Data.map(lambda point: parseHashPoint(point, numBucketsCTR))
    hashTrainData.cache()
    end = time.time()
    print "Time for feature hashing featureExtraction = " + str(end - start)

    start = time.time()
    TrainDataLRModel = getLogisticRegressionModel(hashTrainData)
    end = time.time()
    print "Time for getLogisticRegressionModel = " + str(end - start)

    start = time.time()
    trainingPredictions = hashTrainData.map(lambda x: getPrediction(x.features, TrainDataLRModel.weights, TrainDataLRModel.intercept))
    # take(5) is the action that triggers the (lazy) map, so time it too.
    print trainingPredictions.take(5)
    end = time.time()
    print "Time for trainingPredictions = " + str(end - start)

    # Baseline model: always predict the training-set click-through rate.
    classOneFracTrain = hashTrainData.map(lambda x: x.label).reduce(lambda a, b: a + b) / hashTrainData.count()
    print classOneFracTrain
    logLossTrBase = hashTrainData.map(lambda x: computeLogLoss(classOneFracTrain, x.label)).sum() / hashTrainData.count()
    print 'Baseline Train Logloss = {0:.3f}\n'.format(logLossTrBase)

    start = time.time()
    logLossTrLR0 = evaluateResults(TrainDataLRModel, hashTrainData)
    print ('Feature hashing Features Train Logloss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'.format(logLossTrBase, logLossTrLR0))
    end = time.time()
    print "Time for evaluateResults = " + str(end - start)