-
Notifications
You must be signed in to change notification settings - Fork 0
/
runValidation.py
41 lines (40 loc) · 1.72 KB
/
runValidation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#encoding:utf-8
from pyspark import SparkContext, SparkConf
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import time
t1 = time.ctime()  # wall-clock start timestamp (human-readable string) for the runtime report at the end
sc = SparkContext(appName="DianShang")
table1 = sc.textFile("/user/team322/junli_testFeature/*")  # test-set feature files on HDFS, one CSV-ish record per line
def f1(line):
    """Return the first (user ID) field of one raw feature record.

    The records were written by Spark as stringified tuples, so the
    surrounding parentheses are stripped and missing values rendered
    as 'None' are normalised to '0' before splitting on commas.
    """
    cleaned = str(line).replace('(', '').replace(')', '').replace('None', '0')
    fields = cleaned.split(',')
    return fields[0]
user = table1.map(f1).collect() #collect the user IDs of the test/validation data into a driver-side list
result6 = sc.textFile("/user/team322/junli_trainFeature/*")  # training-set feature files on HDFS
# Load and parse the data
def parsePoint(line):
    """Parse one raw training record into a LabeledPoint.

    The first two columns are skipped; of the remainder, the first
    value is the label and the rest are the features. Parentheses
    from stringified tuples are stripped and 'None' becomes 0.
    """
    cleaned = str(line).replace('(', '').replace(')', '').replace('None', '0')
    numeric = [float(tok) for tok in cleaned.split(',')[2:]]
    label, features = numeric[0], numeric[1:]
    return LabeledPoint(label, features)
parsedData = result6.map(parsePoint)  # training RDD of LabeledPoint(label, features)
# Build the model
model = LogisticRegressionWithSGD.train(parsedData)  # default SGD iterations/step size
# NOTE(review): same HDFS path as table1 above — table1 could be reused instead of re-reading
result7 = sc.textFile("/user/team322/junli_testFeature/*")
def testParsePoint(line):
    """Parse one raw test record into a LabeledPoint.

    Unlike parsePoint, only the first column (the user ID) is skipped:
    the second column supplies the label slot and everything after it
    the features. 'None' values are normalised to 0.
    """
    cleaned = str(line).replace('(', '').replace(')', '').replace('None', '0')
    numeric = [float(tok) for tok in cleaned.split(',')[1:]]
    label, features = numeric[0], numeric[1:]
    return LabeledPoint(label, features)
parsedData2 = result7.map(testParsePoint)
preds = parsedData2.map(lambda p: model.predict(p.features)) #use the model to predict parsedData2
preds = preds.collect() #translate the result of predict into list
userID = []
for i in xrange(len(preds)): #select users whose predict is 1
if preds[i] == 1:
userID.append(user[i])
sc.parallelize(userID).saveAsTextFile('/user/team322/solution_v') #create a parallelized collection and save it
t2 = time.ctime()
print 'Starting:\t%s\nEnding:\t%s'%(t1,t2)