/
reglog_nocv_simple.py
120 lines (88 loc) · 3.62 KB
/
reglog_nocv_simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: utf-8
#import packages
from pyspark import SparkContext
import loadFiles as lf
import numpy as np
import nltk
from functools import partial
import loadFilesPartial as lfp
import transformers as tr
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.sql import SQLContext
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
#create Sparkcontext
# Driver entry point: every RDD / DataFrame below hangs off this context.
sc = SparkContext(appName="Simple App")
def Predict(name_text, dictionary, model):
    """Classify one (name, text) pair with an mllib-style model.

    The text is whitespace-split into tokens and encoded as a binary
    bag-of-words SparseVector over `dictionary` (word -> feature index);
    tokens absent from the dictionary are simply dropped.

    Returns (name, predicted_class).
    """
    active = {
        dictionary[tok]: 1
        for tok in name_text[1].strip().split(' ')
        if tok in dictionary
    }
    return (name_text[0], model.predict(SparseVector(len(dictionary), active)))
# Load the labelled training corpus: list of raw document texts + label array.
data,Y=lf.loadLabeled("./data/train")
print len(data)
# Pair each document with its label as a plain Python scalar (y.item()
# unwraps a numpy scalar) and distribute the pairs as an RDD.
labeledData = zip(data,[y.item() for y in Y])
labeledRdd = sc.parallelize(labeledData)
# NOTE(review): duplicate import — SQLContext was already imported above.
from pyspark.sql import SQLContext
def preProcess(doc):
    """Normalise one raw review: collapse HTML double line breaks into a
    single space, then lower-case the whole string."""
    return doc.replace("<br /><br />"," ").lower()
# Clean each review text, keeping its label untouched.
rdd = labeledRdd.map(lambda doc : (preProcess(doc[0]),doc[1]))
sqlContext = SQLContext(sc)
# Two-column DataFrame: raw review text + numeric label.
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
# Random 80/20 train/test split (no fixed seed, so runs are not reproducible).
dfTrain, dfTest = df.randomSplit([0.8,0.2])
from pyspark.ml.feature import Tokenizer
# Whitespace-tokenize the review text into a new 'words' column.
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)
import itertools
# Build the vocabulary from the TOKENIZED column: r.words is the list of
# tokens per review.  BUG FIX: the original mapped r.review (the raw
# strings), so itertools.chain(*lists) iterated *characters* and the
# vocabulary contained single characters — vectorize() then matched whole
# tokens against it, so almost no features ever fired.
lists = dfTrainTok.map(lambda r: r.words).collect()
dictWords = set(itertools.chain(*lists))
# word -> feature index, broadcast so every executor shares one copy.
dictionaryWords = {}
for i, word in enumerate(dictWords):
    dictionaryWords[word] = i
dict_broad = sc.broadcast(dictionaryWords)
from pyspark.mllib.linalg import SparseVector
def vectorize(row, dico):
    """Encode one tokenized row as (label, binary SparseVector).

    Every token of row.words found in the vocabulary `dico`
    (word -> index) switches its feature index on; unknown tokens
    are ignored.
    """
    hits = {dico[token]: 1 for token in row.words if token in dico}
    return (row.label, SparseVector(len(dico), hits))
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType
# DataFrame schema for the (label, sparse feature vector) rows.
schema = StructType([StructField('label',DoubleType(),True),StructField('Vectors',VectorUDT(),True)])
# Vectorize every tokenized training row against the broadcast vocabulary.
features=dfTrainTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
print "Features created"
# NOTE(review): duplicate import — StringIndexer was already imported above.
from pyspark.ml.feature import StringIndexer
# Re-encode the raw labels as 0-based indexed targets for the classifier.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print "labels indexed"
lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())
# NOTE(review): duplicate import — already imported at the top of the file.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# NOTE(review): metricName='precision' is the Spark 1.x spelling; newer
# Spark versions expect e.g. 'accuracy' — confirm against the installed version.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# Fit logistic regression on the training features (no cross-validation).
lr_model = lr.fit(featIndexed)
# Apply the identical tokenize -> vectorize -> label-index chain to the
# held-out split, reusing the fitted indexer and the same vocabulary.
dfTestTok = tokenizer.transform(dfTest)
featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
# Predict on the test split and print the evaluator's score.
df_test_pred = lr_model.transform(testIndexed)
res=evaluator.evaluate(df_test_pred)
print res
#test,names=lf.loadUknown('./data/test')
#name_text=zip(names,test)
##for each doc :(name,text):
##apply the model on the vector representation of the text
##return the name and the class
#predictions=sc.parallelize(name_text).map(partial(Predict,dictionary=dict_broad.value,model=mb.value)).collect()
#output=file('./classifications.txt','w')
#for x in predictions:
# output.write('%s\t%d\n'%x)
#output.close()