forked from PravGitHub/Kafka_Streaming-News-Classification
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Build pipeline.py
36 lines (25 loc) · 1.43 KB
/
Build pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.feature import CountVectorizer, StopWordsRemover, RegexTokenizer
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from kafka import KafkaConsumer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#-------------------Building the logistic regression and naive bayes pipelines----------------------------------
# Default locations used when the script is run directly; pass different
# values to main() to build from another stop-word list or save elsewhere.
STOP_WORDS_PATH = "/home/asdf/Documents/stopwords.txt"
LR_PIPELINE_PATH = "models/lr"
NB_PIPELINE_PATH = "models/nb"


def _load_stop_words(path):
    """Return the whitespace-separated words stored in the file at *path*.

    The file is decoded as UTF-8 so the result does not depend on the
    platform's default encoding.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path, "r", encoding="utf-8") as contents:
        return contents.read().split()


def _build_pipelines(stop_words):
    """Return ``(lr_pipeline, nb_pipeline)``, two unfitted pipelines.

    Both pipelines use the same three feature stages and differ only in the
    final classifier (logistic regression vs. multinomial naive Bayes).
    """
    # Split the "text" column on non-word characters.
    regex_tokenizer = RegexTokenizer(
        inputCol="text", outputCol="words", pattern=r"\W"
    )
    stop_words_remover = StopWordsRemover(
        inputCol="words", outputCol="filtered"
    ).setStopWords(stop_words)
    count_vectors = CountVectorizer(
        inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5
    )

    lr = LogisticRegression(maxIter=100, regParam=0.01)
    nb = NaiveBayes(
        labelCol="label",
        featuresCol="features",
        smoothing=1.0,
        modelType="multinomial",
    )

    feature_stages = [regex_tokenizer, stop_words_remover, count_vectors]
    return (
        Pipeline(stages=feature_stages + [lr]),
        Pipeline(stages=feature_stages + [nb]),
    )


def main(
    stop_words_path=STOP_WORDS_PATH,
    lr_path=LR_PIPELINE_PATH,
    nb_path=NB_PIPELINE_PATH,
):
    """Build the two classification pipelines and persist them to disk.

    Args:
        stop_words_path: text file holding whitespace-separated stop words.
        lr_path: output directory for the logistic-regression pipeline.
        nb_path: output directory for the naive-Bayes pipeline.
    """
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")
    # Stands in for the deprecated ``SQLContext(sc)`` call, whose result was
    # never used; presumably it was only there to make sure a session exists.
    SparkSession.builder.getOrCreate()

    lr_pipeline, nb_pipeline = _build_pipelines(
        _load_stop_words(stop_words_path)
    )

    # A plain save() fails when the target path already exists, which made
    # the script impossible to re-run; overwrite() replaces the old copy.
    lr_pipeline.write().overwrite().save(lr_path)
    nb_pipeline.write().overwrite().save(nb_path)


if __name__ == "__main__":
    main()