drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)
data.printSchema()

##################################################
################## Transformers ##################
##################################################

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]  # standard stop words
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag-of-words counts
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

# index the string labels
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

transformers = [regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]
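# Hedged sketch of how these stages could be chained and fit; the
# LogisticRegression settings, the 80/20 split, and the seed are assumptions,
# not part of the original snippet.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20, regParam=0.3)
pipeline = Pipeline(stages=transformers + [lr])

train, test = data.randomSplit([0.8, 0.2], seed=42)
pipelineFit = pipeline.fit(train)
predictions = pipelineFit.transform(test)
predictions.select("Descript", "Category", "probability", "label", "prediction").show(5)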
sqlContext = SQLContext(sc)

df = pd.DataFrame(train_data)
# df = df.transpose()
df.columns = ['tweet_id', 'tweet_label', 'tweet_words']

data_complete = sqlContext.createDataFrame(df)
data = data_complete.select(['tweet_label', 'tweet_words'])
data.show(5)

data.groupBy("tweet_label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="tweet_words", outputCol="words", pattern="\\W")

# stop words
add_stopwords = []
with open("stopwords_twitter.txt", "r") as f:
    for line in f.readlines():
        add_stopwords.append(line.strip())
print(add_stopwords)

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
# countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
hashingTF = HashingTF(inputCol="filtered", outputCol="features")
def __init__(self):
    # Convert Pandas dataframe to PySpark dataframe.
    df = sqlContext.read.format("csv").option("header", "true").load("hotel-reviews.csv")
    # df = sqlContext.createDataFrame(pandas_df)

    # Convert Reviewer_Score into a Sentiment value (1 if score >= 7.0, otherwise 0)
    df = df.withColumn('Reviewer_Score', fn.when(df.Reviewer_Score >= 7.0, 1).otherwise(0))
    df = df.withColumnRenamed('Reviewer_Score', 'Sentiment')

    # Concatenate the negative and positive parts into a single review text
    df_with_text = df.withColumn('Review_Text',
                                 fn.concat(fn.col('Negative_Review'), fn.lit(' '), fn.col('Positive_Review')))

    # Strip the dataframe down to only what is necessary for sentiment analysis
    df_stripped = df_with_text.select('Negative_Review', 'Positive_Review', 'Review_Text', 'Sentiment')

    # Import stopwords to filter out of the reviews
    stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()

    # Configure tokenizer to extract words containing only letters and save them in column "words"
    tokenizer = RegexTokenizer().setGaps(False) \
        .setPattern("\\p{L}+") \
        .setInputCol("Review_Text") \
        .setOutputCol("words")

    # Configure stopwords filter
    sw_filter = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")

    cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2 ** 17) \
        .setInputCol("filtered") \
        .setOutputCol("tf")

    # Create a pipeline with tokenizer, stopwords filter and CountVectorizer
    cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(df_stripped)

    # Configure TF-IDF
    idf = IDF() \
        .setInputCol('tf') \
        .setOutputCol('tfidf')
    idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(df_stripped)

    # Split data into training, validation and testing data (60%, 30%, 10%)
    training_df, validation_df, testing_df = df_stripped.randomSplit([0.6, 0.3, 0.1], seed=0)

    # Configure LogisticRegression for analysing the reviews
    lr = LogisticRegression() \
        .setLabelCol('Sentiment') \
        .setFeaturesCol('tfidf') \
        .setRegParam(0.0) \
        .setMaxIter(100) \
        .setElasticNetParam(0.)

    # Create a new pipeline for the LogisticRegression and train the model
    self.model = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

    # Calculate the accuracy of our model using the validation dataframe
    self.model.transform(validation_df) \
        .select(fn.expr('float(prediction = Sentiment)').alias('correct')) \
        .select(fn.avg('correct')).show()


spark = SparkSession \
    .builder \
    .appName("user_input_analysis") \
    .getOrCreate()
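# Hedged usage sketch: the enclosing class name is not shown in this snippet, so
# "ReviewSentimentModel" below is an assumed name; the sketch only relies on the
# fitted pipeline stored in self.model by __init__ above.
analyzer = ReviewSentimentModel()
sample = spark.createDataFrame(
    [('The room was spotless and the staff were very friendly',)], ['Review_Text'])
analyzer.model.transform(sample) \
    .select('Review_Text', 'probability', 'prediction') \
    .show(truncate=False)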
def main(review_table, business_table, output_folder):
    # Read reviews and business data
    review_df = spark.read.parquet(review_table)
    review_df.createOrReplaceTempView("reviews_table")

    business_df = spark.read.parquet(business_table)
    business_toronto = business_df.filter(business_df.City == "Toronto")
    business_toronto.createOrReplaceTempView("business_table")

    # Collect the reviews for each business
    business_review = spark.sql(
        """
        SELECT BusinessID, collect_set(Review) AS total_review
        FROM reviews_table
        GROUP BY BusinessID
        """
    )

    # Convert the collected reviews into a single string
    merge_review = udf(lambda total_review: (" ").join(total_review))
    business_concat_review = business_review.withColumn(
        "comb_review", merge_review(business_review['total_review'])).drop(business_review['total_review'])
    business_concat_review.createOrReplaceTempView("comb_review_table")

    # Keep reviews for businesses in Toronto
    Reviews_for_business = spark.sql("""
        SELECT c.BusinessID, b.Name AS BusinessName, b.BusinessStars, c.comb_review
        FROM comb_review_table AS c
        INNER JOIN business_table AS b ON c.BusinessID = b.BusinessID
        """)

    # Pipeline to preprocess the text data
    regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol='comb_review', outputCol='token')
    stopWordsRemover = StopWordsRemover(inputCol='token', outputCol='no_stopword')
    countVectorizer = CountVectorizer(inputCol="no_stopword", outputCol="rawcol")
    TDF = IDF(inputCol="rawcol", outputCol="idf_vec")
    text_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, TDF])
    IDF_model = text_pipeline.fit(Reviews_for_business)
    # IDF_model.write().overwrite().save('IDF_model1')

    # Collect the vocabulary from the fitted CountVectorizer model
    vocab = IDF_model.stages[2].vocabulary
    business_review_df = IDF_model.transform(Reviews_for_business)

    # Two business categories based on low and high star ratings
    reviews_low = business_review_df.where(business_review_df.BusinessStars <= 3)
    reviews_high = business_review_df.where(business_review_df.BusinessStars > 3)

    lda = LDA(k=6, seed=123, optimizer='online', featuresCol="idf_vec")
    vocab_word = udf(lambda termIndices: [vocab[idx] for idx in termIndices])

    # Topic modelling on low-rated businesses
    lowtopic_model = lda.fit(reviews_low)
    lowtopic_transform = lowtopic_model.transform(reviews_low)
    print("topic distribution for low rating business")
    lowtopic_transform.select('BusinessID', 'BusinessName', 'topicDistribution').show(4, False)
    # lowtopic_model.write().overwrite().save('lowtopic_model')

    # Topic distribution
    low_dist = lowtopic_transform.withColumn(
        'topic_distribution', lowtopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    low_dist_df = low_dist.select('BusinessID', 'BusinessName', 'topic_distribution')
    low_dist_df.write.csv(output_folder + '/Topic_low_business_topic_dist', header=True)

    # Key topics
    lowreview_topics = lowtopic_model.describeTopics()
    lowreview_topics_concat = lowreview_topics.withColumn("topic_word", vocab_word(lowreview_topics['termIndices']))
    low_df = lowreview_topics_concat.select('topic', 'topic_word')
    print("Topics for low rating business")
    low_df.show(6, False)
    low_df.coalesce(1).write.csv(output_folder + '/Topic_low_rating_topic', header=True)

    # Topic modelling on high-rated businesses
    high_topic_model = lda.fit(reviews_high)
    hightopic_transform = high_topic_model.transform(reviews_high)
    print("topic distribution for high rating business")
    hightopic_transform.select('BusinessID', 'BusinessName', 'topicDistribution').show(4, False)
    # high_topic_model.write().overwrite().save('high_topic_model')

    # Topic distribution
    high_dist = hightopic_transform.withColumn(
        'topic_distribution', hightopic_transform['topicDistribution'].cast('string')).drop('topicDistribution')
    high_dist_df = high_dist.select('BusinessID', 'BusinessName', 'topic_distribution')
    high_dist_df.write.csv(output_folder + '/Topic_high_business_topic_dist', header=True)

    # Key topics
    highreview_topics = high_topic_model.describeTopics()
    highreview_topics_concat = highreview_topics.withColumn("topic_word", vocab_word(highreview_topics['termIndices']))
    high_df = highreview_topics_concat.select('topic', 'topic_word')
    print("\nTopics for high rating business")
    high_df.show(6, False)
    high_df.coalesce(1).write.csv(output_folder + '/Topic_high_rating_topic', header=True)
# createDataFrame!
missing_count = spark.createDataFrame(null_value_count(news_data),
                                      ['Column_with_Null_Value', 'Null_values_count']).show()

title_category = news_data.select('TITLE', 'CATEGORY')
title_category.select('Category').distinct().count()
title_category.groupBy('Category').count().orderBy(col('Count').desc()).show(truncate=False)
title_category.groupBy('TITLE').count().orderBy(col('count').desc()).show(truncate=False)

####
# Top 20 news categories:
# regexp_replace: regular-expression based replacement
title_category = title_category.withColumn('only_str', regexp_replace(col('TITLE'), '\d+', ''))
title_category.select('TITLE', 'only_str').show(truncate=False)

# Top 20 news titles:
regex_tokenizer = RegexTokenizer(inputCol='only_str', outputCol='words', pattern='\\W')
raw_words = regex_tokenizer.transform(title_category)
raw_words.show()

remover = StopWordsRemover(inputCol='words', outputCol='filtered')
word_df = remover.transform(raw_words)
word_df.select('words', 'filtered').show(truncate=False)

indexer = StringIndexer(inputCol='CATEGORY', outputCol='categoryIndex')
feature_data = indexer.fit(word_df).transform(word_df)
feature_data.show()

cv = CountVectorizer(inputCol='filtered', outputCol='features')
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java 12 2 could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    print("What is here")
    # $example off$
def train_model():
    '''
    if dataRdd is not None:
        print("**************** Inside train model with new rdd ****************")
        # Read the previously saved model
        pipeModel_Prev = PipelineModel.load('sentiment.model')

        # regular expression tokenizer
        regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")
        # bag of words count
        countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)
        # convert string labels to indexes
        label_stringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
        nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")
        # convert the prediction index back to the predicted sentiment label
        indexToLabels = IndexToString(inputCol="prediction", outputCol="predictedSentiment",
                                      labels=["boredom", "love", "relief", "fun", "hate", "neutral", "anger",
                                              "happiness", "surprise", "sadness", "worry", "empty"])

        # Build the Spark pipeline
        pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])
        # Fit the pipeline
        pipeModel_Next = pipeline.fit(dataRdd)
        pipeModel_New = PipelineModel(stages=[pipeModel_Prev, pipeModel_Next])
        pipeModel_New.save("sentiment.model")
    '''

    data = sqlContext.read.format('com.databricks.spark.csv') \
        .options(header='true', inferschema='true').load('text_emotion.csv')

    # Drop unused columns and keep only the known sentiment labels
    drop_list = ['tweet_id']
    data = data.select([column for column in data.columns if column not in drop_list]) \
        .where((data['sentiment'] == 'empty') |
               (data['sentiment'] == 'sadness') |
               (data['sentiment'] == 'enthusiasm') |
               (data['sentiment'] == 'worry') |
               (data['sentiment'] == 'surprise') |
               (data['sentiment'] == 'love') |
               (data['sentiment'] == 'hate') |
               (data['sentiment'] == 'anger') |
               (data['sentiment'] == 'neutral') |
               (data['sentiment'] == 'relief') |
               (data['sentiment'] == 'boredom') |
               (data['sentiment'] == 'fun') |
               (data['sentiment'] == 'happiness')) \
        .na.drop(thresh=3)

    data.show(5)
    data.groupBy("sentiment") \
        .count() \
        .orderBy(col("count").desc()) \
        .show()

    # set seed for reproducibility
    (trainingData, testData) = data.randomSplit([0.8, 0.2], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="content", outputCol="words", pattern="\\W")
    # bag of words count
    countVectors = CountVectorizer(inputCol="words", outputCol="features", vocabSize=10000, minDF=5)
    # convert string labels to indexes
    label_stringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
    nb = NaiveBayes(featuresCol="features", labelCol="label", smoothing=1.0, modelType="multinomial")
    # convert the prediction index back to the predicted sentiment label
    indexToLabels = IndexToString(inputCol="prediction", outputCol="predictedSentiment",
                                  labels=["boredom", "love", "relief", "fun", "hate", "neutral", "anger",
                                          "happiness", "surprise", "sadness", "worry", "empty"])

    # Build the Spark pipeline
    pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx, nb, indexToLabels])
    # Fit the pipeline
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)

    # Show the top predictions for each of the 12 predicted classes
    for predicted_class in range(12):
        predictions.filter(predictions['prediction'] == predicted_class) \
            .select("content", "sentiment", "predictedSentiment", "probability", "label", "prediction") \
            .orderBy("probability", ascending=False) \
            .show(n=10, truncate=30)

    # Retrieve the F1 score
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
    print("F1: %g" % (evaluator.evaluate(predictions)))

    pipelineFit.save("sentiment.model")
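# Hedged sketch (not part of the original script): the saved pipeline can be
# reloaded and applied to new tweets. The fitted StringIndexer stage still
# expects a "sentiment" column at transform time, so a placeholder value is
# supplied; the sample texts are illustrative only.
from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("sentiment.model")
new_tweets = sqlContext.createDataFrame(
    [("so happy the weekend is finally here", "neutral"),
     ("stuck in traffic again, this is awful", "neutral")],
    ["content", "sentiment"])
reloaded.transform(new_tweets).select("content", "predictedSentiment").show(truncate=False)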
p_train = pd.DataFrame({
    'data': train.data,
    'target': train.target,
    'filenames': train.filenames
})
p_test = pd.DataFrame({
    'data': test.data,
    'target': test.target,
    'filenames': test.filenames
})

s_train = spark.createDataFrame(p_train)
s_test = spark.createDataFrame(p_test)

tokenizer = RegexTokenizer(inputCol='data', outputCol='words', pattern='\\W')
termFreq = HashingTF(inputCol='words', outputCol='freq')

pipeline = Pipeline(stages=[tokenizer, termFreq])
model = pipeline.fit(s_train)
data = model.transform(s_train)


def v_max(vector):
    return max(vector.toArray())


udf_v_max = udf(v_max, FloatType())
slen = udf(lambda s: s[0], IntegerType())

data.select(data.freq).rdd.map(lambda x: x.freq.toArray().argmax()).first()
data.first()
wcss = model.computeCost(data_scaled)
centers = model.clusterCenters()

result = model.transform(data_scaled)
result.show()
result.groupBy('prediction').count().show()

#####################################################################
############# Natural Language Preprocessing

from pyspark.sql.functions import regexp_replace
data_cleaned = data.withColumn('text', regexp_replace(data.text, '[0-9]', ''))

from pyspark.ml.feature import Tokenizer, RegexTokenizer
tokenizer = Tokenizer(inputCol='text', outputCol='token')
tokenizer2 = RegexTokenizer(inputCol='text', outputCol='token', pattern='#\w+')

data_token = tokenizer.transform(data)
data_token.show()

# n_words
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
count_token = udf(lambda token: len(token), IntegerType())
data_token.withColumn('n_token', count_token(col('token'))).show()

# stopwords
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover(inputCol='token', outputCol='filtered', stopWords=['aaa'])
data_2 = remover.transform(data_token)
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .getOrCreate()

    # Prepare data
    data = spark.read.csv("hdfs://devenv/user/spark/spark_mllib_101/spam_detection/data/sms_messages_with_labels.csv",
                          inferSchema=True, header=True)

    # Preprocessing and feature engineering
    feature_prep = data.select(lower(data["message"]).alias("message"),
                               length(data["message"]).alias("length"),
                               "label")
    feature_prep = RegexTokenizer(inputCol="message", outputCol="words", pattern="\\W+").transform(feature_prep)
    feature_prep = StopWordsRemover(inputCol='words', outputCol='stop_words_removed').transform(feature_prep)
    feature_prep = HashingTF(inputCol="stop_words_removed", outputCol="hashing_tf", numFeatures=4000).transform(feature_prep)
    feature_prep = IDF(inputCol="hashing_tf", outputCol="tf_idf").fit(feature_prep).transform(feature_prep)
    feature_prep = StringIndexer(inputCol='label', outputCol='label_indexed').fit(feature_prep).transform(feature_prep)
    feature_prep = VectorAssembler(inputCols=["tf_idf", "length"], outputCol="features").transform(feature_prep)

    final_data = feature_prep.select("label_indexed", "features")
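    # Hedged sketch of one way to continue from final_data; the NaiveBayes
    # classifier, the 70/30 split, and the seed are assumptions, not taken from
    # the original script.
    from pyspark.ml.classification import NaiveBayes
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)
    nb_model = NaiveBayes(labelCol="label_indexed", featuresCol="features").fit(train_data)
    test_results = nb_model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="label_indexed",
                                                  predictionCol="prediction",
                                                  metricName="f1")
    print("F1 on test data: %g" % evaluator.evaluate(test_results))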
df.groupBy(df.term).agg(F.avg(df.fracNumPmts)).show()
# +----+------------------+
# |term|  avg(fracNumPmts)|
# +----+------------------+
# |  60|0.5334839555374444|   # repaid very early
# |  36|0.7324283072750936|
# +----+------------------+

# Start with text processing (most likely it has no significant impact)
df = df.withColumn('desc',
                   F.regexp_replace('desc',
                                    '(Borrower added on [0-9][0-9]/[0-9][0-9]/[0-9][0-9] >)|<br>|<br/>',
                                    '').alias('desc'))
# Take a look to verify
# df.select('desc').show(3, truncate=False)

# Split the doc strings with a tokenizer
regexTokenizer = RegexTokenizer(inputCol="desc", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)

# 10-dimensional vector space
word2Vec = Word2Vec(vectorSize=10, minCount=0, inputCol="words", outputCol="result")
# Fit to find the word embeddings
modelW2V = word2Vec.fit(df)
# Use the embeddings to transform, with the vector for "words" in the column "result"
df = modelW2V.transform(df)
# Rows without any comments (NULL -> marked 'none') will share the same vector
# df.select('desc', 'result').show(10, truncate=True)  # set to False for a large vector space

# Cluster the result
kmeans = KMeans(k=3, seed=1, featuresCol="result", predictionCol="pred_KM")
modelKM = kmeans.fit(df)
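# Hedged sketch of inspecting the clustering result; the column names follow
# the code above, but this step is not part of the original snippet.
df = modelKM.transform(df)            # adds the 'pred_KM' cluster assignment
df.groupBy('pred_KM').count().show()  # cluster sizes
for center in modelKM.clusterCenters():
    print(center)                     # one 10-dimensional centroid per cluster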
def main(topic):
    # 1. Load data, combine keywords and tweet_urls by news_url, add id
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option('subscribe', topic) \
        .option('failOnDataLoss', 'false') \
        .option('auto.offset.reset', 'earliest') \
        .load()
    values = messages.select(messages['value'].cast('string'))
    words = values.select(
        functions.explode(functions.split(values.value, ';')).alias("words"))
    data = words.withColumn('text', functions.split('words', ',')).select('text')
    data = data.withColumn('news_id', data['text'][0])
    data = data.withColumn('news_url', data['text'][1])
    print('finish load data')

    # 2. Scrape the news text and tweet comments
    data = data.withColumn('news_info', udf_get_news_info(data['news_url']))
    data = data.withColumn('news_title', data['news_info'][0])
    data = data.withColumn('news_text', data['news_info'][1])
    data = data.withColumn('news_image', data['news_info'][2])
    data = data.where(data['news_title'].isNotNull() & (functions.length(data['news_title']) > 0))
    data = data.where(data['news_text'].isNotNull() & (functions.length(data['news_text']) > 0))
    # data = data.where(data['tweets_comment'].isNotNull() & (functions.length(data['tweets_comment']) > 0))  # filter reviews with no text
    print('finish scrape')

    # 3. ML pipeline: tokenization (with regular expressions) and stop-word removal
    data = data.withColumn('sentiment_scores', udf_sentiment_score(data['news_text']))
    news_regex_tokenizer = RegexTokenizer(inputCol='news_text', outputCol='news_words', pattern='[^A-Za-z]+')
    news_stopwords_remover = StopWordsRemover(
        inputCol='news_words', outputCol='news_tokens',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    # count_vectorizer = CountVectorizer(inputCol='filtered_words', outputCol='features')
    nlp_pipeline = Pipeline(
        stages=[news_regex_tokenizer, news_stopwords_remover])
    model = nlp_pipeline.fit(data)
    nlp_data = model.transform(data).select('news_id', 'news_title', 'news_text',
                                            'news_image', 'news_tokens', 'sentiment_scores')

    # 4. Select features
    nlp_data = nlp_data.withColumn('news_tokens', udf_morphy(nlp_data['news_tokens']))
    # nlp_data = nlp_data.withColumn('tweets_tokens', udf_morphy(nlp_data['tweets_tokens']))
    # nlp_data = nlp_data.select(nlp_data['business_id'], review['stars'], udf_morphy(review['tokens']).alias('tokens'))
    nlp_data = nlp_data.where(functions.size(nlp_data['news_tokens']) > 0)
    # nlp_data = nlp_data.where(functions.size(nlp_data['tweets_tokens']) > 0)
    # nlp_data_score = nlp_data_score.withColumn('tweets_tokens', functions.split('tweets_tokens', '\s+'))
    nlp_data = nlp_data.withColumn('news_tokens', functions.concat_ws(' ', 'news_tokens'))
    print('finish scores')

    # 5. Save
    nlp_data = nlp_data.withColumn(
        'dl_value',
        functions.to_json(
            functions.struct([nlp_data[x] for x in nlp_data.columns])))
    stream = nlp_data.select(nlp_data.news_id.alias("key"), nlp_data.dl_value.alias("value")) \
        .writeStream \
        .format('kafka') \
        .outputMode('update') \
        .option('kafka.bootstrap.servers', 'localhost:9092') \
        .option("topic", "mlnews-2") \
        .option("checkpointLocation", "../check") \
        .start()
    # stream = nlp_data.writeStream.format('console').outputMode('update').start()

    stream.awaitTermination()
"id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1) train_df.createOrReplaceTempView("train_df") train_df.show(5) test_df = test_df.withColumn( "id", row_number().over(Window.orderBy(monotonically_increasing_id())) - 1) test_df.createOrReplaceTempView("test_df") test_df.show(5) ######################################################################################################## # Build pipeline and run indexer = StringIndexer(inputCol="category", outputCol="label") tokenizer = RegexTokenizer(pattern=u'\W+', inputCol="text", outputCol="words", toLowercase=False) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures") idf = IDF(inputCol="rawFeatures", outputCol="features") lr = LogisticRegression(maxIter=20, regParam=0.001) # Builing model pipeline pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr]) # Train model on training set model = pipeline.fit( train_df ) #if you give new names to your indexed datasets, make sure to make adjustments here # Model prediction on test set pred = model.transform(test_df) # ...and here
#
df3 = df2.filter(df2.message.contains('\"Url\":\"https://isl-ca.dazn.com/misl/v2/Playback')) \
    .filter(df2.message.contains('&Format=MPEG-DASH&')) \
    .filter(df2.message.contains('\"User-Agent\":\"Mozilla/5.0,(Macintosh; Intel Mac OS X 10_12_6),AppleWebKit/605.1.15,(KHTML, like Gecko),Version/11.1.2,Safari/605.1.15\"},')) \
    .filter(df2.message.contains(',\"Response\":{\"StatusCode\":200,\"ReasonPhrase\":\"OK\",'))
df3.printSchema()

df4 = df3.withColumn(
    "messagecut",
    expr(
        "substring(message, locate('|Livesport.WebApi.Controllers.Playback.PlaybackV2Controller|', message) + 60, length(message) - 1)"
    ))

#
# val regexTokenizer = new RegexTokenizer().setInputCol("messagecut").setOutputCol("words").setPattern("\\w+|").setGaps(false)
regexTokenizer = RegexTokenizer(minTokenLength=1, gaps=False, pattern='\\w+|',
                                inputCol="messagecut", outputCol="words", toLowercase=True)
#
tokenized = regexTokenizer.transform(df4)
tokenized.printSchema()
tokenized.coalesce(1).write.json(output_file1)

#
df5 = sqlContext.read.json(input_file2).filter("message IS NOT NULL")
#
ngram = NGram(n=90, inputCol="words", outputCol="ngrams")
#
ngramDataFrame = ngram.transform(df5)
ngramDataFrame.select("ngrams").coalesce(1).write.json(output_file2)
#
])

data_df = spark.read.csv(file_path, header=True, schema=schema, mode="DROPMALFORMED")

splits = data_df.randomSplit([0.8, 0.2], 4)
training = splits[0]
test = splits[1]

#-------------------------------------------------------------------------------------------------------------------

tokenizer_svm = RegexTokenizer(inputCol="tweet", outputCol="words", pattern="\\s+")
hashing_tf_svm = HashingTF(inputCol="words", outputCol="tf")
idf_svm = IDF(inputCol="tf", outputCol="features")
svm = LinearSVC()
ovr = OneVsRest(classifier=svm)

pipeline_svm = Pipeline(
    stages=[tokenizer_svm, hashing_tf_svm, idf_svm, ovr])

model_svm = pipeline_svm.fit(training)
result_svm = model_svm.transform(test)
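# Hedged sketch of evaluating the one-vs-rest SVM above; it assumes the schema
# read earlier provides a numeric "label" column (the OneVsRest default), which
# is not shown in this snippet.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator_svm = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
print("SVM accuracy on the test split: %g" % evaluator_svm.evaluate(result_svm))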