from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer


def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole fitted pipeline is saved to a folder

    return model, words_prediction
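# Hedged usage sketch (not part of the original snippet): fit_kmeans only needs a
# DataFrame with a "title" string column. The SparkSession setup and the toy rows
# below are illustrative assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
products_df = spark.createDataFrame(
    [("red running shoes",), ("blue trail running shoes",), ("stainless steel kettle",)],
    ["title"],
)

model, words_prediction = fit_kmeans(spark, products_df)
# The prediction column is named "6_kmeans" because the step counter ends at 6.
words_prediction.select("title", "6_kmeans").show(truncate=False)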
outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) ## Create H2OAutoML model automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False, seed=1, maxRuntimeSecs=300, # 5 minutes predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()]) ## Create the pipeline by defining all the stages pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner]) ## Train the pipeline model data = load() model = pipeline.fit(data) ## ## Make predictions on unlabeled data ## Spam detector ## def isSpam(smsText, model, hamThreshold = 0.5): smsTextDF = spark.createDataFrame([(smsText,)], ["text"]) # create one element tuple prediction = model.transform(smsTextDF)
## Remove stop words (the opening arguments of this call are cut off in the original
## snippet; the inputCol/outputCol shown here are assumptions)
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, dl, colPruner])

## Train the pipeline model
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words",
                                 pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures",
                                   minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# fitting the model
model = pipeline.fit(train)

# performing the prediction
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ["review", "label"]) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english")) ) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf") idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed") dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) # **************************************************************** # *********************CROSS VALIDATION: 80%/20%****************** # *******************Model: DecisionTreeClassifier***************** # ***************************************************************** evaluator = MulticlassClassificationEvaluator( predictionCol="prediction", labelCol="target_indexed", metricName="precision" ) grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(),
                        labelCol=string_indexer.getOutputCol(),
                        maxIter=30,
                        regParam=0.01)

pipeline = Pipeline(
    stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# grid = (ParamGridBuilder()
#         .baseOn([evaluator.metricName, 'precision'])
#         .addGrid(dt.maxDepth, [10, 20])
#         .build())
spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)

textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(),
                              stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])
model = pipe1.fit(df)
output = model.transform(df)


def topicsTerms(vocab, termindices, leng=None):
    if not leng:
        return [vocab[t] for t in termindices]
    return [vocab[t] for t in termindices][:leng]


def topicsTerm_udf(vocab, leng=None):
    return udf(lambda x: topicsTerms(vocab, x, leng))


topweights = udf(lambda x: x[:numOfKeywords])
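# Hedged usage sketch (not part of the original snippet): map the LDA term indices back
# to words with the topicsTerms helper above. The stage indices (3 for the fitted
# CountVectorizerModel, 5 for the LDAModel) follow the order of pipe1 and are assumptions
# about this particular pipeline; an explicitly typed udf is used so the result stays an
# array of strings.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

lda_model = model.stages[5]            # fitted LDAModel (last stage of pipe1)
vocab = model.stages[3].vocabulary     # vocabulary from the fitted CountVectorizerModel

terms_udf = udf(lambda idxs: topicsTerms(vocab, idxs, numOfKeywords),
                ArrayType(StringType()))
topics = lda_model.describeTopics(maxTermsPerTopic=numOfKeywords)
topics.withColumn('topicWords', terms_udf('termIndices')) \
      .select('topic', 'topicWords').show(truncate=False)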
ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(), outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)
topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x: [vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x], ArrayType(StringType()))
topics = topics.withColumn('topic_words', get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)

'''
schema_sdf = StructType([StructField('paragraph', StringType(), True)])
text = spark.read.options(header='false').schema(schema_sdf).csv(
    '/user/devel/2020210990Renqiang/data/speech.txt', sep='\n')
print "Text is cleaned" sqlContext = SQLContext(sc) #************************************************************** #*******Partie générique à mdifier dans les script************* dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label']) #************************************************************** tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words') hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt]) #************************************************************** #**************Partie de code générique************************ #*************à copier après le pipeline*********************** #************************************************************** model = pipeline.fit(dfTrain) print "The model is fitted" #import test set
def cleanLower(doc):
    return doc.replace("<br /><br />", " ").lower()


rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

model = pipeline.fit(dfTrain)
print("The model is fitted")

# import test set
test, names = lf.loadUknown('./data/test')
text_name = zip(test, names)
## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
                                l2=0.0,
                                hidden=[200, 200],
                                featuresCols=[idf.getOutputCol()],
                                predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                          maxRuntimeSecs=60,  # 1 minute
                          maxModels=3,
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8,0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words('english'))) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') # grid=(ParamGridBuilder() # .baseOn([evaluator.metricName,'precision']) # .addGrid(dt.maxDepth, [10,20]) # .build()) #cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')

pipeline = Pipeline(stages=[tokenizer, stopWordRemover, countVector, idf])
model = pipeline.fit(trainDF)
ptrainDF = model.transform(trainDF)
ptestDF = model.transform(testDF)
ptrainDF.show()

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")

# %%
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(ptrainDF)
predictionsLR = lrModel.transform(ptestDF)
evaluator.evaluate(predictionsLR)

# %%
lda = LDA(featuresCol=idf.getOutputCol(), maxIter=10, k=2)
ldaModel = lda.fit(ptrainDF)

# %%
naiveBayes = NaiveBayes(featuresCol=idf.getOutputCol(), labelCol='class')
naiveModel = naiveBayes.fit(ptrainDF)
predictionsNaive = naiveModel.transform(ptestDF)
evaluator.evaluate(predictionsNaive)

# %%
filterer = Filterer(key='subreddit', val='body',
                    inputCol='subreddit', outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens", pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="tf",
                     minDF=args.mindf, vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(), outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit', outputCol='norm', spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)

pipeline = Pipeline(stages=[
    extractor, cleaner, filterer, tokenizer, remover, cv, idf, topkwords,
    cos_similarity, topksubreddits
])

# fit the model, extract the computed vocabulary
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens") countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='features') idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf') pipeline = Pipeline(stages=[tokenizer, countVector, idf]) pipelineModel = pipeline.fit(trainDF) # %% pTrainDF = pipelineModel.transform(trainDF) pTestDF = pipelineModel.transform(testDF) # %% evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1") lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class') lrModel = lr.fit(pTrainDF) predictionsLR = lrModel.transform(pTestDF) evaluator.evaluate(predictionsLR) # %% inputComment = "Game of thrones is an awesome book. \ George RR Martin has done a fantastic job at it.\n \ Arya stark killed night king. \ At the end of the third book we find out that Jon Snow is actually the son of rhaegar targaryen and lyanna stark. \ Jon snow is not at all a bastard. \ Bran stark becomes the king. \ Arya stark. \ Jon Snow was always the heir to the throne. \ Arya Stark survives the whole battle of winterfell and goes on to kill the night king. \
df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True)
df = df.select(df['ItemID'], df['SentimentText'], df['label'])
training = df.selectExpr("cast(itemID as int) id",
                         "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0)
# lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)

pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)

"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)