Example #1
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer

def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
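# A saved PipelineModel can be restored later without refitting. A minimal sketch,
# assuming the "./kmeans" directory written above and some DataFrame with a
# "title" column (here reusing products_df, which is only defined inside fit_kmeans):
from pyspark.ml import PipelineModel

reloaded_model = PipelineModel.load("./kmeans")
reloaded_predictions = reloaded_model.transform(products_df)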
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300, # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])

## Train the pipeline model
data = load()
model = pipeline.fit(data)

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold = 0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"]) # create one element tuple
    prediction = model.transform(smsTextDF)
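    # Hypothetical ending for this truncated helper: assume the fitted H2OAutoML
    # stage adds a probability column named "spam" for the positive class (the
    # column name is an assumption, not shown in the snippet).
    return prediction.select("spam").first()["spam"] > hamThreshold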
Example #3
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, dl, colPruner])

## Train the pipeline model
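# Training step, by analogy with Example #2 (the source is cut off here; `data`
# is an assumed DataFrame loaded the same way):
model = pipeline.fit(data)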
Example #4
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words",
                                 pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures",
                                   minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# fitting the model
model = pipeline.fit(train)

# performing the prediction
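# Hypothetical continuation: `test` is assumed to be a held-out DataFrame with the
# same schema as `train` (it is not defined in this snippet).
predictions = model.transform(test)
predictions.select("post", "predictedValue").show(5)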
Example #5
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
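
# The cross-validation step itself is missing from the snippet; a sketch based on
# the commented-out code in the later examples (numFolds is an assumption):
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cv_model = cv.fit(dfTrain)
print(evaluator.evaluate(cv_model.transform(dfTest)))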
Example #6
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review",
    outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(),
                        labelCol=string_indexer.getOutputCol(),
                        maxIter=30,
                        regParam=0.01)

pipeline = Pipeline(
    stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
#      .build())
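
# Hypothetical continuation (the snippet stops here): fit on the training split and
# score the held-out split with the evaluator defined above.
model = pipeline.fit(dfTrain)
print(evaluator.evaluate(model.transform(dfTest)))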
Example #7
spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)
textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(), stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])

model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
  if not leng:
    return [vocab[t] for t in termindices]
  return [vocab[t] for t in termindices][:leng]

def topicsTerm_udf(vocab, leng=None):
  return udf(lambda x: topicsTerms(vocab,x, leng))

topweights = udf(lambda x: x[:numOfKeywords])
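
# Hypothetical continuation: apply the helpers above to the fitted LDA topics,
# mirroring Example #8 below (stage index 3 is the CountVectorizer in pipe1).
from pyspark.sql.functions import col

vocab = model.stages[3].vocabulary
topics = model.stages[-1].describeTopics(numOfKeywords)
topics = topics.withColumn('topicWords', topicsTerm_udf(vocab, numOfKeywords)(col('termIndices'))) \
               .withColumn('topWeights', topweights(col('termWeights')))
topics.show(truncate=False)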
Example #8
ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(),
                            outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)

topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x:[vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x],
                         ArrayType(StringType()))
topics = topics.withColumn('topic_words',
                           get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)
schema_sdf = StructType([StructField('paragraph', StringType(), True)])
text = spark.read.options(header='false').schema(schema_sdf).csv('/user/devel/2020210990Renqiang/data/speech.txt',sep='\n')
print "Text is cleaned"

sqlContext = SQLContext(sc)

#**************************************************************
#********* Generic section to modify in the scripts ***********
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])
#**************************************************************

tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])

#**************************************************************
#****************** Generic code section **********************
#*************** to copy after the pipeline *******************
#**************************************************************

model = pipeline.fit(dfTrain)

print "The model is fitted"

#import test set
def cleanLower(doc):
    return doc.replace("<br /><br />"," ").lower()
rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])


model = pipeline.fit(dfTrain)

print "The model is fitted"

#import test set
test, names = lf.loadUknown('./data/test')
text_name = zip(test, names)
## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                         seed=1,
                         l1=0.001,
                         l2=0.0,
                         hidden=[200, 200],
                         featuresCols=[idf.getOutputCol()],
                         predictionCol="label")
elif algo == "automl":
    ## Create H2OAutoML model
    algoStage = H2OAutoML(convertUnknownCategoricalLevelsToNa=True,
                       maxRuntimeSecs=60, # 1 minute
                       maxModels=3,
Example #12
print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxIter=30, regParam=0.01)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
#      .build())

#cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
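
# Hypothetical continuation (the snippet stops here), mirroring Example #6: fit on
# dfTrain and score dfTest with the evaluator defined above.
model = pipeline.fit(dfTrain)
print(evaluator.evaluate(model.transform(dfTest)))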
Example #13
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, stopWordRemover, countVector, idf])
model = pipeline.fit(trainDF)
ptrainDF = model.transform(trainDF)
ptestDF = model.transform(testDF)
ptrainDF.show()

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class", predictionCol="prediction", metricName="f1")

# %%
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(ptrainDF)
predictionsLR = lrModel.transform(ptestDF)
evaluator.evaluate(predictionsLR)

# %%
lda = LDA(featuresCol=idf.getOutputCol(), maxIter=10, k=2)
ldaModel = lda.fit(ptrainDF)

# %%
naiveBayes = NaiveBayes(featuresCol=idf.getOutputCol(), labelCol='class')
naiveModel = naiveBayes.fit(ptrainDF)
predictionsNaive = naiveModel.transform(ptestDF)
evaluator.evaluate(predictionsNaive)

# %%
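# A quick sketch of how to inspect the LDA model fitted above:
ldaModel.describeTopics(5).show(truncate=False)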
Example #14
filterer = Filterer(key='subreddit',
                    val='body',
                    inputCol='subreddit',
                    outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens",
                           pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(),
                     outputCol="tf",
                     minDF=args.mindf,
                     vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(),
                      outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit',
                                  outputCol='norm',
                                  spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)

pipeline = Pipeline(stages=[
    extractor, cleaner, filterer, tokenizer, remover, cv, idf, topkwords,
    cos_similarity, topksubreddits
])

# fit the model, extract the computed vocabulary
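# Hypothetical continuation of the comment above (`df` is an assumed input
# DataFrame, not shown in this snippet; the CountVectorizer is stage 5 of the
# pipeline defined above):
model = pipeline.fit(df)
vocabulary = model.stages[5].vocabulary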
Example #15
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(),
                              outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)

# %%

inputComment = "Game of thrones is an awesome book. \
    George RR Martin has done a fantastic job at it.\n \
    Arya stark killed night king. \
    At the end of the third book we find out that Jon Snow is actually the son of rhaegar targaryen and lyanna stark. \
    Jon snow is not at all a bastard. \
    Bran stark becomes the king. \
    Arya stark. \
    Jon Snow was always the heir to the throne. \
    Arya Stark survives the whole battle of winterfell and goes on to kill the night king. \
Example #16
df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2) 
Example #17
File: TFIDF.py  Project: Inscrutive/spark
tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)

pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(parsed)
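
# COMMAND ----------

# A quick sanity check on the fitted model (a sketch): assign each document in
# `parsed` to a cluster and look at the cluster sizes.
predictions = model.transform(parsed)
predictions.groupBy("prediction").count().show()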