Example #1
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
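# The saved folder can be reloaded later as a fitted pipeline. A minimal sketch (not part of
# the original example), assuming the "./kmeans" path above and a hypothetical DataFrame
# new_products_df with a "title" column:
from pyspark.ml import PipelineModel

loaded_model = PipelineModel.load("./kmeans")              # restores every fitted stage
new_predictions = loaded_model.transform(new_products_df)  # same columns as words_prediction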
def tf_idf_feature(wordsData):
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    for features_label in rescaledData.select("features", "id").take(3):
        print(features_label)
def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
def tfidf(dataframe, in_col1, out_col1, in_col2, out_col2, n):

    global idfModel
    
    hashingTF = HashingTF(inputCol=in_col1, outputCol=out_col1, numFeatures=n)
    featurizedData = hashingTF.transform(dataframe)
    idf = IDF(inputCol=in_col2, outputCol=out_col2)
    idfModel = idf.fit(featurizedData)
    dataframe = idfModel.transform(featurizedData)
    
    return dataframe
Example #5
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
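# A minimal usage sketch (illustrative, not from the original): assumes a SparkSession named
# spark and documents with a "body" string column.
docs = spark.createDataFrame([("spark builds tf idf features",),
                              ("another body of text",)], ["body"])
tfidf_docs = run_tf_idf_spark_ml(docs)   # adds "words", "rawFeatures" and "features" columns
tfidf_docs.select("features").show(truncate=False)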
Example #6
def extract_idf_features(p_df, input_col, output_col):
    """
    Extracts IDF features.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.    
    """    
    idf = IDF(inputCol=input_col, outputCol=output_col)
    idfModel = idf.fit(p_df)
    return idfModel.transform(p_df)
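# The helper assumes the input column already holds term-frequency vectors. A hypothetical
# usage sketch (text_df and the column names are illustrative, not from the original):
from pyspark.ml.feature import Tokenizer, CountVectorizer

tokens_df = Tokenizer(inputCol="text", outputCol="words").transform(text_df)
tf_model = CountVectorizer(inputCol="words", outputCol="tf").fit(tokens_df)
tf_df = tf_model.transform(tokens_df)
tfidf_df = extract_idf_features(tf_df, "tf", "tfidf")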
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
def tf_feature_vectorizer(df,no_of_features,ip_col):
    #from pyspark.sql.functions import udf
    #from pyspark.sql.types import *
    output_raw_col = ip_col+"raw_features"
    output_col = ip_col+"features"
    hashingTF = HashingTF(inputCol=ip_col, outputCol=output_raw_col, numFeatures=no_of_features)
    featurizedData = hashingTF.transform(df)
    idf = IDF(inputCol=output_raw_col, outputCol=output_col)
    idfModel = idf.fit(featurizedData)
    rescaled_data = idfModel.transform(featurizedData)
    rescaled_data.show(5)
    print(rescaled_data.count())
    return rescaled_data
Example #9
File: project.py Project: sam46/Yelper
def makeTFIDF(sc, spark, reviews):
    # count vectorizer and tfidf
    # cv = CountVectorizer(inputCol='words_clean', outputCol='tf')
    # cvModel = cv.fit(reviews)
    # reviews = cvModel.transform(reviews)

    # HashingTF for fewer dimensions:
    hashingtf = HashingTF(inputCol='words_clean', outputCol='tf', numFeatures=1000)
    reviews = hashingtf.transform(reviews)

    # create TF-IDF matrix
    idf = IDF().setInputCol('tf').setOutputCol('tfidf')
    tfidfModel = idf.fit(reviews)
    reviews = tfidfModel.transform(reviews)
Example #10
 def test_idf(self):
     dataset = self.spark.createDataFrame([
         (DenseVector([1.0, 2.0]),),
         (DenseVector([0.0, 1.0]),),
         (DenseVector([3.0, 0.2]),)], ["tf"])
     idf0 = IDF(inputCol="tf")
     self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol])
     idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"})
     self.assertEqual(idf0m.uid, idf0.uid,
                      "Model should inherit the UID from its parent estimator.")
     output = idf0m.transform(dataset)
     self.assertIsNotNone(output.head().idf)
     # Test that parameters transferred to Python Model
     check_params(self, idf0m)
Example #11
File: ml.py Project: ribonj/lsir
def tf_idf(df, column):
    """
    Compute TF-IDF of a corpus.
    Transformation: array<string> --> vector
    """ 
    df = preprocess(df, column) # text to list of terms
    (df, voc) = count(df, column)
    
    # creates a TF-IDF model and uses it to compute the feature vector.
    idf = IDF(inputCol=column, outputCol='_'+column)
    model = idf.fit(df)
    df = model.transform(df)
    
    df = replace(df, column, '_'+column)
    return (df, voc)
Example #12
 def append_tf_idf(self, df):
     """
     Calculate term frequency and inverse document frequency,
      based on at least one visit per hour in this case. Compares how often each token appeared
      at least once per hour relative to other tokens. Not used for the main purpose of the project.
     Args:
         :param df: Dataframe parameter.
     Returns:
         :return:  Dataframe with term frequency and inverse document frequency added in the columns
                     'rawFeatures' and 'features' respectively.
     """
     #Create TF column.
     hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=100000)
     tf = hashingTF.transform(df)
     tf.persist(StorageLevel.MEMORY_AND_DISK)
     #Create IDF column.
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idfModel = idf.fit(tf)
     tfidf = idfModel.transform(tf)
     return tfidf
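# A minimal usage sketch (illustrative): the method expects a DataFrame with a "tokens" array
# column; "spark" and "analyzer" (an instance of the owning class) are assumed names.
tokens_df = spark.createDataFrame([(["visit", "home", "page"],),
                                   (["visit", "cart"],)], ["tokens"])
tfidf_df = analyzer.append_tf_idf(tokens_df)
tfidf_df.select("rawFeatures", "features").show(truncate=False)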
Example #13
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example #14
    return doc.replace("<br /><br />"," ").lower()
rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
#      .build())
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

gbm = H2OGBM(splitRatio=0.8,
             seed=1,
             featuresCols=[idf.getOutputCol()],
             labelCol="label")

dl = H2ODeepLearning(epochs=10,
                     seed=1,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     labelCol="label")
Example #16
from test_df
"""

test_df = spark.sql(query)
test_df = test_df.withColumn('id', F.col('id') - 1)
test_df.show(5)

########################################################################################################
# Build pipeline and run
indexer = StringIndexer(inputCol="category", outputCol="label")
tokenizer = RegexTokenizer(pattern=u'\W+',
                           inputCol="text",
                           outputCol="words",
                           toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Building model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(
    train_df
)  #if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here

# Model prediction accuracy (F1-score)
pl = pred.select("label", "prediction").rdd.cache()
Example #17
# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
print (documentData)
wordsData.show()

"""**a.Performing a task without NLP**"""

# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show()


print("TF-IDF without NLP:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

"""**b.Performing the task with lemmitization**"""

import nltk;
Example #18
from pyspark.ml.feature import Tokenizer
sentences_df = spark.createDataFrame(
    [(1, 'This is an introduction to Spark MLlib'),
     (2, 'Mllib includes libraries for classification and regression'),
     (3, 'It also contains supporting tools for pipelines')],
    ['id', 'sentences'])

sentences_df.show()

sent_token = Tokenizer(inputCol='sentences', outputCol='words')
sent_tokenized_df = sent_token.transform(sentences_df)

sent_tokenized_df.show()

from pyspark.ml.feature import HashingTF, IDF
sentences_df.take(1)
sent_tokenized_df.take(1)

hashingTF = HashingTF(inputCol='words',
                      outputCol='rawFeatures',
                      numFeatures=20)
sent_hfTF_df = hashingTF.transform(sent_tokenized_df)

sent_hfTF_df.take(1)

idf = IDF(inputCol='rawFeatures', outputCol='idf_features')
idfModel = idf.fit(sent_hfTF_df)
tfidf_df = idfModel.transform(sent_hfTF_df)

tfidf_df.take(1)
Example #19
	remover.setStopWords(sw)
	cleanDataTrain = remover.transform(wordsDataTrain)
	cleanDataTest = remover.transform(wordsDataTest)

	# Made onegrams
	onegram = NGram(n=1, inputCol="filtered",outputCol="onegram")
	onegramedDataTrain=onegram.transform(cleanDataTrain)
	onegramedDataTest = onegram.transform(cleanDataTest)

	#  Find hashed Term frequency value of word vector
	hashingTF = HashingTF(inputCol="onegram", outputCol="rawFeatures", numFeatures=100000)
	featurizedDataTrain = hashingTF.transform(onegramedDataTrain)
	featurizedDataTest = hashingTF.transform(onegramedDataTest)

	# Find IDF
	idf = IDF(inputCol="rawFeatures", outputCol="features")
	idfModelTrain = idf.fit(featurizedDataTrain)
	idfModelTest = idf.fit(featurizedDataTest)
	rescaledDataTrain = idfModelTrain.transform(featurizedDataTrain)
	rescaledDataTest = idfModelTest.transform(featurizedDataTest)

	# Final test and train data
	train_data = rescaledDataTrain.select('features', 'label')
	test_data = rescaledDataTest.select('features', 'label')

	# Multinomial naivebyes
	nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
	model = nb.fit(train_data)
	result = model.transform(test_data)
	result.show()
def main(sc, sqlContext):
    start = timer()

    stpwrds = stopwords.words('english')
    tbl_translate = dict.fromkeys(i for i in xrange(sys.maxunicode) if unicodedata.category(unichr(i)).startswith('S') or unicodedata.category(unichr(i)).startswith('P') or unicodedata.category(unichr(i)).startswith('N'))

    print '---Fetching products---'
    start_i = timer()
    productRDD = sc.parallelize(findProductsByCategory([]))
    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    corpusRDD = (productRDD.map(lambda s: (s[0], word_tokenize(s[1].translate(tbl_translate).lower()), s[2], s[3]))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3] ))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP'], s[2], s[3]))
                           .cache())
    print '#### took %d seconds' % (timer()-start_i)

    print '---Fetching and persisting category and token data---'
    start_i = timer()
    tokens = corpusRDD.flatMap(lambda x: x[1]).distinct().collect()
    numTokens = len(tokens)
    category = productRDD.map(lambda x: x[2]).distinct().collect()
    categoryAndSubcategory = productRDD.map(lambda x: (x[2], x[3])).distinct().collect()
    insertTokensAndCategories(tokens, category, categoryAndSubcategory)
    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF of the products---'
    start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], category=s[2], subcategory=s[3]))
    # persist this so it does not have to be rebuilt at prediction time
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    # persisting for prediction
    wordsDataForPrediction = corpusRDD.map(lambda s: Row(label=s[0], words=s[1], type=s[2]))
    # persist this so it does not have to be rebuilt at prediction time
    wordsDataForPredictionDF = sqlContext.createDataFrame(wordsDataForPrediction)   

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")

    wordsDataForPredictionDF.write.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet") 

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    #VSM = rescaledData.map(lambda t: LabeledPoint(categoryAndSubcategory.index((t.category, t.subcategory)), t.features))
    VSM = rescaledData.map(lambda t: LabeledPoint(category.index(t.category), t.features))

    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)


    print '--Building the Naive Bayes model---'
    start_i = timer()
    model = NaiveBayes.train(VSMTrain)

    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria")

    model.save(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    print '#### took %d seconds' % (timer()-start_i)

    print '---Testing the Naive Bayes model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p : (categoryAndSubcategory[int(model.predict(p.features))], categoryAndSubcategory[int(p.label)]))
    acuraccy = float(prediction.filter(lambda (x, v): x[0]==v[0]).count())/float(prediction.count())
    print 'accuracy of %f' % acuraccy
    print '#### took %d seconds' % (timer()-start_i)

    print '---Fetching the posts---'

    start_i = timer()
    posts = list()
    wb = load_workbook(filename = '/home/ubuntu/recsys-tcc-ml/base_sentimentos.xlsx')
    sheet = wb['Menes']
    for row in sheet.iter_rows(row_offset=1):
        post = list()
        for cell in row:
            if cell.value is None:
                break
            post.append(1 if cell.value == 'Positive' or cell.value == 'Neutral' else 0 if cell.value == 'Negative' else removeAccents(cell.value))

        if len(post) > 0:            
            posts.append(tuple(post))

    print '#### took %d seconds' % (timer()-start_i)

    print '---Building the corpus---'
    start_i = timer()
    postsRDD = sc.parallelize(posts)
    postCorpusRDD = (postsRDD.map(lambda s: (s[1], word_tokenize(s[0].translate(tbl_translate).lower())))
                           .map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds]))
                           .map(lambda s: (s[0], [x[0] for x in pos_tag(s[1]) if x[1] == 'NN' or x[1] == 'NNP']))
                           .cache())

    print '#### took %d seconds' % (timer()-start_i)

    print '---Computing TF-IDF of the posts---'
    start_i = timer()
    wordsData = postCorpusRDD.map(lambda s: Row(label=s[0], words=s[1]))
    wordsDataDF = sqlContext.createDataFrame(wordsData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)   

    VSM = rescaledData.map(lambda t: LabeledPoint(t.label, t.features))
    VSMTrain, VSMTest = VSM.randomSplit([8, 2], seed=0L)
    print '#### took %d seconds' % (timer()-start_i)

    print '--Building the SVM model---'
    start_i = timer()
    model = SVMWithSGD.train(VSMTrain, iterations=100)
    
    if os.path.exists("/home/ubuntu/recsys-tcc-ml/models/svm"):
        shutil.rmtree("/home/ubuntu/recsys-tcc-ml/models/svm")

    model.save(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")

    print '---Testing the SVM model---'
    start_i = timer()
    prediction = VSMTest.map(lambda p: (p.label, model.predict(p.features)))
    acuraccy = prediction.filter(lambda (v, p): v != p).count() / float(prediction.count())
    
    print 'accuracy of %f' % acuraccy

    print '#### took %d seconds' % (timer()-start_i)

    print 'The whole process took %d seconds' % (timer()-start)
Example #21
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)
# Evaluate the performance
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions) # 0.9725282146509521

# 2.Logistic Regression using TF-IDF Features
from pyspark.ml.feature import HashingTF, IDF
# Add HashingTF and IDF to transformation
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
# Redo Pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
# Build the model
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# Train model with Training Data
lrModel = lr.fit(trainingData)
# Make predictions on Testing Data
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("Descript","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
Example #22
#%%
names_df = names_df.dropna(subset='name')
names_df.show()

#%%
tokenizer = Tokenizer(inputCol="name", outputCol="words")
wordsData = tokenizer.transform(names_df)
wordsData.show()

#%%
stopwords = []
stopwords.extend(StopWordsRemover.loadDefaultStopWords('english'))
remover = StopWordsRemover(inputCol="words",
                           outputCol="cleanedWords",
                           stopWords=stopwords)
cleanedWordsData = remover.transform(wordsData)
cleanedWordsData.show()

#%%
hashingTF = HashingTF(numFeatures=4096,
                      inputCol="cleanedWords",
                      outputCol="tfFeatures")
tfWordsData = hashingTF.transform(cleanedWordsData)
tfWordsData.show()

#%%
idf = IDF(inputCol="tfFeatures", outputCol="tfIdfFeatures")
idfModel = idf.fit(tfWordsData)
results = idfModel.transform(tfWordsData)
results.show()
Example #23
pipeline_model = pipeline.fit(train_data)

test_predicted = pipeline_model.transform(test_data)

# using a pipeline with tf-idf

train_data_2 = load_dataframe_text_unsplitted("20ng-train-all-terms.txt")
test_data_2 = load_dataframe_text_unsplitted("20ng-test-all-terms.txt")

vectorizer = CountVectorizer(inputCol="words", outputCol="bag_of_words")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")

idf = IDF(inputCol="rawFeatures", outputCol="bag_of_words")

#pipeline_tf_idf = Pipeline(stages=[label_indexer, HashingTF, IDF,  classifier])
pipeline_tf_idf = Pipeline(
    stages=[label_indexer, tokenizer, hashingTF, idf, classifier])
pipeline_model_tf_idf = pipeline_tf_idf.fit(train_data_2)
test_predicted_tf_idf = pipeline_model_tf_idf.transform(test_data_2)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="label_index",
    predictionCol="label_index_predicted",
    metricName="accuracy")
accuracy = evaluator.evaluate(test_predicted)
print("Accuracy = {:.2f}".format(accuracy))
Example #24
    ['product_uid', 'id', 'search_term_clean', 'relevance', 'text_clean'])

# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="text_clean", outputCol="text_token")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Text:"
print fulldata.head()
print "################"
# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="text_token", outputCol="tf", numFeatures=10000)
fulldata = hashingTF.transform(fulldata)
print "TERM frequencies:"
print fulldata.head()
print "################"
# Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print "IDF :"
print fulldata.head()
print "################"

#OK we do the same for the search term
# Step 1: split text field into words
tokenizer = Tokenizer(inputCol="search_term_clean", outputCol="search_token")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Search:"
print fulldata.head()
print "################"
# Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="search_token",
Example #25
#Computing sentiment column based on rating
sentiment = when(col("rating") <= 5, 0).otherwise(1)

df = df.withColumn("sentiment", sentiment)
df = df.withColumn('length', length(df['review']))

# %% [markdown]
# ## Feature Transformation

# %%
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol="review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")
pos_neg = StringIndexer(inputCol='sentiment', outputCol='label')

# %%
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# %%
clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                           outputCol='features')

# %%
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
Example #26
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

#Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

#Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \
        outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, \
                idf, nbClassifier])

nbModel = pipeline.fit(trainingData)

prediction = nbModel.transform(testData)
prediction.groupBy("label", "prediction").count().show()
#print(data.head(5))
    
##creating rdd file
sc = SparkContext("local", "app")
sqc = SQLContext(sc)
df = sqc.createDataFrame(data, ['type', 'text'])

#NEW VARIABLE GENERATION
dataCleaned = df.map(lambda x: (1 if x['type'] == 'spam' else 0, tokenize(x['text'])))
dataClean = dataCleaned.map(lambda x: (float(x[0]), x[1]))
dfClean = sqc.createDataFrame(dataClean, ['label', 'words'])
dfClean.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawtf-idf", numFeatures=1000)
tf = hashingTF.transform(dfClean)
idf = IDF(inputCol="rawtf-idf", outputCol="features").fit(tf)
dfFinal = idf.transform(tf)

# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(dfFinal)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dfFinal)

# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = dfFinal.randomSplit([0.8, 0.2])


# Train the model.
#rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
nb = NaiveBayes(smoothing = 1.0, labelCol="indexedLabel", featuresCol="indexedFeatures")
Example #28
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryCoola(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
Example #29
#configuring spark
conf = SparkConf()
conf.setAppName("part2_uni")
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

#reading input
lines = sc.wholeTextFiles("/cosc6339_s17/books-longlist/")
#configuring SparkSession
spark = SparkSession(sc)
hasattr(lines, "toDF")

#tokenizing the words and converting into dataframes
tokenize = lines.map(part2).toDF(["bookname", "words"])

#creating n-grams (n=2 yields bigrams, despite the "unigram" variable names)
unigram = NGram(n=2, inputCol="words", outputCol="unigrams")
unigramdataframe = unigram.transform(tokenize)

#finding the tf value
hashingTF = HashingTF(inputCol="unigrams", outputCol="unigram-tf")
tf = hashingTF.transform(unigramdataframe)

#finding the idf value
idf = IDF(inputCol="unigram-tf", outputCol="unigram-tf-idf")
idfModel = idf.fit(tf)
tfidfignore = idfModel.transform(tf)

#saving the output
tfidfignore.rdd.saveAsTextFile("/bigd12/output2_2")
Example #30
                                       '45G').set('spark.driver.maxResultSize',
                                                  '10G')
sc = SparkContext(conf=spark)
sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(indexNewsList, ["label", "text"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")

remover = StopWordsRemover(inputCol="words", outputCol="filtered")

hashingTF = HashingTF(inputCol="filtered",
                      outputCol="rawFeatures",
                      numFeatures=10000)

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            numTrees=100,
                            maxDepth=15,
                            maxBins=32)

pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
pipeline1 = Pipeline(stages=[tokenizer, remover, hashingTF, idf, rf])
pipeline_nb = pipeline1.fit(df)
pipelineFit = pipeline.fit(df)
pipeline_nb.write().overwrite().save("model_rf1")
dataset = pipelineFit.transform(df)
Example #31
# stop words
add_stopwords = [
    'the', 'of', 'in', 'a', 'an', 'at', 'as', 'on', 'for', 'it', 'we', 'you',
    'want', 'up', 'to', 'if', 'are', 'is', 'and', 'our', 'with', 'from', '-',
    'your', 'so'
]
stopwords_remover = StopWordsRemover(
    inputCol='desc_words',
    outputCol='desc_words_filtered').setStopWords(add_stopwords)
df = stopwords_remover.transform(df)

# compute the TF-IDF of each document
hashingTF = HashingTF(inputCol='desc_words_filtered',
                      outputCol="desc_words_tf")
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol="desc_words_tfidf").fit(tf)
tfidf = idf.transform(tf).cache()
print('\n TF-IDF for each hotel')
tfidf.select('desc_words_tfidf').show(truncate=False)

# normalize the data
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
tfidf.select("id", "norm").show()

import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
#tfidf = tfidf.alias("a1").join(tfidf.alias("a2"), psf.col("a1.id") < psf.col("a2.id")).withColumn('similarity', dot_udf("a1.norm", "a2.norm"))
#tfidf.show()
Example #32
def main(input_file, bus_parquet, input_model):
    data = spark.read.parquet(input_file)
    df_business = spark.read.parquet(bus_parquet)
    
    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed('_1', 'business_id').withColumnRenamed('_2', 'text')
    
    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol='text', outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)
    
    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token', outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol = 'nonstopwrd', outputCol='raw_features', minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="features")

    pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, idf])
    #tfidf_model = pipeline.fit(review_df)
    #tfidf_model.write().overwrite().save('tfidf_model')

    tfidf_model = PipelineModel.load('tfidf_model')
    result_tfidf = tfidf_model.transform(review_df) 
    yelp = result_tfidf

    lda = LDA(k=15, maxIter=100)
    # already saved model
    #model = lda.fit(yelp)
    # save model
    #model.write().overwrite().save(input_model)

    model = LocalLDAModel.load(input_model)
    # lda output column topicDistribution
    lda_df = model.transform(yelp)

    # test result
    x = sc.parallelize([('aaa', 'chicken cheese burger')]).toDF(['business_id', 'text'])
    x_tfidf = tfidf_model.transform(x)
    lda_x = model.transform(x_tfidf)
    
    input_vec = lda_x.select('topicDistribution').collect()[0][0]
    lda_vec = lda_df.select('business_id', 'topicDistribution').rdd.map(lambda x: (x[0], x[1])).collect()

    # compute similarity
    t = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in lda_vec)

    # recommendation's cosine values
    similarity = spark.createDataFrame(t).withColumnRenamed('_1', 'business_id').withColumnRenamed('_2', 'similarity_score')
    df_result = df_business.join(similarity, 'business_id', 'right').select(similarity['business_id'] ,'similarity_score', 'categories').orderBy('similarity_score', ascending = False)
    
    result = getKeyWordsRecoms('chicken cheese burger', 20, tfidf_model, model, lda_vec)
    result.show()
from pyspark.ml.feature import CountVectorizer

# we will remove words that appear in fewer than 5 docs
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17).setInputCol("filtered").setOutputCol("tf")


# In[24]:

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(airportCleanDF)


# In[25]:

from pyspark.ml.feature import IDF
idf = IDF().setInputCol('tf').setOutputCol('tfidf')


# In[26]:

idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(airportCleanDF)


# In[27]:

tfidf_df = idf_pipeline.transform(airportCleanDF)


# In[28]:

tfidf_df.printSchema()
Example #34
print('data frame where common words are removed'.upper())
remover.transform(sentence_DF).show()

# n-grams: sequences of n consecutive tokens
from pyspark.ml.feature import NGram
ngram = NGram(n=2, inputCol='words', outputCol='grams')
ngram.transform(tok_final).show() 
tok_final_n = ngram.transform(tok_final)
tok_final_n.select('grams').show(truncate=False)
# The n-grams help explore relationships between close words

from pyspark.ml.feature import HashingTF, IDF

hashing_tf = HashingTF(inputCol='words',outputCol='rawFeatures')
feature_data = hashing_tf.transform(tok_final)
idf = IDF(inputCol='rawFeatures',outputCol='Features')
idf_model = idf.fit(feature_data)
rescaled_data = idf_model.transform(feature_data)

# See how words were transformed into numbers; this is ready for a supervised machine learning algorithm
rescaled_data.select('id','Features').show(truncate=False)

############

from pyspark.ml.feature import CountVectorizer
df = spark.createDataFrame([(0, ['hello', 'are', 'you', 'man']),(1,['hello', 'hello', 'man', 'I', 'am', 'great', 'I', 'am', 'fantastic', 'you', '?', 'you', 'okay', '?'])],['id', 'tokens'])
cv = CountVectorizer(inputCol='tokens', outputCol='countVec', vocabSize=10, minDF=2.0) 
# minDF: minimum number of documents in which a word must appear for it to be kept as a feature
cv.fit(df).transform(df).show(truncate=False)

print("Note that 'hello' and 'you' were repeated twice in the last document")
Example #35
featurizeData = hashingTF.transform(wordsData)
featurizeData.select("words", "rawFeatures").show(truncate=False)
#%%
""" CountVectorizer词频统计(可以将词频对应上单词)"""
countVector = CountVectorizer(inputCol="words",
                              outputCol="rawFeatures",
                              minDF=2)
cvModel = countVector.fit(wordsData)
cv_df = cvModel.transform(wordsData)
cv_df.show(4, False)
#%%
# voc=cvModel.vocabulary
# getKeywordFunc=udf()
# %%
""" IDF模型训练 """
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(cv_df)
rescaledData = idfModel.transform(cv_df)
# %%
list = rescaledData.collect()
# with open("./collect_file.txt","w+") as f:
#     f.write(str(list))
# %%
Features = rescaledData.select("features").toPandas()
Words = rescaledData.select("words").toPandas()
#%%
features_dict = Features.to_dict()
# with open("./features_dict.txt","w") as f:
#     f.write(str(features_dict["features"]))
# %%
features_numpy = np.array(Features)
Example #36
    data = spark.read.load(sys.argv[1])

    df = data.filter((col('date') >= '1895') & (col('seq') =='1')) \
            .select(year('date').alias('year'), 'id', 'text')

    # https://danvatterott.com/blog/2018/07/08/aggregating-sparse-and-dense-vectors-in-pyspark/
    def dense_to_array(v):
        new_array = list([float(x) for x in v])
        return new_array

    dense_to_array_udf = udf(dense_to_array, ArrayType(FloatType()))

    indexer = StringIndexer(inputCol="id", outputCol="label")
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    vectorizer = CountVectorizer(inputCol="tokens", outputCol="rawFeatures")
    idf = IDF(inputCol="rawFeatures", outputCol="vector", minDocFreq=1)

    pipeline = Pipeline(stages=[indexer, tokenizer, vectorizer, idf])
    model = pipeline.fit(df)

    results = model.transform(df) \
        .select(year('date').alias('year'), 'label', 'vector') \
        .withColumn('vector', dense_to_array_udf('vector'))

    results = model.transform(df).select('year', 'label', 'vector')

    results.write \
        .partitionBy('year') \
        .format('csv') \
        .options(compression='gzip', sep='\t', header='true') \
        .save(sys.argv[2])
Example #37
df_seg = df.withColumn('seg', seg_udf(df.content)).select('seg')
df_seg.show()
# turn the segmented text into word arrays
tokenizer = Tokenizer(inputCol='seg', outputCol='words')
df_seg_arr = tokenizer.transform(df_seg).select('words')
df_seg_arr.show()

# process the text features after tokenization
tf = HashingTF(numFeatures=1 << 18,
               binary=False,
               inputCol='words',
               outputCol='rawfeatures')
df_tf = tf.transform(df_seg_arr).select('rawfeatures')
df_tf.show()

idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(df_tf)
df_tfidf = idfModel.transform(df_tf)
df_tfidf.show()

# split into training and prediction sets
splits = df_tfidf.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# define the model
kmeans = KMeans(featuresCol="features",
                predictionCol="prediction",
                k=6,
                initMode="k-means||",
                initSteps=5,
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

sentenceData = spark.createDataFrame([
                                      (0.0, "a b b a a a b"),
                                      (0.0, "a b a b b a a"),
                                      (1.0, "b a b bb aa b")
                                     ], ["label", "sentence"])

# TF
tokenizer      = Tokenizer(inputCol="sentence", outputCol="words")
wordsData      = tokenizer.transform(sentenceData)
hashingTF      = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

# IDF
idf            = IDF(inputCol="rawFeatures", outputCol="features")
idfModel       = idf.fit(featurizedData)
rescaledData   = idfModel.transform(featurizedData)

rescaledData.show(20, False)
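# As the comment above notes, CountVectorizer can replace HashingTF for the term-frequency
# step; a minimal sketch against the same wordsData (a sketch, not part of the original example):
from pyspark.ml.feature import CountVectorizer

cv            = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=20)
cvModel       = cv.fit(wordsData)                 # learns an explicit vocabulary
countFeatures = cvModel.transform(wordsData)      # exact counts instead of hashed buckets
idfFromCounts = IDF(inputCol="rawFeatures", outputCol="features").fit(countFeatures)
idfFromCounts.transform(countFeatures).show(20, False)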

spark.stop()
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)


if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                 seed=1,
                 featuresCols=[idf.getOutputCol()],
                 predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                         seed=1,
                         l1=0.001,
                         l2=0.0,
Example #41
wrangled.show(4, truncate=False)

--------------------------------------------------
# Exercise_11 
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)
      
tf_idf.select('terms', 'features').show(4, truncate=False)

--------------------------------------------------
# Exercise_12 
# Split the data into training and testing sets
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)
tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)


# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
  .setInputCol("DescOut")\
  .setOutputCol("TFOut")\
  .setNumFeatures(10000)
idf = IDF()\
  .setInputCol("TFOut")\
  .setOutputCol("IDFOut")\
  .setMinDocFreq(2)


# COMMAND ----------

idf.fit(tf.transform(tfIdfIn)).transform(tf.transform(tfIdfIn)).show(10, False)


# COMMAND ----------

from pyspark.ml.feature import Word2Vec
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
Example #43
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([
    (0, "a a a b b c"),
    (0, "a b c"),
    (1, "a c a a d")]).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# split each sentence into words
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words", outputCol="TF-Features", numFeatures=20)
df3 = hashingTF.transform(df2)

df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)

rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
Example #44
File: TFIDF.py Project: Inscrutive/spark
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
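# A minimal sketch of the pipeline assembly and fit that the notebook text above describes
# (not part of the original cell); "corpusDF" is an illustrative name for the input DataFrame
# with a "text" column:
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, normalizer, kmeans])
pipelineModel = pipeline.fit(corpusDF)          # returns a fitted PipelineModel
clustered = pipelineModel.transform(corpusDF)   # adds the "prediction" cluster column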
Example #45
    stopwords = StopWordsRemover(inputCol="tokens", outputCol="tokens_filtered")
    newsgroups = stopwords.transform(newsgroups)
    newsgroups = newsgroups.drop('tokens')
    
    count_vec = CountVectorizer(inputCol="tokens_filtered", outputCol="tf_features", vocabSize=num_features, minDF=2.0)
    count_vec_model = count_vec.fit(newsgroups)
    vocab = count_vec_model.vocabulary
    newsgroups = count_vec_model.transform(newsgroups)
    newsgroups = newsgroups.drop('tokens_filtered')
    
    #hashingTF = HashingTF(inputCol="tokens_filtered", outputCol="tf_features", numFeatures=num_features)
    #newsgroups = hashingTF.transform(newsgroups)
    #newsgroups = newsgroups.drop('tokens_filtered')

    idf = IDF(inputCol="tf_features", outputCol="features")
    newsgroups = idf.fit(newsgroups).transform(newsgroups)
    newsgroups = newsgroups.drop('tf_features')
    
    lda = LDA(k=num_topics, featuresCol="features", seed=0)
    model = lda.fit(newsgroups)
 
    topics = model.describeTopics()
    topics.show()
    
    model.topicsMatrix()
    
    topics_rdd = topics.rdd

    topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
  yield
  end = time()
  print(f'Elapsed time: {end - start: .4f}s')


algorithm = algorithms[args.algorithm]
spark = SparkSession.builder.appName(args.app_name).getOrCreate()

with timer():
  print('[INFO] Reading time')
  rdd = spark.sparkContext.textFile(os.path.join('dataset', 'train.ft.txt'))
  rdd.cache()

tokenizer = Tokenizer(inputCol = 'rawContent', outputCol = 'words')
hashing_tf = HashingTF(numFeatures = args.num_features, inputCol = 'words', outputCol = 'rawFeatures')
idf = IDF(inputCol = 'rawFeatures', outputCol = 'features')
label_indexer = StringIndexer(inputCol = 'rawLabel', outputCol = 'label')

with timer():
  print('[INFO] Preprocessing time')
  df = rdd.map(lambda x: (x[:10], x[11: ])).toDF(['rawLabel', 'rawContent'])
  df = tokenizer.transform(df)
  tf_df = hashing_tf.transform(df)
  # tf_df.cache()
  idf_model = idf.fit(tf_df)
  encoded_df = idf_model.transform(tf_df)
  training_df = encoded_df.select('rawLabel', 'features')
  labelModel = label_indexer.fit(training_df)
  training_df = labelModel.transform(training_df)

if algorithm in ['LogisticRegression', 'LinearSVC']:
just_text = news_types.map(lambda line: [line[0], line[4]])

fields = [StructField('id', StringType(), True),StructField('text', StringType(), True)]
schema = StructType(fields)

#We assume that there is an rdd called just_text with lists containing the tweet id and text in each row
#just_text = [[id1, text1][id2, text2]....]
data_df = sqlContext.createDataFrame(just_text, schema)

tokenizer = Tokenizer(inputCol = "text", outputCol ="words")
tokenizedData = tokenizer.transform(data_df)
hashingTF = HashingTF(inputCol = "words", outputCol = "tf", numFeatures = 2**16)
tfData = hashingTF.transform(tokenizedData)

idf = IDF(inputCol = "tf", outputCol = "features")
idfModel = idf.fit(tfData)

finalData = idfModel.transform(tfData)

model = LogisticRegressionModel.load('/user/maria_dev/user/maria_dev/sentimentModel')

predictions = model.transform(finalData)

predictions2 = predictions.select(predictions.id, predictions.text, predictions.prediction)

#To create a regular rdd
predictions_rdd = predictions2.rdd.map(list)

predictions_without_text = predictions_rdd.map(lambda line: [line[0], line[2]])
Example #48
#spark = pyspark.sql.SparkSession(sc)

df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("trainReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(df)
wordsData.show(5)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
tf = hashingTF.transform(wordsData)
tf.show(10)

tf.head().rawFeatures

idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

ml = LogisticRegression(featuresCol="features",
                        labelCol='category',
                        regParam=0.01)
mlModel = ml.fit(tfidf.limit(5000))
res_train = mlModel.transform(tfidf)
extract_prob = F.udf(lambda x: float(x[1]), T.FloatType())
res_train.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()

test_df = spark.read.format("csv").option("inferschema", "true").option(
    "header", "true").option("delimiter", "\t").load("testReviews.tsv")

tokenizer = Tokenizer(inputCol="text", outputCol="words")
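# The snippet is cut off here. A plausible continuation (a sketch, not the original author's
# code) pushes the test reviews through the same fitted transformers and model; it assumes
# testReviews.tsv has the same "id" and "text" columns as the training file.
test_words = tokenizer.transform(test_df)
test_tf = hashingTF.transform(test_words)
test_tfidf = idf.transform(test_tf)
res_test = mlModel.transform(test_tfidf)
res_test.withColumn("proba", extract_prob("probability")).select(
    "id", "proba", "prediction").show()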
Example #49
0
# Build the classification model
# Modeling parameters
numFeatures = 5000
minDocFreq = 50
numTrees = 1000

# Build the machine learning pipeline
inx1 = StringIndexer(inputCol="hour", outputCol="hour-inx")
inx2 = StringIndexer(inputCol="month", outputCol="month-inx")
inx3 = StringIndexer(inputCol="dayofweek", outputCol="dow-inx")
inx4 = StringIndexer(inputCol="sentiment", outputCol="label")
hashingTF = HashingTF(numFeatures=numFeatures,
                      inputCol="words",
                      outputCol="hash-tf")
idf = IDF(minDocFreq=minDocFreq, inputCol="hash-tf", outputCol="hash-tfidf")
va = VectorAssembler(inputCols=[
    "hour-inx", "month-inx", "dow-inx", "hash-tfidf", "pscore", "nscore"
],
                     outputCol="features")
rf = RandomForestClassifier(numTrees=numTrees,
                            maxDepth=4,
                            maxBins=32,
                            labelCol="label",
                            seed=42)
p = Pipeline(stages=[inx1, inx2, inx3, inx4, hashingTF, idf, va, rf])

# Split the data into training and test sets
(trainSet, testSet) = hc.table("fm").randomSplit([0.7, 0.3])
trainData = trainSet.cache()
testData = testSet.cache()
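# Sketch (not part of the original snippet): fit the pipeline on the training split and
# score the held-out split; accuracy is used here purely for illustration.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

model = p.fit(trainData)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(predictions))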
Example #50
0
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
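# Sketch of the cross-validation step announced above (the original snippet stops short of it);
# the grid over maxDepth is an illustrative assumption.
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

grid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10]).build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cv_model = cv.fit(dfTrain)
print(evaluator.evaluate(cv_model.transform(dfTest)))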
Example #51
0
File: nlp.py Project: Rachneet/PySpark
# remover.transform(sentence_df).show()
#
# # n-gram
# from pyspark.ml.feature import NGram
#
# ngram = NGram(n=2, inputCol='tokens', outputCol='grams')
# ngram.transform(sentence_df).show()
# ngram.transform(sentence_df).select('grams').show(truncate=False)

from pyspark.ml.feature import HashingTF, IDF, CountVectorizer

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(rg_tokenized)
# featurized_data.show()

idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data).transform(featurized_data)

idf_model.select('id', 'features').show()

# count vectorizer
df = spark.createDataFrame([
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" ")),
], ['id', 'words'])
cv = CountVectorizer(inputCol='words',
                     outputCol='features',
                     vocabSize=3,
                     minDF=2.0)
model = cv.fit(df).transform(df)
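# Sketch (not in the original): keeping the fitted CountVectorizerModel separately also
# exposes the learned vocabulary, which HashingTF cannot provide.
cv_model = cv.fit(df)
print(cv_model.vocabulary)  # e.g. ['a', 'b', 'c'] (ordered by term frequency; ties may vary)
cv_model.transform(df).show(truncate=False)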
Example #52
0
|4  |Win a cash prize or a prize worth|1    |
+---+---------------------------------+-----+
only showing top 4 rows
"""

from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])
print(pipeline)


"""
Cross validating simple flight duration model
You've already built a few models for predicting flight duration and evaluated 
them with a simple train/test split. However, 
cross-validation provides a much better way to evaluate model performance.

In this exercise you're going to train a simple model for flight duration using cross-validation. 
Travel time is usually strongly correlated with distance,
"""
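# A minimal sketch of the cross-validated flight-duration model described above; the
# flights_train/flights_test DataFrames, their assembled 'features' column and the
# 'duration' label column are assumptions, not part of the original exercise code.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')
params = ParamGridBuilder().build()  # empty grid: cross-validate the default model only
cv = CrossValidator(estimator=regression, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=5)
cv_model = cv.fit(flights_train)
print('Test RMSE:', evaluator.evaluate(cv_model.transform(flights_test)))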
Example #53
0
def text_split(text):
    text = text.split()
    return text


# In[5]:

clean_udf = udf(text_split, ArrayType(StringType()))
df = df.withColumn("body", clean_udf("body"))

# In[6]:

#following section transforms the text using TFIDF
start = time.clock()
hashingTF = HashingTF(inputCol="body", outputCol="term_freq")
df = hashingTF.transform(df)
idf = IDF(inputCol="term_freq", outputCol="tfidf")
idfModel = idf.fit(df)
df = idfModel.transform(df)
print("pyspark TFIDF processing time: {0:.5f} s".format(time.clock() - start))

# ## 4. Building a Naive Bayes Classifier
#
# The first step is to convert the topics (nominal) to a list of discrete integers

# In[7]:

#Using a StringIndexer to convert the topics into discrete integer indices
stringIndexer = StringIndexer(inputCol="topic", outputCol="topicIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
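# Sketch of the classifier step that the section heading announces (not the original code):
# train a multinomial Naive Bayes model on the TF-IDF features and the indexed topics.
from pyspark.ml.classification import NaiveBayes

train_data, test_data = indexed.randomSplit([0.8, 0.2], seed=42)
nb = NaiveBayes(featuresCol="tfidf", labelCol="topicIndex", modelType="multinomial")
nb_model = nb.fit(train_data)
nb_model.transform(test_data).select("topicIndex", "prediction").show(5)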
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)
    
    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
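# The cossine() helper used above is not shown in this fragment; a minimal sketch of a cosine
# similarity between two Spark vectors (an assumption about the original helper) could be:
import numpy as np

def cossine(v1, v2):
    a, b = v1.toArray(), v2.toArray()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(a.dot(b) / denom) if denom else 0.0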
normalizerBi = Normalizer(inputCol="bigrams",outputCol='normBigrams',p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words','normWords').show()
# The difference does not show up in the displayed table, since there is only room to show
# the indices of the non-zero elements, not their values
# Now move on to TF-IDF
# Obviously, by picking the appropriate DataFrame among those above, these computations can be
# applied to any column (bigrams, with or without stop words, ...); see the bigram sketch below
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words',outputCol='wordsTF',numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(),outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review','wordsTF','wordsTFIDF').show()
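# Sketch (not in the original): the same IDF weighting applied to the already-vectorised
# bigram column, assuming 'normBigrams' holds the L2-normalised bigram count vectors from above.
idfBi = IDF(inputCol='normBigrams', outputCol='bigramsTFIDF')
dfTrainBiTFIDF = idfBi.fit(dfNorm2).transform(dfNorm2)
dfTrainBiTFIDF.select('normBigrams', 'bigramsTFIDF').show()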

# I know this step was useful to me once; here it does not seem to matter much
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review','label','target_indexed').show()



#**********************************************************************
#-----------Training the model for prediction--------------------------