Example #1
def main(args):
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    #print "READ second {} check ".format(textFiles.take(10))
    '''
        Keep only the rows whose index appears in the training file; drop the rest.
        http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    '''

    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()


    traindf = getCleanedRDD(maindir + 'train_v2.csv', ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir+"output/train_4.parquet", format="parquet")



    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)
    gbt = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rf = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    #https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    #http://spark.apache.org/docs/latest/api/python/pyspark.ml.html

    #w2v = Word2Vec(inputCol="text", outputCol="w2v")

    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])



    # Fit the pipeline to training documents.
    model = pipeline.fit(traindf)

    print '-----------------------------------------------------------------------------'
    testdf = getCleanedRDD(maindir + 'test.csv', ["id", "images", "links", "text", "label"], htmldf)
    #print testdf.count()



    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    #print('prediction', prediction)

    '''
    pand = prediction.toPandas()
    pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')	
    print "Done!!! CSV"

    '''
    #prediction.select('id','probability','prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv')
    # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double,
    # words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double])

    '''
    #write in scala
    selected = prediction.select("id", "probability", "prediction")
    for row in selected.collect():
        print row
    '''
    sc.stop()
Example #2
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

#spark-submit --master local[*] --packages com.databricks:spark-csv_2.10:1.2.0 cluster.py

sc = SparkContext()
sqlContext = SQLContext(sc)
text = sc.textFile('file:/Users/wangmengyuan/Desktop/rr/listings.txt').map(lambda l:l.split('\t'))\
 .map(lambda l: (l[0],l[1]))
df = sqlContext.createDataFrame(text, ["houseid", "description"])
tokenizer = Tokenizer(inputCol="description", outputCol="tokens")
tokenized = tokenizer.transform(df).cache()
remover = StopWordsRemover(inputCol="tokens",
                           outputCol="stopWordsRemovedTokens")
stopWordsRemoved_df = remover.transform(tokenized).cache()
hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                      outputCol="rawFeatures",
                      numFeatures=200)
tfVectors = hashingTF.transform(stopWordsRemoved_df).cache()
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors).cache()
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
l2NormData = normalizer.transform(tfIdfVectors)
kmeans = KMeans().setK(10).setMaxIter(20)
km_model = kmeans.fit(l2NormData)
clustersTable = km_model.transform(l2NormData)

#save to hdfs
df1 = clustersTable[['houseid', 'prediction']]
#df1.select('houseid', 'prediction').write.format('com.databricks.spark.csv').save('cluster.csv')
df1.select('houseid', 'prediction').show(20)

from pyspark.ml.feature import CountVectorizer

count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')

cv_df=count_vec.fit(refined_df).transform(refined_df)

cv_df.select(['user_id',"business_id", "review_id", 'refined_tokens','features']).show(1,True, True)

count_vec.fit(refined_df).vocabulary  # note: this fits the CountVectorizer a second time just to inspect the learned vocabulary


from pyspark.ml.feature import HashingTF,IDF

hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')

hashing_df=hashing_vec.transform(refined_df)

hashing_df.select(['user_id','refined_tokens','tf_features']).show(4,True, True)

tf_idf_vec=IDF(inputCol='tf_features',outputCol='tf_idf_features')

tf_idf_df=tf_idf_vec.fit(hashing_df).transform(hashing_df)

tf_idf_df.select('tf_idf_features').show(1,True, True)
tf_idf_df.show(1, True, True)

def get_dummy(df, indexCol, categoricalCols,
              continuousCols, labelCol, dropLast=False):
    from pyspark.ml import Pipeline
Example #4
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create GBM model
gbm = H2OGBM(ratio=0.8,
             featuresCols=[idf.getOutputCol()],
             predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
Example #5
# MAGIC %md ### Define the Pipeline
# MAGIC The pipeline for the model consists of the following stages:
# MAGIC - A Tokenizer to split the tweets into individual words.
# MAGIC - A StopWordsRemover to remove common words such as "a" or "the" that have little predictive value.
# MAGIC - A HashingTF class to generate numeric vectors from the text values.
# MAGIC - A LogisticRegression algorithm to train a binary classification model.

# COMMAND ----------

# convert sentence to words' list
tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
# remove stop words
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                       outputCol="MeaningfulWords")
# convert word to number as word frequency
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
# set the model
lr = LogisticRegression(labelCol="label",
                        featuresCol="features",
                        maxIter=10,
                        regParam=0.01)

# process pipeline with the series of transforms - 4 transforms
pipeline = Pipeline(stages=[tokenizer, swr, hashTF, lr])

# COMMAND ----------

# MAGIC %md ### Run the Pipeline as an Estimator
# MAGIC The pipeline itself is an estimator, so it has a **fit** method that can be called to run the pipeline on a specified DataFrame. In this case, we run the pipeline on the training data to train a model.

# COMMAND ----------
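# A minimal sketch of the fit step described in the cell above, assuming the training
# DataFrame is named `train` and has "text" and "label" columns (the name is not shown
# in this excerpt):
pipelineModel = pipeline.fit(train)
# The fitted PipelineModel is itself a transformer and can then score new data, e.g.
# predictions = pipelineModel.transform(test)
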
Example #6
spark = SparkSession.builder.appName("TfIdf-Ngram").getOrCreate()
documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id",
                                 F.row_number().over(Window.orderBy('value')))

documents.printSchema()
# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordsData)

# applying tf on the words data
hashingTF = HashingTF(inputCol="ngrams",
                      outputCol="rawFeatures",
                      numFeatures=20)
featurizedData = hashingTF.transform(ngramDataFrame)
# alternatively, CountVectorizer can also be used to get term frequency vectors
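# A hedged sketch of the CountVectorizer alternative mentioned above (the vocabSize and
# minDF values are illustrative, not from the original):
# from pyspark.ml.feature import CountVectorizer
# cv = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures", vocabSize=20, minDF=1.0)
# featurizedData = cv.fit(ngramDataFrame).transform(ngramDataFrame)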

# calculating the IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# displaying the results
rescaledData.select("doc_id", "features").show(truncate=False)
# closing the spark session
spark.stop()
Example #7
def getvalue3():
    if request.method == 'POST':
        subreddit_input = request.form['subreddit']
        #subreddit_input = 'World Politics'
        subreddit_filter = requests.get(
            url + 'reddit_post.json?orderBy="subreddit"&equalTo="' +
            str(subreddit_input) + '"')
        subreddits = json.loads(subreddit_filter.text)
        results = []
        for x in subreddits:
            try:
                results.append(subreddits[x])
            except KeyError:
                continue
        data = pd.DataFrame.from_dict(results, orient='columns')
        data1 = spark.createDataFrame(pd.DataFrame(data["title"]))
        data1.show(truncate=False)
        clean_data_udf = udf(clean_data, StringType())
        data1 = data1.withColumn("new_title", clean_data_udf("title"))
        data1.show()
        tokenizer = Tokenizer(inputCol="new_title", outputCol="words")
        data1 = tokenizer.transform(data1)
        data1.show()
        remover = StopWordsRemover(inputCol="words", outputCol="rm_words")
        data1 = remover.transform(data1)
        data1.show()
        hashingTF = HashingTF(inputCol="rm_words",
                              outputCol="rawFeatures",
                              numFeatures=2000)
        data1 = hashingTF.transform(data1)
        data1.show()
        data1.select("rm_words").show(truncate=False)
        data1.select("rawFeatures").show(truncate=False)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(data1)
        data1 = idfModel.transform(data1)
        data1.select("features").show(truncate=False)
        kmeans = KMeans(k=2, featuresCol="features").setSeed(1)
        kmeans_model = kmeans.fit(data1)
        data1 = kmeans_model.transform(data1)
        data1.select("prediction").show(50)
        data["prediction"] = data1.select("prediction").toPandas()
        print(data["prediction"].value_counts())

        #topic_generator(subreddit_input)
        topic1 = data[data['prediction'] == 0]['title'].reset_index(drop=True)
        topic2 = data[data['prediction'] == 1]['title'].reset_index(drop=True)
        topic1_1 = topic1[0]
        topic1_2 = topic1[1]
        topic1_3 = topic1[2]
        topic1_4 = topic1[3]
        topic1_5 = topic1[4]
        topic2_1 = topic2[0]
        topic2_2 = topic2[1]
        topic2_3 = topic2[2]
        topic2_4 = topic2[3]
        topic2_5 = topic2[4]
        return render_template('title_topic_update.html',
                               topic1_1=topic1_1,
                               topic1_2=topic1_2,
                               topic1_3=topic1_3,
                               topic1_4=topic1_4,
                               topic1_5=topic1_5,
                               topic2_1=topic2_1,
                               topic2_2=topic2_2,
                               topic2_3=topic2_3,
                               topic2_4=topic2_4,
                               topic2_5=topic2_5)
    else:
        return render_template('title_topic.html')
    # group the words into n-word chunks (n-grams)
    # NGram is a feature transformer that converts the input array of strings into an
    # array of n-grams. Null values in the input array are ignored. It returns an array
    # of n-grams where each n-gram is represented by a space-separated string of words.
    # When the input is empty, an empty array is returned. When the input array length
    # is less than n (number of elements per n-gram), no n-grams are returned.
    ngram_df = NGram(n=2, inputCol="words",
                     outputCol="ngrams").transform(words)

    ngram_df.show(truncate=False)
    ngram_df.select("ngrams").show(truncate=False)

    # TF-IDF is a numerical statistic that is intended to reflect how important a word
    # is to a document in a collection or corpus. It is often used as a weighting factor
    # in searches of information retrieval, text mining, and user modeling.
    df = words.select("words")
    df.show(truncate=False)

    # Hashing TF is TF with hashing enabled to allow the feature vector to be a set value
    df_tf = HashingTF(
        inputCol="words",
        outputCol="hashing_tf",
        numFeatures=15  # the default is 262,144 features
    ).transform(df)

    df_tf.show()
    df_tf.select("words").show(truncate=False)
    df_tf.select("hashing_tf").show(truncate=False)
    # the first list holds the term indices, the second the term counts

    # IDF
    df_tf_idf = IDF(inputCol="hashing_tf",
                    outputCol="tf_idf").fit(df_tf).transform(df_tf)

    df_tf_idf.show()
    df_tf_idf.select("words").show(truncate=False)
    df_tf_idf.select("hashing_tf").show(truncate=False)  # Hashing TF
Example #9
    def test_save_load_pipeline_estimator(self):
        temp_path = tempfile.mkdtemp()
        training = self.spark.createDataFrame([
            (0, "a b c d e spark", 1.0),
            (1, "b d", 0.0),
            (2, "spark f g h", 1.0),
            (3, "hadoop mapreduce", 0.0),
            (4, "b spark who", 1.0),
            (5, "g d a y", 0.0),
            (6, "spark fly", 1.0),
            (7, "was mapreduce", 0.0),
        ], ["id", "text", "label"])

        # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
        tokenizer = Tokenizer(inputCol="text", outputCol="words")
        hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                              outputCol="features")

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(5)
        lr2 = LogisticRegression().setMaxIter(10)

        pipeline = Pipeline(stages=[tokenizer, hashingTF, ova])

        paramGrid = ParamGridBuilder() \
            .addGrid(hashingTF.numFeatures, [10, 100]) \
            .addGrid(ova.classifier, [lr1, lr2]) \
            .build()

        tvs = TrainValidationSplit(
            estimator=pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assert_param_maps_equal(loadedTvs.getEstimatorParamMaps(),
                                     paramGrid)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel = tvs.fit(training)

        # test save/load of CrossValidatorModel
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
        self.assertEqual(len(loadedModel.bestModel.stages),
                         len(tvsModel.bestModel.stages))
        for loadedStage, originalStage in zip(loadedModel.bestModel.stages,
                                              tvsModel.bestModel.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)

        # Test nested pipeline
        nested_pipeline = Pipeline(
            stages=[tokenizer, Pipeline(stages=[hashingTF, ova])])
        tvs2 = TrainValidationSplit(
            estimator=nested_pipeline,
            estimatorParamMaps=paramGrid,
            evaluator=MulticlassClassificationEvaluator())
        tvs2Path = temp_path + "/tvs2"
        tvs2.save(tvs2Path)
        loadedTvs2 = TrainValidationSplit.load(tvs2Path)
        self.assert_param_maps_equal(loadedTvs2.getEstimatorParamMaps(),
                                     paramGrid)
        self.assertEqual(loadedTvs2.getEstimator().uid,
                         tvs2.getEstimator().uid)

        # Run train validation split, and choose the best set of parameters.
        tvsModel2 = tvs2.fit(training)
        # test save/load of CrossValidatorModel
        tvsModelPath2 = temp_path + "/tvsModel2"
        tvsModel2.save(tvsModelPath2)
        loadedModel2 = TrainValidationSplitModel.load(tvsModelPath2)
        self.assertEqual(loadedModel2.bestModel.uid, tvsModel2.bestModel.uid)
        loaded_nested_pipeline_model = loadedModel2.bestModel.stages[1]
        original_nested_pipeline_model = tvsModel2.bestModel.stages[1]
        self.assertEqual(loaded_nested_pipeline_model.uid,
                         original_nested_pipeline_model.uid)
        self.assertEqual(len(loaded_nested_pipeline_model.stages),
                         len(original_nested_pipeline_model.stages))
        for loadedStage, originalStage in zip(
                loaded_nested_pipeline_model.stages,
                original_nested_pipeline_model.stages):
            self.assertEqual(loadedStage.uid, originalStage.uid)
Example #10
train_df.fillna('', inplace = True)
train_df = spark.createDataFrame(train_df)
test_df = pd.read_csv('test.csv')
test_df.fillna('', inplace = True)
test_df = spark.createDataFrame(test_df)


out_cols = [i for i in train_df.columns if i not in ['id', 'comment_text']]
#
train_df.filter(F.col('toxic') == 1).show(5)

#Tokenizer:
tokenizer = Tokenizer(inputCol = 'comment_text', outputCol = 'words')
words_data = tokenizer.transform(train_df)
#
hashing_tf = HashingTF(inputCol = 'words', outputCol = 'rawFeatures')
tf = hashing_tf.transform(words_data)

tf.select('rawFeatures').take(5)

tf.count(), len(tf.columns)

idf = IDF(inputCol = 'rawFeatures', outputCol = 'features')
idfModel = idf.fit(tf)
tf_idf = idfModel.transform(tf)

####
#Performing the logistic regression:
REG = 0.01
lr = LogisticRegression(featuresCol = 'features', labelCol = 'toxic',
	regParam = REG)
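
# The fit/score steps are not shown in this excerpt; a minimal sketch, assuming the same
# tokenizer/HashingTF/IDF chain above is also applied to test_df before scoring:
lr_model = lr.fit(tf_idf)
# test_tf_idf = idfModel.transform(hashing_tf.transform(tokenizer.transform(test_df)))
# lr_model.transform(test_tf_idf).select('id', 'probability', 'prediction').show(5)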
Example #11
def main():

    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description',
                              outputCol='words_all',
                              pattern=r'\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get words frequency using simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=1000,
                                    minDF=2)

    # get tf-idf words frequencies
    add_wordtf = HashingTF(inputCol='words_clean',
                           outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf',
                      outputCol='words_tfidf',
                      minDocFreq=2)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])
    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    breakpoint()

    # split to training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models

    logistic_wordcount = LogisticRegression(regParam=0.3,
                                            elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')

    logistic_tfidf = LogisticRegression(regParam=0.3,
                                        elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount,
                         'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
Example #12
A quick reminder about these concepts:

The hashing trick provides a fast and space-efficient way to map a very large (possibly infinite) set of items (in this case, all words contained in the SMS messages) onto a smaller, finite number of values.
The TF-IDF matrix reflects how important a word is to each document. It takes into account not only the frequency of the word within each document but also the frequency of the word across all of the documents in the collection.
The tokenized SMS data are stored in sms in a column named words. You've cleaned up the handling of spaces in the data so that the tokenized text is neater.

Instructions
Import the StopWordsRemover, HashingTF and IDF classes.
Create a StopWordsRemover object (input column words, output column terms). Apply to sms.
Create a HashingTF object (input results from previous step, output column hash). Apply to wrangled.
Create an IDF object (input results from previous step, output column features). Apply to wrangled.
'''
SOLUTION

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)
Example #13
###############################################################################################
# Pipeline
###############################################################################################
# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                            outputCol="no_stops",
                            stopWords=swords)
# The cheaper way to do TF-IDF
# Creates a hash that contains the term frequency
# This means no pairs with the value 0 are stored
# It'll output: (number_of_features, {index: value, ...}) with no zero values
# If a value is 0, its index is skipped, so the stored indices can jump, e.g.
# 0, 1, 6, 8, ... etc., all based on the contents of the previous step
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")
# Performs the IDF part in TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="features1",
          minDocFreq=5)
# Appends the output of the Tokenizer-StopWords-HashingTF-IDF chain to the output of Vader
assembler = VectorAssembler(inputCols=["features1", "vader"],
                            outputCol="features")
# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)
# Creates pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set
# For NLP workloads it is customary to L2-normalize the feature vectors:
# that is apparently what works best.
from pyspark.ml.feature import Normalizer
normalizerUni = Normalizer(inputCol='words', outputCol='normWords', p=2.0)
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print "DataFrame(bi-gram): normalisé"
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, because there is only room to display
# the indices of the non-zero elements, not their values
# Move on to TF-IDF
# Obviously, by picking the right DataFrame among those above, these computations can be
# applied to any column (bigrams, with or without stop words, ...)
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)
# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step has been useful to me before; here it does not seem to matter much
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()
data,Y=lf.loadLabeled("./data/train")
labeledData = zip(data,[y.item() for y in Y])
labeledRdd = sc.parallelize(labeledData)
def cleanLower(doc):
    return doc.replace("<br /><br />"," ").lower()
rdd = labeledRdd.map(lambda doc : (cleanLower(doc[0]),doc[1]))

print "Text is cleaned"

sqlContext = SQLContext(sc)
dfTrain = sqlContext.createDataFrame(rdd, ['review', 'label'])

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])


model = pipeline.fit(dfTrain)

print "The model is fitted"
Example #16
# COMMAND ----------

# MAGIC %md
# MAGIC ### Prepare the Pipeline
# MAGIC For compatibility with Azure Model Management, make sure you are training the model on a cluster with Spark older than 2.3.0, since Model Management runs on Spark 2.1.1 and the Linear Regression model gained a new param (epsilon) in 2.3.0.

# COMMAND ----------

tkn = Tokenizer().setInputCol("abstract").setOutputCol("tokens")

englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover().setStopWords(englishStopWords).setInputCol(
    "tokens").setOutputCol("tokens_no_stop")

tf = HashingTF().setInputCol("tokens_no_stop").setOutputCol(
    "TFOut").setNumFeatures(1000)
idf = IDF().setInputCol("TFOut").setOutputCol("IDFOut").setMinDocFreq(1)
assem = VectorAssembler().setInputCols(["TFOut"]).setOutputCol("features")
rename = SQLTransformer().setStatement(
    "SELECT features, amt as label FROM __THIS__")
reg = LinearRegression()

pipe = Pipeline().setStages([tkn, stops, tf, idf, assem, rename, reg])

# COMMAND ----------

# MAGIC %md
# MAGIC ### Fit the Pipeline

# COMMAND ----------
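# The fit cell itself is not included in this excerpt; a minimal sketch, assuming the
# training DataFrame (with "abstract" and "amt" columns) is named `trainingData`:
pipeModel = pipe.fit(trainingData)
# predictions = pipeModel.transform(testData)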

# ## Learning pipeline

# In[8]:

from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import LogisticRegression

tokenizer = Tokenizer(inputCol='summary', outputCol='words')

pipeline = Pipeline(stages=[
    tokenizer, 
    StopWordsRemover(inputCol='words', outputCol='filtered_words'),
    HashingTF(inputCol='filtered_words', outputCol='rawFeatures', numFeatures=120000),
    IDF(inputCol='rawFeatures', outputCol='features'),
    LogisticRegression(regParam=.3, elasticNetParam=.01)
])


# ## Testing the model accuracy

# In[9]:

model = pipeline.fit(train_reviews)


# In[10]:

from pyspark.ml.evaluation import BinaryClassificationEvaluator
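
# The evaluation cell is cut off in this excerpt; a minimal sketch, assuming a held-out
# DataFrame named `test_reviews` with the same schema as `train_reviews`:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(model.transform(test_reviews)))
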
Example #18
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import sys
import string

spark = SparkSession.builder\
        .appName("datasetTraining")\
        .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

dataset = spark.read.csv('/bherr006/datasetTraining/training.1600000.processed.noemoticon.csv', header=False, inferSchema=True)

(trainSet, valSet, testSet) = dataset.randomSplit([0.98, 0.01, 0.01], seed = 2000)

tokenizer = Tokenizer(inputCol="_c5", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol="tf")
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)
labelStringIndex = StringIndexer(inputCol="_c0", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer,hashtf,idf,labelStringIndex])

pipelineFit = pipeline.fit(trainSet)
trainDf = pipelineFit.transform(trainSet)
valDf = pipelineFit.transform(valSet)

lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(trainDf)
predictions = lrModel.transform(valDf)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))

Example #19
# tokenizer to create a "terms" column so for example:
# from content=u'We start learning Spark'  we have terms=[u'we', u'start', u'learning', u'spark']
tokenizer = Tokenizer(inputCol="content", outputCol="terms")
termsData = tokenizer.transform(data)

# remover to remove stop words that don't contribute so for example
# from terms=[u'we', u'start', u'learning', u'spark'] we have filtered=[u'start', u'learning', u'spark']
remover = StopWordsRemover(inputCol="terms", outputCol="filtered")
filteredTermsData = remover.transform(termsData)

# http://spark.apache.org/docs/latest/ml-features.html
# Both HashingTF and CountVectorizer can be used to generate the term frequency vectors.
# HashingTF is a Transformer which takes sets of terms and converts those sets into fixed-length feature vectors. In text processing, a “set of terms” might be a bag of words. HashingTF utilizes the hashing trick.
# so from filtered=[u'start', u'learning', u'spark'] we have rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0})
tf = HashingTF(inputCol="filtered",
               outputCol="rawFeatures").transform(filteredTermsData)

# IDF: IDF is an Estimator which is fit on a dataset and produces an IDFModel. The IDFModel takes feature vectors (generally created from HashingTF or
# CountVectorizer) and scales each column. Intuitively, it down-weights columns which appear frequently in a corpus.
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(tf)

# TF-IDF
tfidf = idf.transform(tf)

labels = data.map(lambda doc: doc["label"])  # standard Python dict access

# Training and Test datasets
# Here feature#5 contains the data for training, for example
# [Row(label=0.0, content=u'We start learning Spark', terms=[u'we', u'start', u'learning', u'spark'], filtered=[u'start', u'learning', u'spark'],
# rawFeatures=SparseVector(262144, {29470: 1.0, 62173: 1.0, 181346: 1.0}), features=SparseVector(262144, {29470: 0.9163, 62173: 0.9163, 181346: 0.9163}))]
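
# The split itself is not shown in this excerpt; a minimal sketch, assuming an 80/20
# random split of the TF-IDF output:
training, test = tfidf.randomSplit([0.8, 0.2], seed=42)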
Example #20

# In[14]:


# filter rows where n_killed > 2
notes_length_df = notes_length_df.filter(notes_length_df.label <= 2)


# In[15]:


# create features
tokenizer = Tokenizer(inputCol="notes", outputCol="token_notes")
stopremove = StopWordsRemover(inputCol='token_notes',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_notes", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


# In[16]:


# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'notes_length'], outputCol='features')


# In[17]:


# Create and run a data processing Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])
        docs = docs.union(
            next_docs.map(lambda doc_lines:
                          (format_text(doc_lines[1]), float(curr_cat))))
        curr_cat += 1

    training_rows = docs.sample(False, train_fraction)
    testing_rows = docs.subtract(training_rows)

    # Prepare training and test documents, which are labeled.
    LabeledDocument = Row("text", "label")
    train = training_rows.map(lambda x: LabeledDocument(*x)).toDF()
    test = testing_rows.map(lambda x: LabeledDocument(*x)).toDF()

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="rawFeatures")  #outputCol="features")
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    lr = LogisticRegression(maxIter=1000, regParam=0.001)
    #pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    p0 = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
    #m0 = p0.fit(train)
    #pipeline = Pipeline(stages=[m0, lr])
    pipeline = p0

    # Fit the pipeline to training documents.
    model = pipeline.fit(train)
    print('\n\n --------------- RESULT ----------------------\n\n')
    print(model.transform(test).head())
    print('\n\n ---------------------------------------------\n\n')
Example #22
# load data
df0 = spark.read.csv("./jobs_clean.csv", header=True, multiLine=True, inferSchema=True)
df1 = pd.read_csv('./jobs_clean.csv')
#df0.show()
print('The number of jobs:',df0.count())
print('\nthe distinct jobs name: ', df1.job.unique())
print('\nThere are', len(df1.job.unique())-1, 'different kinds of jobs in the table.')

# split the desc field
tokenizer = Tokenizer(inputCol='desc_clean', outputCol='desc_words')
df = tokenizer.transform(df0)
#df.show()
#df.select('desc_words').show(10)

# compute TF-IDF
hashingTF = HashingTF(inputCol='desc_words', outputCol='desc_words_tf')
tf = hashingTF.transform(df).cache()
idf = IDF(inputCol='desc_words_tf', outputCol='desc_words_tfidf').fit(tf)
tfidf = idf.transform(tf).cache()
#print('tfidf for each job:', tfidf.select('desc_words_tfidf').show(10,truncate=False))

# data normalization
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="desc_words_tfidf", outputCol="norm")
tfidf = normalizer.transform(tfidf)
#tfidf.select("id", "norm").show(6)

# compute similarity between jobs and resume
import pyspark.sql.functions as psf 
from pyspark.sql.types import DoubleType
print('\nCompute the similarity between jobs and resume...')
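
# The similarity computation itself is not included in this excerpt; a minimal sketch of
# a dot-product UDF over the L2-normalized vectors (i.e. cosine similarity), assuming the
# resume has been pushed through the same tokenizer/TF-IDF/normalizer chain and its
# normalized vector is available in a column named "resume_norm":
dot_udf = psf.udf(lambda a, b: float(a.dot(b)), DoubleType())
# similarity = tfidf.crossJoin(resume_df.select(psf.col("norm").alias("resume_norm"))) \
#                   .withColumn("similarity", dot_udf("norm", "resume_norm"))
# similarity.select("job", "similarity").orderBy("similarity", ascending=False).show(10)
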
def news_classifier():

    data = spark.read.option("mode", "DROPMALFORMED").load("/news_data.csv", format="csv", header="true", inferSchema='true')

    data.first()
    data.printSchema()

    #There is a field in the data called constituent_id, which is basically the company which the news headline is about. We want to drop that column from our data.
    drop_list = ['constituent_id']

    data = data.select([column for column in data.columns if column not in drop_list])

    data.show(5)

    data.printSchema()

    # regular expression tokenizer
    regexTokenizer = RegexTokenizer(inputCol="news_title", outputCol="words", pattern="\\W")

    # remove stop words
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

    #compute bigrams
    ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")

    # Add HashingTF and IDF to transformation
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=10000)
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms

    #string indexer
    label_stringIdx = StringIndexer(inputCol = "weekly_returns", outputCol = "label")

    #create processing pipeline
    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, ngram, hashingTF, idf, label_stringIdx])

    # Fit the pipeline to training data.
    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)

    dataset.show(5)

    # Randomly split data into training and test sets. set seed for reproducibility
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    # Build a Logistic Regression model
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0, family="multinomial")

    # Train model with Training Data
    lrModel = lr.fit(trainingData)

    predictions = lrModel.transform(testData)

    predictions.filter(predictions['prediction'] == 0) \
        .select("news_title","weekly_returns","probability","label","prediction") \
        .orderBy("probability", ascending=False) \
        .show(n = 10, truncate = 30)

    #multiclass evaluator
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
    print(evaluator.evaluate(predictions))

    #save predictions to csv

    predictions = predictions.select("news_title", "weekly_returns", "prediction")
    predictions.write.format("csv").save("/Desktop/predictions-spark.csv")

    #save machine learning model
    model_path = "/Desktop/Spark_Model"
    lrModel.save(model_path)

    #load model again, to make sure it works
    ml_model = lrModel.load(model_path)
    predictions2 = ml_model.transform(testData)

    #make predictions with loaded model
    predictions2.filter(predictions2['prediction'] == 0) \
        .select("news_title","weekly_returns","probability","label","prediction") \
        .orderBy("probability", ascending=False) \
        .show(n = 10, truncate = 30)

    #end spark session
    spark.stop()
        .master("local") \
        .appName("Compare Multiclass Models") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
numfeatures=2000
numclasses = 19
# Load news category data
raw_data = sc.textFile("data/news_sections_abstract2016.txt")
lines = raw_data.map(lambda line: line.split("  ")).map(lambda line: (line[0]," ".join(line[1:])))
sentenceData = spark.createDataFrame(lines,["label", "sentence"])

# Map sentence data to hashingTF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=numfeatures)
featurizedData = hashingTF.transform(wordsData)
#featurizedData.show()

# Map string labels to integer
df = featurizedData.select('label','features')
data0 = df.replace(['World','Sports','Fashion & Style','Books','Music', \
            'Television','Movies','Technology','Science','Food','Real Estate','Theater', \
            'Health','Travel','Education','Your Money','Politics','Economy','Art & Design'] \
            ,['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','0'],'label')

category = ['Art & Design','World','Sports','Fashion & Style','Books','Music', \
            'Television','Movies','Technology','Science','Food','Real Estate','Theater', \
            'Health','Travel','Education','Your Money','Politics','Economy']
dictionary = {'Art & Design':0,'World':1,'Sports':2,'Fashion & Style':3,'Books':4,'Music':5, \
            'Television':6,'Movies':7,'Technology':8,'Science':9,'Food':10,'Real Estate':11,'Theater':12, \
Example #25
from environment import spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I I I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=100)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show(truncate=False)
def main():
    time_start = time.time()
    data = 'train_review.json'  # sys.argv[1]
    sc = elly_func.start_spark('Final_Project')

    # total pairs = 1029758
    textRDD = sc.textFile(data).map(
        elly_func.tojson).map(lambda x: ((x['user_id'], x['business_id']), x[
            'text'])).reduceByKey(lambda a, b: a + b).mapValues(
                remove_blank).map(lambda x: (x[0][0], x[0][1], x[1]))

    # Create DataFrame
    tableA = spark_session(textRDD).createDataFrame(
        textRDD, ['user_id', 'business_id', 'text'])

    # Remove stopwords
    remover = StopWordsRemover(inputCol="text", outputCol="filtered")

    # the raw text column is no longer needed; drop it
    df = remover.transform(tableA).drop('text')

    # cast the filtered words to a string, the form the Tokenizer expects
    test = df.withColumn("sentence", df["filtered"].cast("string"))

    # tokenize the sentence into useful words
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(test)

    # TF-IDF: hash the terms into 200 features
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=200)
    featurizedData = hashingTF.transform(wordsData)

    # vectorize the terms with IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # user/business-to-term relations, sorted by id
    user_profile = rescaledData.select('user_id', 'business_id',
                                       'features').orderBy("user_id")
    business_profile = rescaledData.select('business_id',
                                           'features').orderBy("business_id")

    def set2list(x):
        temp = []
        for i in x:
            temp.append(i)
        return temp

    # build dictionaries of the user-to-term and business-to-term relations

    # 91730
    # Total number of [('business_id', ['case','eat',...]),...]
    business_dic = business_profile.rdd.map(lambda x: (x[0], list(x[
        1].indices))).reduceByKey(lambda a, b: a + b).mapValues(
            lambda x: set2list(set(x))).collectAsMap()

    # 13167
    user_dic = user_profile.rdd.map(lambda x: (x[0], list(x[
        2].indices))).reduceByKey(lambda a, b: a + b).mapValues(
            lambda x: set2list(set(x))).collectAsMap()
    # user_profile_dic = {'user1': [word2, word8, word24,....],.....}
    user_bus = sc.textFile(data).map(
        elly_func.tojson).map(lambda x: ((x['user_id'], x['business_id']))
                              ).reduceByKey(lambda a, b: a + b).collectAsMap()

    #   still to do: compute the cosine similarity
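    # A minimal sketch of the missing step, computing cosine similarity over the binary
    # term-index sets collected in user_dic / business_dic above:
    def cosine_sim(user_terms, business_terms):
        a, b = set(user_terms), set(business_terms)
        if not a or not b:
            return 0.0
        # dot product of two 0/1 vectors is the intersection size; norms are sqrt(set sizes)
        return len(a & b) / ((len(a) ** 0.5) * (len(b) ** 0.5))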
    time_end = time.time()
    print('Duration:', time_end - time_start)
Example #27
def test_gen_estimator_metadata(spark_session):  # pylint: disable=unused-argument
    tokenizer1 = Tokenizer(inputCol="text1", outputCol="words1")
    hashingTF1 = HashingTF(inputCol=tokenizer1.getOutputCol(),
                           outputCol="features1")

    tokenizer2 = Tokenizer(inputCol="text2", outputCol="words2")
    hashingTF2 = HashingTF(inputCol=tokenizer2.getOutputCol(),
                           outputCol="features2")

    vecAssembler = VectorAssembler(inputCols=["features1", "features2"],
                                   outputCol="features")

    lor = LogisticRegression(maxIter=10)
    ova = OneVsRest(classifier=lor)
    sub_pipeline1 = Pipeline(stages=[tokenizer1, hashingTF1])
    sub_pipeline2 = Pipeline(stages=[tokenizer2, hashingTF2])
    sub_pipeline3 = Pipeline(stages=[vecAssembler, ova])

    paramGrid = (ParamGridBuilder().addGrid(lor.maxIter, [10, 20]).addGrid(
        lor.regParam, [0.1, 0.01]).build())
    eva = MulticlassClassificationEvaluator()
    crossval = CrossValidator(estimator=sub_pipeline3,
                              estimatorParamMaps=paramGrid,
                              evaluator=eva,
                              numFolds=2)

    top_pipeline = Pipeline(stages=[sub_pipeline1, sub_pipeline2, crossval])

    metadata = _gen_estimator_metadata(top_pipeline)

    expected_hierarchy = {
        "name":
        "Pipeline_1",
        "stages": [
            {
                "name": "Pipeline_2",
                "stages": [{
                    "name": "Tokenizer_1"
                }, {
                    "name": "HashingTF_1"
                }]
            },
            {
                "name": "Pipeline_3",
                "stages": [{
                    "name": "Tokenizer_2"
                }, {
                    "name": "HashingTF_2"
                }]
            },
            {
                "name": "CrossValidator",
                "evaluator": {
                    "name": "MulticlassClassificationEvaluator"
                },
                "tuned_estimator": {
                    "name":
                    "Pipeline_4",
                    "stages": [
                        {
                            "name": "VectorAssembler"
                        },
                        {
                            "name": "OneVsRest",
                            "classifier": {
                                "name": "LogisticRegression"
                            }
                        },
                    ],
                },
            },
        ],
    }
    assert metadata.hierarchy == expected_hierarchy
    assert metadata.uid_to_indexed_name_map == {
        top_pipeline.uid: "Pipeline_1",
        sub_pipeline1.uid: "Pipeline_2",
        tokenizer1.uid: "Tokenizer_1",
        hashingTF1.uid: "HashingTF_1",
        sub_pipeline2.uid: "Pipeline_3",
        tokenizer2.uid: "Tokenizer_2",
        hashingTF2.uid: "HashingTF_2",
        crossval.uid: "CrossValidator",
        sub_pipeline3.uid: "Pipeline_4",
        vecAssembler.uid: "VectorAssembler",
        ova.uid: "OneVsRest",
        lor.uid: "LogisticRegression",
        eva.uid: "MulticlassClassificationEvaluator",
    }
    assert (metadata.uid_to_indexed_name_map[
        metadata.param_search_estimators[0].uid] == "CrossValidator")
train = train.withColumn("comment_text", stemmer_udf("comment_text"))

def check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate):
    if (toxic + severe_toxic + obscene + threat + insult + identity_hate) > 0:
        return 0
    else:
        return 1


mergeCols = udf(lambda toxic, severe_toxic, obscene, threat, insult, identity_hate: check_clean(toxic, severe_toxic, obscene, threat, insult, identity_hate), IntegerType())

train = train.withColumn("clean", mergeCols(train["toxic"], train["severe_toxic"], train["obscene"], train["threat"], train["insult"], train["identity_hate"]))

tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
remover= StopWordsRemover().setInputCol("words").setOutputCol("filtered").setCaseSensitive(False)
hashingTF = HashingTF().setNumFeatures(1000).setInputCol("filtered").setOutputCol("rawFeatures")
idf = IDF().setInputCol("rawFeatures").setOutputCol("features").setMinDocFreq(0)
nb = NaiveBayes(labelCol="label", featuresCol="features")
pipeline=Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

train = train.drop('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
train = train.withColumnRenamed("clean", "label")

training_spark_df_binary, testing_spark_df_binary = train.randomSplit([0.8, 0.2], seed = 2018)



paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures,[1000]) \
    .addGrid(nb.smoothing, [1]) \
    .build()
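
# The tuning step itself is not shown in this excerpt; a minimal sketch of how the grid
# above is typically consumed (the evaluator and fold count are illustrative assumptions):
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cvModel = cv.fit(training_spark_df_binary)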
Example #29
conf = SparkConf()
conf.setAppName( "part2_uni" )
conf.set("spark.executor.memory", "2g")
sc = SparkContext(conf = conf)

#reading input
lines =sc.wholeTextFiles("/cosc6339_s17/books-longlist/")
#configuring SparkSession
spark=SparkSession(sc)
hasattr(lines, "toDF")

#tokenizing the words and converting into dataframes
tokenize=lines.map(part2).toDF(["bookname", "words"])

#converting into unigrams
unigram = NGram(n=1, inputCol = "words", outputCol = "unigrams")
unigramdataframe = unigram.transform(tokenize)

#finding the tf value
hashingTF = HashingTF(inputCol = "unigrams", outputCol = "unigram-tf")
tf = hashingTF.transform(unigramdataframe)

#finding the idf value
idf = IDF(inputCol = "unigram-tf", outputCol = "unigram-tf-idf")
idfModel = idf.fit(tf)
tfidfignore = idfModel.transform(tf)

#saving the output
tfidfignore.rdd.saveAsTextFile("/bigd12/output2_1")

Example #30
    test.cache()

    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern="\\w+",
                                    inputCol="name",
                                    outputCol="name_parts",
                                    toLowercase=True)

    stopWords = ["mr", "mrs", "miss", "master", "jr", "j", "c", "d"]

    remover = StopWordsRemover(inputCol="name_parts",
                               outputCol="filtered_name_parts",
                               stopWords=stopWords)

    hashingTF = HashingTF(numFeatures=1000,
                          inputCol="filtered_name_parts",
                          outputCol="text_features")

    sexIndexer = StringIndexer(inputCol="sex",
                               outputCol="sexIndexed",
                               handleInvalid="keep")

    embarkedIndexer = StringIndexer(inputCol="embarked",
                                    outputCol="embarkedIndexed",
                                    handleInvalid="keep")

    imputer = Imputer(strategy="mean",
                      inputCols=[
                          "pclass", "sibsp", "parch", "sexIndexed",
                          "embarkedIndexed", "age", "fare"
                      ],