Example No. 1
def getModel():
    sc = getSparkContext()
    sqlContext = SQLContext(sc)
    data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true'
    ).load(
        'C:/Users/kgasw/PycharmProjects/bigdata_project/twitter_traffic_data_static.csv'
    )
    regexTokenizer = RegexTokenizer(inputCol="text",
                                    outputCol="words",
                                    pattern="\\W")
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
    countVectors = CountVectorizer(inputCol="filtered",
                                   outputCol="features",
                                   vocabSize=10000,
                                   minDF=5)
    label_stringIdx = StringIndexer(inputCol="class", outputCol="label")
    pipeline = Pipeline(stages=[
        regexTokenizer, stopwordsRemover, countVectors, label_stringIdx
    ])
    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)
    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))
    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lrModel = lr.fit(trainingData)
    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 1) \
        .select("text","class","probability","label","prediction") \
        .orderBy("probability", ascending=False) \
        .show(n = 10, truncate = 30)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    print(evaluator.evaluate(predictions))
    return lrModel, pipelineFit
Example No. 2
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    # Cluster the word vectors with k-means
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # Broadcast operation after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # Cluster vector is in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))


    cluster_frequency_feature_Rdd = clusterVectorRdd.map(partial(cluster_frequency_vector, cluster_count=cluster_count))

    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda pair: Row(*pair)).toDF()  # tuple-unpacking lambdas are invalid in Python 3
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"), cluster_freqDF._2.alias("label"))

    return cluster_freq_featureDF
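The helpers word_to_cluster and cluster_frequency_vector are not shown in this example. A hedged sketch of what they could look like, inferred from how they are used above; the original implementations may differ:

from pyspark.ml.linalg import Vectors

def word_to_cluster(row, vocab_dict):
    # Hypothetical reconstruction: map each filtered word to the k-means cluster id
    # it was assigned to. vocab_dict is the broadcast {word: cluster_id} dict built above.
    words, label = row
    clusters = [vocab_dict.value[w] for w in words if w in vocab_dict.value]
    return (clusters, label)

def cluster_frequency_vector(row, cluster_count):
    # Hypothetical reconstruction: turn the list of cluster ids into a normalized
    # frequency vector of length cluster_count.
    clusters, label = row
    counts = [0.0] * cluster_count
    for c in clusters:
        counts[c] += 1.0
    total = float(len(clusters)) or 1.0
    return (Vectors.dense([v / total for v in counts]), label)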
Example No. 3
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection",
                          inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1',
                                                                    'text')
    data.show()
    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text',
                                  outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'],
                               outputCol='features')
    nb = NaiveBayes()
    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove,
                                      count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()
    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()
    test_results = spam_predictor.transform(testing)
    test_results.show()
    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
Example No. 4
  def remove_stop_words(self, df1, input_col, output_col='filtered'):
    "Remove stop words -> https://spark.apache.org/docs/2.2.0/ml-features.html#stopwordsremover"
    from pyspark.ml.feature import StopWordsRemover
    remover = StopWordsRemover(inputCol=input_col, outputCol=output_col)
    df2 = remover.transform(df1)

    return df2
Example No. 5
    def test_stop_words_remover(self):
        data = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
        model = StopWordsRemover(inputCol="text",
                                 outputCol="words",
                                 stopWords=["b"])

        feature_count = len(data.columns)
        model_onnx = convert_sparkml(
            model, 'Sparkml StopWordsRemover',
            [('text', StringTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().words.values
        data_np = data.toPandas().text.values
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStopWordsRemover")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example No. 6
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
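A minimal usage sketch for the pipeline above, assuming a DataFrame df with a string "text" column and a numeric "label" column:

pipeline = preprocess(n=3)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)   # df is assumed to exist
model = pipeline.fit(train_df)
model.transform(test_df).select("text", "label", "prediction").show(5)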
Example No. 7
def trainData():
    #rdd = sc.parallelize(rdd)
    #rdd.foreach(print)
    #rdd = sc.textFile("/ccga/SentimentAnalysisDataset.csv")
    '''#################################################TRAINING DATA SET#################################################'''
    rddTrain = sc.textFile("/ccga/set100k.csv")
    r = rddTrain.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    spark = getSparkSessionInstance(rddTrain.context.getConf())
    partsDF = spark.createDataFrame(parts)
    #partsDF.show(truncate=False)
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show(truncate=False)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show(truncate=False)
    train_data_raw = base_words.select("base_words", "label")
    #train_data_raw.show(truncate=False)
    #base_words = train_data_raw.select("base_words")
    #base_words_rdd = base_words.rdd
    #print(base_words_rdd.collect())
    #base_words_map = base_words_rdd.flatMap(lambda x: x[0])
    #base_words_rdd.collect()
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data = model.transform(train_data_raw)
    #final_train_data.show()
    final_train_data = final_train_data.select("label", "features")
    #final_train_data.show(truncate=False)
    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    trained = lrModel.transform(final_train_data)
    return lrModel
    '''#################################################TRAINING DATA SET#################################################'''
Example No. 8
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")

    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")
    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
Example No. 9
def filter_stop_words(df):
    stop_words_nltk = stopwords.words('english')
    remover = StopWordsRemover(inputCol='tokens_noise',
                               outputCol='tokens',
                               stopWords=stop_words_nltk)

    return remover.transform(df)
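This depends on the NLTK stop word corpus being available locally; a small setup sketch, assuming NLTK is installed:

import nltk
nltk.download("stopwords")           # one-time download of the stop word corpus
from nltk.corpus import stopwords    # the import used by filter_stop_words above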
Example No. 10
    def df_to_words(logger, df: DataFrame, input_col: str, output_col: str = "words", pattern: str = "\\W+",
                    to_lowercase: bool = True,
                    case_sensitive: bool = False) -> DataFrame:
        """
        Take each string in a column and parse it to a list of words via Tokenization and remove stop words.
        Args:
            logger: Logger instance used to log events
            df: Dataframe used
            input_col: Selected input column name
            output_col: Output column name
            pattern: The regex pattern used to tokenize the text
            to_lowercase: Whether tokens should be converted to lowercase
            case_sensitive: Whether stop word matching should be case sensitive

        Returns: The modified dataframe

        """
        try:
            intermediate_output = output_col + "intermediate"
            regex_tokenizer = RegexTokenizer(inputCol=input_col, outputCol=intermediate_output, pattern=pattern,
                                             toLowercase=to_lowercase)
            remover = StopWordsRemover(inputCol=intermediate_output, outputCol=output_col, caseSensitive=case_sensitive)
            logger.info("Parsing to words the dataframe")
            return remover.transform(regex_tokenizer.transform(df)).drop(intermediate_output)
        except Exception as e:
            logger.error("Parsing to words failed: {}".format(e), traceback.format_exc())
            raise e
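A hedged usage sketch for df_to_words, assuming it is a static method on a helper class (TextPrep is a placeholder name), with an active SparkSession spark and a configured logging.Logger:

# TextPrep is a placeholder for whatever class hosts df_to_words; spark and logger are assumed.
df = spark.createDataFrame([("Spark makes NLP preprocessing fairly easy!",)], ["review"])
words_df = TextPrep.df_to_words(logger, df, input_col="review", output_col="words")
words_df.show(truncate=False)   # e.g. [spark, makes, nlp, preprocessing, fairly, easy]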
Example No. 11
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0,
                         vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)
    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    pipelineModel = pipeline.fit(trainingdata)

    # Explain params for the fitted pipeline
    print(pipelineModel.explainParams())
    return pipelineModel
Example No. 12
def process(rdd):
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = spark.createDataFrame(rowRdd,["sentence"])
        wordsDataFrame = wordsDataFrame.select(f.lower(f.col("sentence")).alias("sentence"))
        wordsDataFrame = wordsDataFrame.select(
            f.regexp_replace(f.col("sentence"), r"(\d+)", "").alias(
                "sentence"))

        # return wordsDataFrame
        tokenizer = Tokenizer(inputCol="sentence", outputCol="words")  # the cleaned column is still named "sentence"
        tokenized = tokenizer.transform(wordsDataFrame)

        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        removed = remover.transform(tokenized)
        removed.select("filtered").show(20,False)
        # return removed



    except:
        pass
Example No. 13
def aggregate_spark(data, columns, args):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer

    input_data = data.withColumn(columns["col"],
                                 F.lower(F.col(columns["col"])))
    regexTokenizer = RegexTokenizer(inputCol=columns["col"],
                                    outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(input_data)  # use the lower-cased column, not the raw data

    remover = StopWordsRemover(inputCol="token_list",
                               outputCol="filtered_word_list")
    vocab_rows = (remover.transform(regexTokenized).select(
        F.explode(F.col("filtered_word_list")).alias("word")).groupBy(
            "word").count().orderBy(F.col("count").desc()).limit(
                args["vocab_size"]).select("word").collect())

    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {
        word: idx + len(args["reserved_indices"])
        for idx, word in enumerate(vocab)
    }

    return {**reverse_dict, **args["reserved_indices"]}
Example No. 14
def create_TFIDF_v0(trainData,
                    applyData,
                    inputCol="text",
                    outputCol="features",
                    minDocFreq=3,
                    numFeatures=20):
    tokenizer = RegexTokenizer(pattern=r"[.:\s]+",
                               inputCol=inputCol,
                               outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)

    remover = StopWordsRemover(inputCol="z_words",
                               outputCol="z_filtered",
                               stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)

    hashingTF = HashingTF(inputCol="z_filtered",
                          outputCol="z_rawFeatures",
                          numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures",
              outputCol=outputCol,
              minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)

    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures",
                             inputCol)
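As the comment above notes, CountVectorizer can stand in for HashingTF as the term-frequency step. A hedged sketch of that variant, written against the intermediate DataFrames used inside the function above (unlike HashingTF, CountVectorizer is an estimator and must be fit first):

from pyspark.ml.feature import CountVectorizer

# wordsDataFiltered1 / wordsDataFiltered2 are the filtered DataFrames from the function above.
cv = CountVectorizer(inputCol="z_filtered", outputCol="z_rawFeatures",
                     vocabSize=1 << 18, minDF=2)
cvModel = cv.fit(wordsDataFiltered1)                  # learn the vocabulary on the training split
featurizedData1 = cvModel.transform(wordsDataFiltered1)
featurizedData2 = cvModel.transform(wordsDataFiltered2)
# the IDF stage is then applied exactly as in the function above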
Example No. 15
def kmeans_from_csv(file="liveTweetsLocation.csv",
                    outfile="liveTweetsLocationKmeans.csv",
                    k=8):
    df = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load(file)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    kmeans = KMeans(k=k,
                    seed=1,
                    featuresCol='features',
                    maxIter=10,
                    initMode='random')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # "display" is Databricks-only; show() works on OSS Apache Spark
    # results.filter(results.prediction == 1).show(200,False)
    results.show()
    results.toPandas().to_csv(outfile)
    results.drop("location").drop("tokens").drop(
        "stopWordsRemovedTokens").drop("rawFeatures").drop(
            "features").toPandas().to_csv('prettyPrint.csv')
Example No. 16
def get_pipeline(vector_size=50, class_num=5, stopwords=None):
    '''
    Build the pipeline.
    This demo pipeline contains the following stages:
        1. labelIndexer: index the labels, converting strings to integers
        2. tokenizer: split each sentence into words
        3. remover: remove stop words
        4. word2vec: map the text to a low-dimensional vector with Word2Vec
        5. mpc: multilayer perceptron classifier
    '''
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    remover = StopWordsRemover(inputCol="raw_words",
                               outputCol="words",
                               stopWords=stopwords)
    word2vec = Word2Vec(vectorSize=vector_size,
                        minCount=2,
                        inputCol="words",
                        outputCol="vector")
    layers = [vector_size, (vector_size + class_num) // 2, class_num]  # layer sizes must be integers
    mpc = MultilayerPerceptronClassifier(maxIter=100,
                                         layers=layers,
                                         seed=1234,
                                         featuresCol="vector",
                                         labelCol="indexLabel")
    pipeline = Pipeline(
        stages=[labelIndexer, tokenizer, remover, word2vec, mpc])
    return pipeline
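A minimal usage sketch, assuming a DataFrame df with string "label" and "text" columns and that class_num matches the number of distinct labels:

# df is assumed to exist with "label" and "text" columns.
pipeline = get_pipeline(vector_size=50, class_num=5)
train_df, test_df = df.randomSplit([0.8, 0.2], seed=1234)
model = pipeline.fit(train_df)
model.transform(test_df).select("text", "indexLabel", "prediction").show(5)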
Example No. 17
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    gmm = GaussianMixture(k=8, featuresCol='features')
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # "display" is Databricks-only; show() works on OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv(
        'gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
Example No. 18
def trainNaiveBayesModel(data, directory=""):
    tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
    remover = StopWordsRemover().setInputCol("words").setOutputCol(
        "filtered").setCaseSensitive(False)
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(0)
    nb = NaiveBayes(labelCol="label", featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

    paramGrid = ParamGridBuilder()\
      .addGrid(hashingTF.numFeatures,[200, 500, 1000, 5000]) \
      .addGrid(nb.smoothing, [0.5, 1, 1.5, 2]) \
      .build()

    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=BinaryClassificationEvaluator().setMetricName(
            'areaUnderPR'
        ),  # set area Under precision-recall curve as the evaluation metric
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    cvModel = crossval.fit(data)
    modelName = directory + "NaiveBayesModel"
    cvModel.bestModel.write().overwrite().save(modelName)

    return modelName
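A hedged sketch of reloading the persisted best model and scoring new data; train_df and test_df are assumed to carry a "comment_text" column and a numeric "label" column:

from pyspark.ml import PipelineModel

# train_df and test_df are assumed to exist.
model_path = trainNaiveBayesModel(train_df, directory="models/")
best_model = PipelineModel.load(model_path)
best_model.transform(test_df).select("comment_text", "prediction").show(5)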
Example No. 19
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    
    comments.cache() 
   
    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
    regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words", minTokenLength=3, pattern=wordbreak)
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    regexTokenized = regexTokenizer.transform(comments)
    docs = regexTokenized.select("body", "words", "subreddit")

    docs.cache()

    #extra_stop_words = ["www","http","gt"]

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    docs = remover.transform(docs).withColumn("tokens", countTokens(col("filtered")))

    docs = docs.drop("body")
    docs = docs.drop("words") 

    docs.groupBy("subreddit").agg(functions.avg("tokens")).show()

    # threshold for post length
    lthresh = 60
    uthresh = 100
    docs = docs.filter(docs['tokens'] > lthresh)
    docs = docs.filter(docs['tokens'] < uthresh)


    docs.groupBy("subreddit").agg(functions.count("*")).show()


    #adds rank per subreddit type into a new column called rank
    ranked = docs.withColumn("rank", rank().over(Window.partitionBy("subreddit").orderBy(desc("tokens"))))

    #ranked.cache()

    group_size = 230
    #take group_size biggest docs from each group type
    ranked = ranked.filter(ranked['rank'] <= group_size)
    
    #convert arrays to columns so we can write csv
    for i in range(uthresh):
        ranked = ranked.withColumn('{0}'.format(i), ranked.filtered.getItem(i))

    #drop filtered so we can write to csv
    ranked = ranked.drop('filtered')
    ranked = ranked.drop('rank')
    ranked.show()

    ranked.write.csv(out_directory, mode='overwrite')
Example No. 20
    def featureExtract(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the Chinese stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
            # lr = LogisticRegression(maxIter=10, regParam=0.001)
            pipeline = Pipeline(stages=[remover, hashingTF, idf])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("filtered", "features", "idff")

        for row in selected.collect():
            filtered, features, idff = row
            self.logger.info("features: %s", features)
            self.logger.info("idff: %s", idff)
            self.logger.info("filtered: %s", filtered)  # Python 3: no decode/encode round-trip needed
        return selected
Example No. 21
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use spark, It is supposed to mock
    the following steps
        1. clean input data
        2. use a pre-trained model to make prediction 
        3. write predictions to a HDFS output

    Since this is meant as an example, we are going to skip building a model,
    instead we are naively going to mark reviews having the text "good" as positive and
    the rest as negative 
    """

    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)
    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token")
    df_tokens = tokenizer.transform(df_raw).select("cid", "review_token")

    # Remove stop words
    remover = StopWordsRemover(inputCol="review_token",
                               outputCol="review_clean")
    df_clean = remover.transform(df_tokens).select("cid", "review_clean")

    # function to check presence of good
    df_out = df_clean.select(
        "cid",
        array_contains(df_clean.review_clean, "good").alias("positive_review"))
    # parquet is a popular column storage format, we use it here
    df_out.write.mode("overwrite").parquet(output_loc)
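A hedged usage sketch; the paths are placeholders and spark is assumed to be the module-level SparkSession that the function above relies on:

from pyspark.sql import SparkSession

# placeholder paths; spark must be visible at module level for random_text_classifier
spark = SparkSession.builder.appName("random_text_classifier").getOrCreate()
random_text_classifier(input_loc="s3://some-bucket/reviews.csv",
                       output_loc="hdfs:///output/review_predictions")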
Example No. 22
    def featureExtractLr(self, trainDataframe, predictionDataframe):
        pipeline = None
        try:
            # pipeline = PipelineModel.load(ROOT_PATH+'/logistic')
            pipeline = Pipeline.load(ROOT_PATH + '/logistic')
        except Exception as e:
            print(e)
            self.logger.error(e)
        if pipeline is None:
            # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
            remover = StopWordsRemover(inputCol="keywords",
                                       outputCol="filtered")
            # set the Chinese stop words
            remover.setStopWords(self.cuttingMachine.chineseStopwords())
            hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                                  outputCol="features")
            lr = LogisticRegression(maxIter=10,
                                    regParam=0.001).setElasticNetParam(0.8)
            pipeline = Pipeline(stages=[remover, hashingTF, lr])
        model = pipeline.fit(trainDataframe)
        pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
        # model.write().overwrite().save(ROOT_PATH+'/logistic')
        resultDataframe = model.transform(predictionDataframe)
        resultDataframe.show()
        selected = resultDataframe.select("id", "features", "probability",
                                          "prediction")

        for row in selected.collect():
            rid, features, prob, prediction = row
            self.logger.info("features: %s", features)
            self.logger.info("prob: %s", str(prob))
            self.logger.info("prediction: %s", str(prediction))
Example No. 23
    def get_pd_keyword(self):

        df_spark = self.df_spark

        # Step 1. Text cleaning: strip punctuation and digits

        REGEX = '[_,?\\-.!?@#$%^&*+\/\d]'
        df_spark = df_spark.withColumn(
            "description_clean",
            regexp_replace(df_spark.description, REGEX, ' '))

        # Step 2. Tokenization
        # df_spark = df_spark.drop("description_token")

        tokenizer = Tokenizer(inputCol='description_clean',
                              outputCol='description_token')
        df_spark = tokenizer.transform(df_spark)

        # Lemmatization (WordNetLemmatizer, not a stemmer)
        # nltk.download('wordnet')
        lemmatizer = WordNetLemmatizer()

        def lemm_function(tokens):
            list_clean = []
            for item in tokens:
                list_clean.append(lemmatizer.lemmatize(item))

            return list_clean

        udf_lemm_function = F.udf(lemm_function, ArrayType(StringType()))

        df_spark = df_spark.withColumn(
            "description_lemm", udf_lemm_function(df_spark.description_token))

        # Step 3. Remove stop words

        stopwords_list = StopWordsRemover.loadDefaultStopWords("english")
        stopwords_customize_list = ["app", "apps"]
        stopwords_list = np.append(stopwords_list, stopwords_customize_list)

        stopwords = StopWordsRemover(inputCol="description_lemm",
                                     outputCol="description_no_stop",
                                     stopWords=stopwords_list)
        stopwords.getStopWords()
        df_spark = stopwords.transform(df_spark)

        df_pd_desc_final = df_spark.toPandas()

        # ### Note: IDF vector must be trained with large corpus, otherwise lose the advance of IDF

        # get the "description" column
        joinF = lambda x: " ".join(x)
        df_pd_desc_final["description_final"] = df_pd_desc_final[
            "description_no_stop"].apply(joinF)

        corpus_list = df_pd_desc_final["description_final"].tolist()

        df_pd_desc_final = get_tfidf(corpus_list, df_pd_desc_final, self.topn)

        return df_pd_desc_final
Example No. 24
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))
    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    #reviewed.show()
    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    #newFrame.show()
    hashing = HashingTF(inputCol="filtered",
                        outputCol="hashedValues",
                        numFeatures=pow(2, 10))
    # Transform in a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)
    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)
    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")

    # indexed = indexer.fit(rescaledData).transform(rescaledData)

    assembler = VectorAssembler(inputCols=["feature", "length"],
                                outputCol="features")

    return assembler.transform(rescaledData)
Example No. 25
def remove_stop_words(df):
    remover = StopWordsRemover(inputCol="rawTokens",
                               outputCol="tokens",
                               stopWords=stopwords.words("english"))
    df_tokens = remover.transform(df)

    return df_tokens
Example No. 26
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1],
                                                                     42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example No. 27
def fit_tfidf_pipeline(content_df):
    tokenizer = RegexTokenizer(). \
        setGaps(False). \
        setPattern('\\p{L}+'). \
        setInputCol('content'). \
        setOutputCol('words')

    sw = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")

    cv = CountVectorizer(). \
        setInputCol('filtered'). \
        setOutputCol('tf'). \
        setMinTF(1). \
        setMinDF(10). \
        setVocabSize(2 ** 17)

    # fit dataframe_df
    cv_transformer = Pipeline(stages=[tokenizer, sw, cv]).fit(content_df)

    idf = IDF(minDocFreq=10). \
        setInputCol('tf'). \
        setOutputCol('tfidf')

    tfidf_transformer = Pipeline(stages=[cv_transformer, idf]).fit(content_df)

    return tfidf_transformer
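A minimal usage sketch, assuming content_df has a string "content" column:

# content_df is assumed to exist.
tfidf_transformer = fit_tfidf_pipeline(content_df)
tfidf_df = tfidf_transformer.transform(content_df)
tfidf_df.select("content", "tfidf").show(5, truncate=80)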
Example No. 28
 def test_stopwordsremover(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input",
                                        outputCol="output")
     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertEqual(type(stopWordRemover.getStopWords()), list)
     self.assertTrue(
         isinstance(stopWordRemover.getStopWords()[0], str))
     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])
     # with language selection
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame(
         [Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
     # with locale
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])
Example No. 29
def clean_words(news):
    """
    Function to clean the data from the news
    :return: Dataframe containing the list of words in the text and the total number of times they occur
    """

    # just alphanumeric
    news_clean = news.where(news["NewsText"].isNotNull()).withColumn(
        'NewsTextClean', f.regexp_replace('NewsText', "[^0-9a-zA-Z ]", " "))

    # splits the new column
    news_clean_split = news_clean.select(
        f.split(news_clean["NewsTextClean"], " ").alias("NewsTextClean"))

    # removes stop words
    remover = StopWordsRemover(inputCol="NewsTextClean",
                               outputCol="NoStopWords")
    news_without_stopwords = remover.transform(news_clean_split)

    # splits words in rows and count occurrence of each of them
    news_without_stopwords_str = news_without_stopwords.withColumn(
        'NoStopWordsStr', f.concat_ws(' ', 'NoStopWords'))
    news_to_show = news_without_stopwords_str.withColumn('word', f.explode(f.col('NoStopWords'))) \
        .groupBy('word') \
        .count() \
        .sort('count', ascending=False) \
        .where('word <>""')

    return news_to_show
Example No. 30
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function that mocks the following steps:

        1. clean input data (tokenization, remove stop words)
        2. use a pre-trained model to make predictions
        3. write predictions to an HDFS output

    Naively marks reviews containing the text "good" as positive and the rest
    as negative.
    """

    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)
    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token',
                               outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # function to check presence of good and naively assume its a positive review
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))

    df_out.write.mode("overwrite").parquet(output_loc)