def getModel():
    sc = getSparkContext()
    sqlContext = SQLContext(sc)
    data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true'
    ).load(
        'C:/Users/kgasw/PycharmProjects/bigdata_project/twitter_traffic_data_static.csv'
    )

    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
    stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    label_stringIdx = StringIndexer(inputCol="class", outputCol="label")

    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
    pipelineFit = pipeline.fit(data)
    dataset = pipelineFit.transform(data)

    (trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    lrModel = lr.fit(trainingData)

    predictions = lrModel.transform(testData)
    predictions.filter(predictions['prediction'] == 1) \
        .select("text", "class", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show(n=10, truncate=30)

    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    print(evaluator.evaluate(predictions))

    return lrModel, pipelineFit
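# A minimal usage sketch for getModel(), assuming new tweets arrive with the same "text" and
# "class" columns as the training CSV (the fitted StringIndexer stage still expects "class"
# when the pipeline is re-applied); score_tweets and new_tweets_df are illustrative names.
def score_tweets(new_tweets_df):
    lrModel, pipelineFit = getModel()
    featurized = pipelineFit.transform(new_tweets_df)  # text -> words -> filtered -> features
    return lrModel.transform(featurized).select("text", "probability", "prediction")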
def frequency_vector_DataFrame(trainDF, cluster_count):
    regTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="[^a-z]")
    dfTokenizer = regTokenizer.transform(trainDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    df_remover = remover.transform(dfTokenizer)

    # feature extraction using Word2Vec
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="word2vec")
    vectors = word2Vec.fit(df_remover).getVectors()
    vectors_DF = vectors.select(vectors.word, vectors.vector.alias("features"))

    # cluster the word vectors with k-means
    kmeans = KMeans().setK(cluster_count).setSeed(1)
    km_model = kmeans.fit(vectors_DF)

    # broadcast the {word: cluster id} vocabulary after getting the words and predictions
    vocabDF = km_model.transform(vectors_DF).select("word", "prediction")
    vocabDict = dict(vocabDF.rdd.collect())
    vocab_dict = sc.broadcast(vocabDict)

    # the cluster frequency vector is built in RDD form
    reviewsDF = df_remover.select(df_remover.filtered, df_remover.label).rdd
    clusterVectorRdd = reviewsDF.map(partial(word_to_cluster, vocab_dict=vocab_dict))
    cluster_frequency_feature_Rdd = clusterVectorRdd.map(
        partial(cluster_frequency_vector, cluster_count=cluster_count))
    # tuple unpacking in lambda arguments is Python 2 only, so unpack by index instead
    cluster_freqDF = cluster_frequency_feature_Rdd.map(lambda pair: Row(pair[0], pair[1])).toDF()
    cluster_freq_featureDF = cluster_freqDF.select(cluster_freqDF._1.alias("features"),
                                                   cluster_freqDF._2.alias("label"))
    return cluster_freq_featureDF
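# word_to_cluster and cluster_frequency_vector are NOT defined in this snippet;
# frequency_vector_DataFrame only references them through functools.partial. Below is a
# hedged sketch of what they plausibly look like, assuming word_to_cluster maps each filtered
# token to its k-means cluster id via the broadcast vocabulary, and cluster_frequency_vector
# turns those ids into a dense count vector of length cluster_count. Names and signatures
# are assumptions, not the original implementation.
from functools import partial
from pyspark.ml.linalg import Vectors


def word_to_cluster(row, vocab_dict=None):
    # row is (filtered_words, label); look each word up in the broadcast {word: cluster_id} dict
    words, label = row[0], row[1]
    cluster_ids = [vocab_dict.value[w] for w in words if w in vocab_dict.value]
    return (cluster_ids, label)


def cluster_frequency_vector(row, cluster_count=None):
    # row is (cluster_ids, label); count how often each cluster occurs and build a dense vector
    cluster_ids, label = row[0], row[1]
    counts = [0.0] * cluster_count
    for cid in cluster_ids:
        counts[cid] += 1.0
    return (Vectors.dense(counts), label)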
def main():
    spark = SparkSession.builder.appName('nlp').getOrCreate()
    data = spark.read.csv("./data/smsspamcollection/SMSSpamCollection", inferSchema=True, sep='\t')
    data = data.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
    data.show()

    data = data.withColumn('length', length(data['text']))
    data.show()
    data.groupby('class').mean().show()

    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopremove = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
    count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
    idf = IDF(inputCol="c_vec", outputCol="tf_idf")
    ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')
    clean_up = VectorAssembler(inputCols=['tf_idf', 'length'], outputCol='features')
    nb = NaiveBayes()

    data_prep_pipe = Pipeline(stages=[ham_spam_to_num, tokenizer, stopremove, count_vec, idf, clean_up])
    cleaner = data_prep_pipe.fit(data)
    clean_data = cleaner.transform(data)
    clean_data = clean_data.select(['label', 'features'])
    clean_data.show()

    (training, testing) = clean_data.randomSplit([0.7, 0.3])
    spam_predictor = nb.fit(training)
    data.printSchema()

    test_results = spam_predictor.transform(testing)
    test_results.show()

    acc_eval = MulticlassClassificationEvaluator()
    acc = acc_eval.evaluate(test_results)
    print("Accuracy of model at predicting spam was: {}".format(acc))
def remove_stop_words(self, df1, input_col, output_col='filtered'):
    """Remove stop words -> https://spark.apache.org/docs/2.2.0/ml-features.html#stopwordsremover"""
    from pyspark.ml.feature import StopWordsRemover
    remover = StopWordsRemover(inputCol=input_col, outputCol=output_col)
    df2 = remover.transform(df1)
    return df2
def test_stop_words_remover(self):
    data = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
    model = StopWordsRemover(inputCol="text", outputCol="words", stopWords=["b"])

    feature_count = len(data.columns)
    model_onnx = convert_sparkml(
        model, 'Sparkml StopWordsRemover',
        [('text', StringTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().words.values
    data_np = data.toPandas().text.values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStopWordsRemover")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf + assembler +
                    label_stringIdx + selector + lr)
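# A minimal usage sketch for preprocess(), assuming DataFrames with "text" and a numeric
# "label" column (the ChiSqSelector and LogisticRegression stages above use the default
# labelCol="label", not the indexed "labels" output); train_df, test_df and
# train_ngram_model are illustrative names.
def train_ngram_model(train_df, test_df):
    pipeline = preprocess(n=3)
    model = pipeline.fit(train_df)            # fits tokenizer, n-grams, TF-IDF, selector and LR
    predictions = model.transform(test_df)    # adds "features", "rawPrediction", "prediction", ...
    return model, predictions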
def trainData():
    #rdd = sc.parallelize(rdd)
    #rdd.foreach(print)
    #rdd = sc.textFile("/ccga/SentimentAnalysisDataset.csv")
    '''#################################################TRAINING DATA SET#################################################'''
    rddTrain = sc.textFile("/ccga/set100k.csv")
    r = rddTrain.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))

    spark = getSparkSessionInstance(rddTrain.context.getConf())
    partsDF = spark.createDataFrame(parts)
    #partsDF.show(truncate=False)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    #tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    #base_words.show(truncate=False)

    train_data_raw = base_words.select("base_words", "label")
    #train_data_raw.show(truncate=False)

    #base_words = train_data_raw.select("base_words")
    #base_words_rdd = base_words.rdd
    #print(base_words_rdd.collect())
    #base_words_map = base_words_rdd.flatMap(lambda x: x[0])
    #base_words_rdd.collect()

    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words", outputCol="features")
    model = word2Vec.fit(train_data_raw)

    final_train_data = model.transform(train_data_raw)
    #final_train_data.show()
    final_train_data = final_train_data.select("label", "features")
    #final_train_data.show(truncate=False)

    lr = LogisticRegression(maxIter=1000, regParam=0.001, elasticNetParam=0.0001)
    lrModel = lr.fit(final_train_data)
    trained = lrModel.transform(final_train_data)

    return lrModel
    '''#################################################TRAINING DATA SET#################################################'''
def UsefulnessPredictionLDA(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word", outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word", outputCol="raw_features", minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
def filter_stop_words(df):
    from nltk.corpus import stopwords  # NLTK's English stop-word list
    stop_words_nltk = stopwords.words('english')
    remover = StopWordsRemover(inputCol='tokens_noise', outputCol='tokens',
                               stopWords=stop_words_nltk)
    return remover.transform(df)
def df_to_words(logger, df: DataFrame, input_col: str, output_col: str = "words",
                pattern: str = "\\W+", to_lowercase: bool = True,
                case_sensitive: bool = False) -> DataFrame:
    """
    Take each string in a column, tokenize it into a list of words and remove stop words.

    Args:
        logger: Logger instance used to log events
        df: Dataframe used
        input_col: Selected input column name
        output_col: Output column name
        pattern: The regex pattern used to tokenize
        to_lowercase: Whether tokens should be lowercased before stop-word removal
        case_sensitive: Whether stop-word matching should be case sensitive

    Returns:
        The modified dataframe
    """
    try:
        intermediate_output = output_col + "intermediate"
        regex_tokenizer = RegexTokenizer(inputCol=input_col,
                                         outputCol=intermediate_output,
                                         pattern=pattern,
                                         toLowercase=to_lowercase)
        remover = StopWordsRemover(inputCol=intermediate_output,
                                   outputCol=output_col,
                                   caseSensitive=case_sensitive)
        logger.info("Parsing dataframe column to words")
        return remover.transform(regex_tokenizer.transform(df)).drop(intermediate_output)
    except Exception as e:
        logger.error("Parsing to words failed: {}".format(e), traceback.format_exc())
        raise e
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word", outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word", outputCol="raw_features",
                         minDF=2.0, vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    evaluator_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                         metricName="rmse")
    cvModel = pipeline.fit(trainingdata)
    # Explain params for the selected model
    print(cvModel.explainParams())
    return cvModel
def process(rdd):
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to DataFrame
        rowRdd = rdd.map(lambda w: Row(word=w))
        wordsDataFrame = spark.createDataFrame(rowRdd, ["sentence"])
        wordsDataFrame = wordsDataFrame.select(f.lower(f.col("sentence")).alias("sentence"))
        wordsDataFrame = wordsDataFrame.select(
            f.regexp_replace(f.col("sentence"), r"(\d+)", "").alias("sentence"))
        # return wordsDataFrame

        # the cleaned column is named "sentence", so tokenize that column
        tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
        tokenized = tokenizer.transform(wordsDataFrame)

        remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        removed = remover.transform(tokenized)
        removed.select("filtered").show(20, False)
        # return removed
    except:
        pass
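# Both trainData() and process() above call getSparkSessionInstance, which is not shown in
# these snippets. The sketch below follows the lazily-instantiated-singleton pattern from the
# Spark Streaming programming guide ("DataFrame and SQL Operations"); the implementation in
# the original code base may differ.
from pyspark.sql import SparkSession


def getSparkSessionInstance(sparkConf):
    # reuse a single SparkSession per process instead of creating one per micro-batch
    if "sparkSessionSingletonInstance" not in globals():
        globals()["sparkSessionSingletonInstance"] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()["sparkSessionSingletonInstance"]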
def aggregate_spark(data, columns, args):
    import pyspark.sql.functions as F
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer

    input_data = data.withColumn(columns["col"], F.lower(F.col(columns["col"])))
    regexTokenizer = RegexTokenizer(inputCol=columns["col"], outputCol="token_list",
                                    pattern="\\W")
    regexTokenized = regexTokenizer.transform(input_data)  # tokenize the lower-cased column
    remover = StopWordsRemover(inputCol="token_list", outputCol="filtered_word_list")
    vocab_rows = (remover.transform(regexTokenized)
                  .select(F.explode(F.col("filtered_word_list")).alias("word"))
                  .groupBy("word")
                  .count()
                  .orderBy(F.col("count").desc())
                  .limit(args["vocab_size"])
                  .select("word")
                  .collect())
    vocab = [row["word"] for row in vocab_rows]
    reverse_dict = {
        word: idx + len(args["reserved_indices"])
        for idx, word in enumerate(vocab)
    }
    return {**reverse_dict, **args["reserved_indices"]}
def create_TFIDF_v0(trainData, applyData, inputCol="text", outputCol="features",
                    minDocFreq=3, numFeatures=20):
    tokenizer = RegexTokenizer(pattern=r"[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData1 = tokenizer.transform(trainData)
    wordsData2 = tokenizer.transform(applyData)

    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered",
                               stopWords=STOPWORDS_v0)
    wordsDataFiltered1 = remover.transform(wordsData1)
    wordsDataFiltered2 = remover.transform(wordsData2)

    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures",
                          numFeatures=numFeatures)
    featurizedData1 = hashingTF.transform(wordsDataFiltered1)
    featurizedData2 = hashingTF.transform(wordsDataFiltered2)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData1)
    rescaledData = idfModel.transform(featurizedData2)
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)
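# STOPWORDS_v0 is referenced above but not defined in this snippet. A minimal stand-in,
# assuming it is simply Spark's built-in English stop-word list (the original project may
# extend it with domain-specific terms):
from pyspark.ml.feature import StopWordsRemover

STOPWORDS_v0 = StopWordsRemover.loadDefaultStopWords("english")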
def kmeans_from_csv(file="liveTweetsLocation.csv", outfile="liveTweetsLocationKmeans.csv", k=8):
    df = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load(file)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures",
                          numFeatures=2**20)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    kmeans = KMeans(k=k, seed=1, featuresCol='features', maxIter=10, initMode='random')

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, kmeans])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # Note "display" is for Databricks; use show() for OSS Apache Spark
    # results.filter(results.prediction == 1).show(200,False)
    results.show()
    results.toPandas().to_csv(outfile)
    results.drop("location").drop("tokens").drop(
        "stopWordsRemovedTokens").drop("rawFeatures").drop(
            "features").toPandas().to_csv('prettyPrint.csv')
def get_pipeline(vector_size=50, class_num=5, stopwords=None):
    '''
    Build the pipeline.
    This demo pipeline contains the following stages:
    1. labelIndexer  indexes the labels, converting strings to integers
    2. tokenizer     splits each sentence into words
    3. remover       removes stop words
    4. word2vec      maps the text to a low-dimensional vector with Word2Vec
    5. mpc           multilayer perceptron classifier
    '''
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    tokenizer = Tokenizer(inputCol="text", outputCol="raw_words")
    remover = StopWordsRemover(inputCol="raw_words", outputCol="words", stopWords=stopwords)
    word2vec = Word2Vec(vectorSize=vector_size, minCount=2, inputCol="words", outputCol="vector")
    # layer sizes must be integers, so use floor division for the hidden layer
    layers = [vector_size, (vector_size + class_num) // 2, class_num]
    mpc = MultilayerPerceptronClassifier(maxIter=100, layers=layers, seed=1234,
                                         featuresCol="vector", labelCol="indexLabel")
    pipeline = Pipeline(stages=[labelIndexer, tokenizer, remover, word2vec, mpc])
    return pipeline
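# A minimal usage sketch for get_pipeline(), assuming a DataFrame with string "label" and
# "text" columns (column names taken from the stages above); the split ratio, seed and the
# helper name train_mpc are illustrative.
def train_mpc(data_df):
    train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=42)
    english_stopwords = StopWordsRemover.loadDefaultStopWords("english")
    model = get_pipeline(vector_size=50, class_num=5, stopwords=english_stopwords).fit(train_df)
    return model.transform(test_df).select("text", "indexLabel", "prediction")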
def gmmresults():
    df1 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true").option(
        "mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()
    # df2.show()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(inputCol="tokens", outputCol="stopWordsRemovedTokens")
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens", outputCol="rawFeatures",
                          numFeatures=20000)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    gmm = GaussianMixture(k=8, featuresCol='features')

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, gmm])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.cache()
    results.groupBy("prediction").count().show()  # Note "display" is for Databricks; use show() for OSS Apache Spark
    results.filter(results.prediction == 1).show(200, False)
    results.show()
    results.toPandas().to_csv('gmmresultsCanadaAndProductsAndDisastersAndClaritin.csv')
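# kmeans_from_csv() and gmmresults() above rely on a module-level `sqlContext` that is never
# created in these snippets. A minimal, assumed setup (SQLContext is kept only because the
# snippets call it by that name; spark.read is the modern equivalent):
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("tweet-clustering").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)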
def trainNaiveBayesModel(data, directory=""):
    tokenizer = Tokenizer().setInputCol("comment_text").setOutputCol("words")
    remover = StopWordsRemover().setInputCol("words").setOutputCol(
        "filtered").setCaseSensitive(False)
    hashingTF = HashingTF().setNumFeatures(1000).setInputCol(
        "filtered").setOutputCol("rawFeatures")
    idf = IDF().setInputCol("rawFeatures").setOutputCol(
        "features").setMinDocFreq(0)
    nb = NaiveBayes(labelCol="label", featuresCol="features")
    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, nb])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [200, 500, 1000, 5000]) \
        .addGrid(nb.smoothing, [0.5, 1, 1.5, 2]) \
        .build()

    crossval = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        # set area under the precision-recall curve as the evaluation metric
        evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    cvModel = crossval.fit(data)
    modelName = directory + "NaiveBayesModel"
    cvModel.bestModel.write().overwrite().save(modelName)
    return modelName
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    comments = spark.read.json(in_directory, schema=schema)
    comments.cache()

    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation + '0123456789'),)

    # NLP processing code adapted from https://spark.apache.org/docs/latest/ml-features.html
    regexTokenizer = RegexTokenizer(inputCol="body", outputCol="words",
                                    minTokenLength=3, pattern=wordbreak)
    # alternatively, pattern="\\w+", gaps(False)
    countTokens = udf(lambda words: len(words), IntegerType())

    regexTokenized = regexTokenizer.transform(comments)
    docs = regexTokenized.select("body", "words", "subreddit")
    docs.cache()

    #extra_stop_words = ["www","http","gt"]
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    docs = remover.transform(docs).withColumn("tokens", countTokens(col("filtered")))
    docs = docs.drop("body")
    docs = docs.drop("words")
    docs.groupBy("subreddit").agg(functions.avg("tokens")).show()

    # threshold for post length
    lthresh = 60
    uthresh = 100
    docs = docs.filter(docs['tokens'] > lthresh)
    docs = docs.filter(docs['tokens'] < uthresh)
    logs = docs.groupBy("subreddit").agg(functions.count("*")).show()

    # adds rank per subreddit type into a new column called rank
    ranked = docs.withColumn("rank", rank().over(Window.partitionBy("subreddit").orderBy(desc("tokens"))))
    # ranked.cache()

    group_size = 230
    # take group_size biggest docs from each group type
    ranked = ranked.filter(ranked['rank'] <= group_size)

    # convert arrays to columns so we can write csv
    for i in range(uthresh):
        ranked = ranked.withColumn('{0}'.format(i), ranked.filtered.getItem(i))

    # drop filtered so we can write to csv
    ranked = ranked.drop('filtered')
    ranked = ranked.drop('rank')
    ranked.show()
    ranked.write.csv(out_directory, mode='overwrite')
def featureExtract(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[remover, hashingTF, idf])

    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()

    selected = resultDataframe.select("filtered", "features", "idff")
    for row in selected.collect():
        filtered, features, idff = row
        self.logger.info("features: %s", features)
        self.logger.info("idff: %s", idff)
        # the original Python 2 decode/encode round-trip is unnecessary in Python 3
        self.logger.info("filtered: %s", filtered)
    return selected
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use Spark. It is supposed to mock the following steps:
        1. clean input data
        2. use a pre-trained model to make a prediction
        3. write predictions to an HDFS output

    Since this is meant as an example, we skip building a model; instead we naively mark
    reviews containing the text "good" as positive and the rest as negative.
    """
    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token")
    df_tokens = tokenizer.transform(df_raw).select("cid", "review_token")

    # Remove stop words
    remover = StopWordsRemover(inputCol="review_token", outputCol="review_clean")
    df_clean = remover.transform(df_tokens).select("cid", "review_clean")

    # check for the presence of "good"
    df_out = df_clean.select(
        "cid",
        array_contains(df_clean.review_clean, "good").alias("positive_review"))

    # parquet is a popular column storage format; we use it here
    df_out.write.mode("overwrite").parquet(output_loc)
def featureExtractLr(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        # pipeline = PipelineModel.load(ROOT_PATH + '/logistic')
        pipeline = Pipeline.load(ROOT_PATH + '/logistic')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001).setElasticNetParam(0.8)
        pipeline = Pipeline(stages=[remover, hashingTF, lr])

    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
    # model.write().overwrite().save(ROOT_PATH + '/logistic')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()

    selected = resultDataframe.select("id", "features", "probability", "prediction")
    for row in selected.collect():
        rid, features, prob, prediction = row
        self.logger.info("features: %s", features)
        self.logger.info("prob: %s", str(prob))
        self.logger.info("prediction: %s", str(prediction))
def get_pd_keyword(self):
    df_spark = self.df_spark

    # Step 1. Text cleansing with punctuation
    REGEX = '[_,?\\-.!?@#$%^&*+\/\d]'
    df_spark = df_spark.withColumn(
        "description_clean", regexp_replace(df_spark.description, REGEX, ' '))

    # Step 2. Tokenization
    # df_spark = df_spark.drop("description_token")
    tokenizer = Tokenizer(inputCol='description_clean', outputCol='description_token')
    df_spark = tokenizer.transform(df_spark)

    # Stemming
    # nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()

    def lemm_function(tokens):
        list_clean = []
        for item in tokens:
            list_clean.append(lemmatizer.lemmatize(item))
        return list_clean

    udf_lemm_function = F.udf(lemm_function, ArrayType(StringType()))
    df_spark = df_spark.withColumn(
        "description_lemm", udf_lemm_function(df_spark.description_token))

    # Step 3. Remove stop words
    stopwords_list = StopWordsRemover.loadDefaultStopWords("english")
    stopwords_customize_list = ["app", "apps"]
    # keep a plain Python list so the stopWords param type converter accepts it
    stopwords_list = list(stopwords_list) + stopwords_customize_list
    stopwords = StopWordsRemover(inputCol="description_lemm",
                                 outputCol="description_no_stop",
                                 stopWords=stopwords_list)
    stopwords.getStopWords()
    df_spark = stopwords.transform(df_spark)
    df_pd_desc_final = df_spark.toPandas()

    # Note: the IDF vector must be trained on a large corpus, otherwise it loses the advantage of IDF
    # get the "description" column
    joinF = lambda x: " ".join(x)
    df_pd_desc_final["description_final"] = df_pd_desc_final[
        "description_no_stop"].apply(joinF)
    corpus_list = df_pd_desc_final["description_final"].tolist()

    df_pd_desc_final = get_tfidf(corpus_list, df_pd_desc_final, self.topn)
    return df_pd_desc_final
def pipeline(df):
    print(df.head())
    df = df.withColumn("length", length(df['Speech']))

    # Create the data processing pipeline functions here (note: StringIndexer will be used to encode
    # your target variable column. This column should be named 'label' so our model will recognize it later)
    review_data = Tokenizer(inputCol="Speech", outputCol="Words")
    reviewed = review_data.transform(df)
    # reviewed.show()

    remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
    newFrame = remover.transform(reviewed)
    # newFrame.show()

    hashing = HashingTF(inputCol="filtered", outputCol="hashedValues", numFeatures=pow(2, 10))
    # Transform in a DF
    hashed_df = hashing.transform(newFrame)
    hashed_df.show(truncate=False)

    idf = IDF(inputCol="hashedValues", outputCol="feature")
    idfModel = idf.fit(hashed_df)
    rescaledData = idfModel.transform(hashed_df)
    rescaledData.select("words", "feature").show(truncate=False)

    # indexer = StringIndexer(inputCol="Party_Affliation", outputCol="label")
    # indexed = indexer.fit(rescaledData).transform(rescaledData)

    assembler = VectorAssembler(inputCols=["feature", "length"], outputCol="features")
    return assembler.transform(rescaledData)
def remove_stop_words(df):
    from nltk.corpus import stopwords  # NLTK's English stop-word list
    remover = StopWordsRemover(inputCol="rawTokens",
                               outputCol="tokens",
                               stopWords=stopwords.words("english"))
    df_tokens = remover.transform(df)
    return df_tokens
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle", minTokenLength=2, outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0, epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(), outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def fit_tfidf_pipeline(content_df):
    tokenizer = RegexTokenizer(). \
        setGaps(False). \
        setPattern('\\p{L}+'). \
        setInputCol('content'). \
        setOutputCol('words')

    sw = StopWordsRemover() \
        .setStopWords(stop_words) \
        .setCaseSensitive(False) \
        .setInputCol("words") \
        .setOutputCol("filtered")

    cv = CountVectorizer(). \
        setInputCol('filtered'). \
        setOutputCol('tf'). \
        setMinTF(1). \
        setMinDF(10). \
        setVocabSize(2 ** 17)

    # fit dataframe_df
    cv_transformer = Pipeline(stages=[tokenizer, sw, cv]).fit(content_df)

    idf = IDF(minDocFreq=10). \
        setInputCol('tf'). \
        setOutputCol('tfidf')
    tfidf_transformer = Pipeline(stages=[cv_transformer, idf]).fit(content_df)

    return tfidf_transformer
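# fit_tfidf_pipeline() above references a module-level `stop_words` list that is not shown.
# A plausible stand-in, assuming the English defaults that ship with Spark (the original
# project may use a custom or extended list):
from pyspark.ml.feature import StopWordsRemover

stop_words = StopWordsRemover.loadDefaultStopWords("english")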
def test_stopwordsremover(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")

    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], str))

    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])

    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])

    # with locale
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
def clean_words(news):
    """
    Function to clean the data from the news
    :return: Dataframe containing the list of words in the text and the total number of times they occur
    """
    # just alphanumeric
    news_clean = news.where(news["NewsText"].isNotNull()).withColumn(
        'NewsTextClean', f.regexp_replace('NewsText', "[^0-9a-zA-Z ]", " "))

    # splits the new column
    news_clean_split = news_clean.select(
        f.split(news_clean["NewsTextClean"], " ").alias("NewsTextClean"))

    # removes stop words
    remover = StopWordsRemover(inputCol="NewsTextClean", outputCol="NoStopWords")
    news_without_stopwords = remover.transform(news_clean_split)

    # splits words in rows and count occurrence of each of them
    news_without_stopwords_str = news_without_stopwords.withColumn(
        'NoStopWordsStr', f.concat_ws(' ', 'NoStopWords'))
    news_to_show = news_without_stopwords_str.withColumn('word', f.explode(f.col('NoStopWords'))) \
        .groupBy('word') \
        .count() \
        .sort('count', ascending=False) \
        .where('word <> ""')

    return news_to_show
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function that mocks the following steps:
        1. clean input data (tokenization, remove stop words)
        2. use a pre-trained model to make a prediction
        3. write predictions to an HDFS output

    Naively marks reviews containing the text "good" as positive and the rest as negative.
    """
    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token', outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # check for the presence of "good" and naively assume it's a positive review
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))

    df_out.write.mode("overwrite").parquet(output_loc)