def test_stopwordsremover(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
    stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")
    # Default
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["panda"])
    self.assertEqual(type(stopWordRemover.getStopWords()), list)
    self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring))
    # Custom
    stopwords = ["panda"]
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getInputCol(), "input")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a"])
    # with language selection
    stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
    dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
    stopWordRemover.setStopWords(stopwords)
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
    # with locale
    stopwords = ["BELKİ"]
    dataset = self.spark.createDataFrame([Row(input=["belki"])])
    stopWordRemover.setStopWords(stopwords).setLocale("tr")
    self.assertEqual(stopWordRemover.getStopWords(), stopwords)
    transformedDF = stopWordRemover.transform(dataset)
    self.assertEqual(transformedDF.head().output, [])
def main(sc):
    sql_context = SQLContext(sc)
    all_data = get_all_data()
    # Input data: Each row is a bag of words from a sentence or document.
    training_data = [(id_gen.next(), text.split(" ")) for text in all_data]
    documentdf = sql_context.createDataFrame(training_data, ["id", "text"])
    remover = StopWordsRemover(inputCol="text", outputCol="text_filtered")
    cleaned_document = remover.transform(documentdf)
    # Learn a mapping from words to Vectors.
    word2vec = Word2Vec(vectorSize=len(training_data),
                        inputCol="text_filtered", outputCol="result")
    model = word2vec.fit(cleaned_document)
    matrix = column_similarities(model.transform(cleaned_document))
    # Use the size of the target data to keep only similarities between
    # target rows and other rows, avoiding products of target data with itself.
    values = matrix.entries.filter(
        lambda x: x.j >= TARGET_DATA_SIZE and x.i < TARGET_DATA_SIZE).sortBy(
        keyfunc=lambda x: x.value, ascending=False).map(
        lambda x: x.j).distinct().take(100)
    training_data_index = dict(training_data)
    for position, item in enumerate(values):
        line = " ".join(training_data_index[int(item)])
        print('%d -> %s' % (position, line.encode('utf-8')))
def remove_stop_words(p_df, in_column, out_column):
    """
    Removes stop words from a column in a DataFrame.
    The column must be a list of words.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    remover = StopWordsRemover(inputCol=in_column, outputCol=out_column)
    return remover.transform(p_df)
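# Minimal usage sketch for remove_stop_words above (not from the original
# source): assumes an active SparkSession named `spark`; the DataFrame and
# column names are illustrative.
tokens_df = spark.createDataFrame(
    [(0, ["i", "saw", "the", "red", "balloon"])], ["id", "words"])
cleaned_df = remove_stop_words(tokens_df, "words", "words_filtered")
cleaned_df.show(truncate=False)  # the default English list drops "i" and "the"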
def process(reviews):
    if reviews.isEmpty():
        pass
    else:
        start = time.time()
        # get reviews with overall rating > 3 and overall rating < 3
        pos_reviews = reviews.filter(lambda x: x[0] > 3.0)
        neg_reviews = reviews.filter(lambda x: x[0] < 3.0)
        # set label for each class: 0.0 is positive, 1.0 is negative
        review_labels = reviews.map(lambda x: 0.0 if x[0] > 3.0 else 1.0)
        Words = Row('label', 'words')
        words = reviews.map(lambda r: Words(*r))
        words_df = spark.createDataFrame(words)
        # review tokenization
        token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+",
                               inputCol="words", outputCol="token",
                               toLowercase=True)
        token_filtered = token.transform(words_df)
        # stop-word elimination
        remover = StopWordsRemover(inputCol="token", outputCol="stopwords",
                                   caseSensitive=False)
        stopwords_filtered = remover.transform(token_filtered)
        prep_filtered = (
            stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
        # TF-IDF calculation
        tf = HashingTF(numFeatures=numFeatures).transform(
            prep_filtered.map(porter_stem, preservesPartitioning=True))
        idf = IDF().fit(tf)
        train_tfidf = idf.transform(tf)
        # set training dataset with label
        training = review_labels.zip(train_tfidf).map(
            lambda x: LabeledPoint(x[0], x[1]))
        # train the model classifier
        model = NaiveBayes.train(training)
        # save model classifier to HDFS
        output_dir = "hdfs://VM10-1-0-14:9000/classifier/" + model_name
        model.save(sc, output_dir)
        end = time.time()
        print("Total Reviews : ", reviews.count(),
              "Processing Time : ", (end - start))
        ssc.stop()
def preprocessDF(self):
    reTokenizer = RegexTokenizer(pattern=r'\W+', inputCol='reformat2',
                                 outputCol='tokenKey', toLowercase=True)
    df_token = reTokenizer.transform(self.df)
    remover = StopWordsRemover(inputCol='tokenKey', outputCol='tokens')
    remover.setStopWords(list(self.stopWords))
    df_token = remover.transform(df_token)
    df_token = df_token.select('categories', 'created_at', 'favorite_count',
                               'quote_count', 'reply_count', 'retweet_count',
                               'user.followers_count', 'user.favourites_count',
                               'user.friends_count', 'tokens')
    self.df1.write.json(os.path.join(
        bdenv_loc, 'twitter_parse_reformat_2018_01-06.json'), mode='append')
    df_token.write.json(os.path.join(
        bdenv_loc, 'twitter_parse_tokens_2018_01-06.json'), mode='append')
def stopWordsRemover(self, dataset, colName):
    stopWordsList = StopWords.stopWordsKNIME
    sentimentStopWordRemover = StopWordsRemover(
        inputCol=colName, outputCol=self.dmxStopWords, stopWords=stopWordsList)
    dataset = sentimentStopWordRemover.transform(dataset)
    textProcessing = TextProcessing()
    dataset = textProcessing.stemming(dataset, pc.DMXSTOPWORDS)
    dataset = textProcessing.ngrams(dataset, pc.DMXSTOPWORDS, 2)
    dataset = textProcessing.lemmatization(dataset, pc.DMXSTOPWORDS)
    return dataset
def functions_for_deal_with_texts(spark, resources_folder):
    send_df = spark.createDataFrame([
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ], ['id', 'sentence'])
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regularTokenizer = RegexTokenizer(inputCol='sentence', outputCol='words',
                                      pattern='\\W')
    count_token = udf(lambda words: len(words), IntegerType())
    tokenize = tokenizer.transform(send_df)
    tokenize.show()
    tokenize.withColumn('tokens', count_token(col('words'))).show()
    rg_tokenize = regularTokenizer.transform(send_df)
    rg_tokenize.show()
    rg_tokenize.withColumn('tokens', count_token(col('words'))).show()
    # remove common (stop) words
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])
    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
    wordDataFrame = spark.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"])
    ], ["id", "words"])
    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(truncate=False)
def kmeans(params):
    path = params[0]
    k = int(params[1])
    iterations = int(params[2])
    target_dir = params[3]
    try:
        # Creating session
        spark_session = SparkSession.builder.appName(
            "project4-jwj").getOrCreate()
        # Loading the files from HDFS and getting a DataFrame
        data = spark_session.read.format("csv").option("header", "true").load(
            "{}/*.csv".format(path))
        #data.show()
        # Getting column names
        columns = data.columns
        # Removing null rows
        for i in columns:
            data = data.filter(col(i).isNotNull())
        # Breaking the content column into individual words
        tokenizer = Tokenizer(inputCol="content", outputCol="Words")
        tokenized = tokenizer.transform(data)
        #tokenized.show()
        # Removing stop words
        remover = StopWordsRemover(inputCol="Words", outputCol="Filtered")
        removed = remover.transform(tokenized)
        #removed.show()
        # Term frequency - inverse document frequency
        hashingTF = HashingTF(inputCol="Filtered", outputCol="rawFeatures",
                              numFeatures=3000)
        # Getting the term-frequency vectors used to train k-means
        featurizedData = hashingTF.transform(removed)
        #featurizedData.show()
        idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        # Train KMeans
        kmean = KMeans().setK(k).setMaxIter(iterations).fit(rescaledData)
        clustersTable = kmean.transform(rescaledData)
        clustersTable.show()
        clustersTable.select("title", "prediction").repartition(
            1).write.format("com.databricks.spark.csv").save(target_dir)
    except Exception as e:
        print(str(e), file=sys.stderr)
        sys.exit(1)
def task_four(ngram):
    """
    Set the ngram value
    :param ngram:
    :return:
    """
    params = list(inspect.getargspec(task_four))
    p = list(chain.from_iterable([i for i in params if i is not None]))
    param_values = {}
    if len(p) > 0:
        for i, v in enumerate(p):
            try:
                value = raw_input("Please enter a value for {} ==> ".format(v))
                param_values.update({v: value})
            except:
                pass
    ngram = param_values.get(p[0])
    if int(ngram) == 2:
        # --- list of stopwords
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is',
            'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'but',
            'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
            'for', 'with', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where',
            'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just',
            'don', 'should', 'now', ' a ', 'insured', 'sured', 'coverage',
            'year', 'dob', 'insd', 'left'
        }
        # --- remove stop words
        REMOVER = StopWordsRemover()
        stopwords = REMOVER.getStopWords()
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")
        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
            .select(["Claim_Id", "filename", "inter_wordlist_two"])
    else:
        pass
def tokenize(df, string_cols):
    output = df
    for c in string_cols:
        output = output.withColumn('temp', f.coalesce(f.col(c), f.lit('')))
        tokenizer = RegexTokenizer(inputCol='temp', outputCol=c + "_tokens",
                                   pattern="\\W")
        remover = StopWordsRemover(inputCol=c + "_tokens",
                                   outputCol=c + "_swRemoved")
        output = tokenizer.transform(output)
        output = remover.transform(output)\
            .drop('temp', c + "_tokens")
    return output
def removeStopWords(df, column):
    """
    Remove stop-words (like "the", "a", "I", etc.) from given column.
    The column must contain an array of strings.

    Transformation: array<string> --> array<string>
    """
    # creates remover to filter out common stop-words
    remover = StopWordsRemover(inputCol=column, outputCol='_' + column)
    # transform: array<string> --> array<string>
    df = remover.transform(df)
    df = replace(df, column, '_' + column)
    return df
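# Hedged usage sketch for removeStopWords above (not from the original
# source): it depends on the project's `replace(df, column, new_column)`
# helper, defined elsewhere, which presumably swaps the filtered column back
# under the original name. Assumes an active SparkSession named `spark`;
# data and column names are illustrative.
tweets_df = spark.createDataFrame([(["this", "is", "a", "tweet"],)], ["tokens"])
tweets_df = removeStopWords(tweets_df, "tokens")
tweets_df.show(truncate=False)  # "this", "is", "a" removed; "tweet" remains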
def main(data_file_csv):
    news_data = spark.read.csv(data_file_csv, header=True)
    snippet_text = news_data.select('snippet')
    snippet_text = snippet_text.replace(r'\\n\\n|\\n', ' ')
    udf_clean = udf(clean_words, StringType())
    data_cleaned = snippet_text.withColumn("snippet_c",
                                           udf_clean(snippet_text['snippet']))
    tokenizer = Tokenizer(inputCol="snippet_c", outputCol="tokens")
    data_tokenized = tokenizer.transform(data_cleaned)
    data_tokenized_2cols = data_tokenized.select("snippet_c", "tokens")
    SWR = StopWordsRemover(inputCol="tokens", outputCol="tokens_final")
    data_final = SWR.transform(data_tokenized_2cols)
    output = data_final.select('tokens_final')
    return output
def movie_wordcloud(df):
    title_df = df.select("id", "title")
    # Clean text
    df_clean = title_df.select(
        "id",
        lower(regexp_replace('title', "[^a-zA-Z\\s]", "")).alias('title'))
    # Tokenize text
    tokenizer = Tokenizer(inputCol='title', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('id', 'words_token')
    # Remove stop words
    remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
    df_words_no_stopw = remover.transform(df_words_token).select(
        'id', 'words_clean')
    #df_words_no_stopw.show(10)
    wordsDF = df_words_no_stopw.select(explode("words_clean").alias("words"))
    wordsDF = wordsDF.select(trim(wordsDF.words).alias("words"))
    #wordsDF.show()
    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(16)
    #wordCountDF.show()
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)
    sns.barplot(y='words', x='count', data=pandD)
    plt.title("Movie Title Analysis")
    plt.xlabel('Words Frequency')
    plt.ylabel('Words')
    #plt.show()
    wordCountDF = wordsDF.groupBy("words").count().orderBy(
        desc("count")).limit(101)
    pandD = wordCountDF.toPandas()
    pandD.drop(0, inplace=True)  # drop first row
    wordcloudConvertDF = pandD.set_index('words').T.to_dict('records')
    wordcloud = WordCloud(width=800, height=500, random_state=21,
                          max_font_size=100, relative_scaling=0.5,
                          colormap='Dark2') \
        .generate_from_frequencies(dict(*wordcloudConvertDF))
    plt.figure(figsize=(14, 10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.title("Words Cloud - Movie Titles")
    plt.axis('off')
    plt.show()
    """# Overview Cloud
def get_top_n_in_array(df_lookup, top):
    df_lookup = df_lookup.select('_c0').distinct()
    tokenizer = Tokenizer(inputCol="_c0", outputCol="token_raw")
    remover = StopWordsRemover(inputCol="token_raw",
                               outputCol="token_filtered")
    df_lookup = tokenizer.transform(df_lookup)
    df_lookup = remover.transform(df_lookup)
    df_lookup = df_lookup.select(
        (F.explode("token_filtered"))).groupby("col").count().sort(
        'count', ascending=False)
    df_lookup = df_lookup.filter(F.length("col") > 2).limit(top).select(
        F.collect_list("col")).withColumnRenamed("collect_list(col)",
                                                 "to_match")
    return df_lookup
def fit(self):
    sqlContext = SparkSession.builder.getOrCreate()
    if self.test:
        df = sqlContext.sql(
            "select * from cmp_tmp_user_identification where dt='2014-01'")
    else:
        df = sqlContext.sql("select * from cmp_tmp_user_identification")
    if self.tweet and self.retweet:
        df = df.withColumn('content', F.concat('text', 'retweeted'))
    elif self.tweet:
        df = df.filter("retweeted==' '")
        df = df.withColumn('content', F.col('text'))
    elif self.retweet:
        df = df.filter('length(retweeted)>1')
        df = df.withColumn('content', F.col('retweeted'))
    df = df.withColumn('content', textCut(clean_text('content')))
    # stop words
    remover = StopWordsRemover(inputCol="content", outputCol="words",
                               stopWords=self.stopwords)
    df = remover.transform(df)
    # drop rows whose token list is empty
    df = df.filter('size(words)>0')
    self.sentence_length_distribution = df.selectExpr(
        'size(words) as wz').groupBy('wz').count().toPandas().set_index(
        'wz').sort_index()
    # vectorize
    cv = CountVectorizer(inputCol='words', outputCol='vertors',
                         minDF=self.minDF, minTF=self.minTF)
    model_cv = cv.fit(df)
    word2bag = model_cv.vocabulary
    self.baglen = len(word2bag)
    self.dictionary = dict(
        zip(word2bag, ['W' + str(i) for i in range(1, self.baglen)]))
    sc = SparkContext.getOrCreate()
    diction = sc.broadcast(self.dictionary)
    # convert to space-separated text for the GCN input format
    df = df.withColumn('words_space', toSpaceSplit('words'))
    result_df = df.selectExpr('uid,label,identity,words_space'.split(','))
    # aggregate to user level
    result_df = result_df.groupBy('uid', 'label', 'identity').agg(
        F.collect_list('words_space').alias('uid_words'))
    result_df = result_df.withColumn('uid_words', concat_uid('uid_words'))
    return result_df
def pre_processing(cf):
    # Converting label -1 -> 0
    cf = cf.withColumn(
        "label",
        f.when(cf["label"] == -1, 0).otherwise(cf["label"].cast(IntegerType())))
    # removing punctuation
    cf_pl = cf.rdd.map(
        lambda x: (re.sub(r'[^\w\s]', '', x.review).lower(), x.label)).toDF(
        ["review", "label"])
    # class imbalance solved here
    cf_pl = resample(cf_pl, 2)
    # Tokenize the reviews
    tokenizer = Tokenizer(inputCol="review", outputCol="tokenized")
    t = tokenizer.transform(cf_pl)
    # removing stop words
    stopwords_remover = StopWordsRemover(inputCol="tokenized",
                                         outputCol="filtered")
    s = stopwords_remover.transform(t)
    # remove empty strings from the tokenized arrays
    s = s.rdd.map(lambda x: (x.review, x.label, x.tokenized,
                             [y.strip() for y in x.filtered if y.strip()])).toDF(
        ["review", "label", "tokenized", "filtered"])
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    try:
        s = s.rdd.map(lambda x: ([
            (lemmatizer.lemmatize(y, get_wordnet_pos(y))) for y in x.filtered
        ])).toDF(["review", "label", "tokenized", "filtered", "lemmatized"])
    except:
        # in case lemmatization fails, fall back to the filtered column
        s = s.withColumn('lemmatized', s.filtered)
    # temporary variable swap
    class_balancedDf = s
    # randomly shuffle class_balancedDf
    class_balancedDf = class_balancedDf.orderBy(rand())
    return class_balancedDf
def cleanDf(df):
    df = df.withColumn("decisionDate", (f.col("decisionDate").cast("date")))
    cleanFT_udf = f.udf(cleanFullText, ArrayType(StringType()))
    df = df.withColumn("fullTextCleaned", cleanFT_udf(df.fullText))
    cleanK_udf = f.udf(cleanKeywords, ArrayType(StringType()))
    df = df.withColumn("keywords", cleanK_udf(df.keywords))
    remover = StopWordsRemover(inputCol="fullTextCleaned",
                               outputCol="filteredFullText",
                               stopWords=stop_words)
    df = remover.transform(df)
    return df
def process_tweet_text(df):
    """Removes punctuation and stop words from inputCol; the result is placed
    in the outputCol column.

    Args:
        df (DataFrame): A DataFrame with the column from which stop words
            need to be removed.

    Returns:
        DataFrame: Result of applying StopWordsRemover with text_clean as the
            input column and filtered as the output column.
    """
    df = df.withColumn('text',
                       split(removePunctuation(df['text']), ' ').alias('text'))
    stopWordList = list(string.punctuation) + \
        ['http', 'https', 'rt', 'via', '...', '…', '’', '—', '—:', '“'] + \
        StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(inputCol="text", outputCol="filtered",
                               stopWords=stopWordList)
    df = remover.transform(df)
    df = df.withColumn('tweet', array_join(df['filtered'], ' '))
    return df.select('date', 'tweet', 'hashtags')
def clean(df):
    tokenizer = Tokenizer(inputCol="body", outputCol="vector")
    remover = StopWordsRemover(inputCol="vector", outputCol="body")
    df = df.withColumn('body',
                       regexp_replace(col('body'), '<code>.*?</code>', ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '<.*?>', ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '&.*?;', ' '))
    df = df.withColumn(
        'body',
        regexp_replace(col('body'),
                       "[{0}]".format(re.escape(string.punctuation)), ' '))
    df = df.withColumn('body', regexp_replace(col('body'), '[^a-zA-Z]', ' '))
    df = tokenizer.transform(df).drop('body')
    df = remover.transform(df).drop('vector')
    return df
def transform_tweet_data():
    # load
    data = spark.read.csv(
        "dbfs:/FileStore/tweets/trump_insult_tweets_2014_to_2021.csv",
        header=True)
    # select
    data = data.select(split("tweet", " ").alias("tweet"), "target").dropna()
    # remove stop words
    remover = StopWordsRemover(inputCol='tweet', outputCol='tweet_clean')
    data = remover.transform(data)
    # stem
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens],
                      ArrayType(StringType()))
    data = data.withColumn("tweet_stemmed", stemmer_udf("tweet_clean")).select(
        'target', 'tweet_stemmed')
    # clean
    data = data.withColumn(
        "tweet",
        regexp_replace(concat_strings("tweet_stemmed"), '"', "")).select(
        "tweet", "target")
    return data
def clean_data():
    """
    Clean the Tweet by removing punctuation and stop words
    :return cleaned data:
    """
    data = sc.textFile("data/data.txt")
    col_rdd = data.map(lambda x: (x.split('\t')[0], x[-1]))
    punctuation_removed_rdd = col_rdd.map(
        lambda x: (remove_punctuation(x[0]), float(x[1])))
    data_df = sqlContext.createDataFrame(punctuation_removed_rdd,
                                         ["text", "label"])
    remover = StopWordsRemover(inputCol="text", outputCol="words",
                               stopWords=stopwords.words('english'))
    return remover.transform(data_df).select(["label", "words"])
def preprocessing_titles(path, name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    # after stop-word removal
    remover = StopWordsRemover(inputCol="tokenized_title",
                               outputCol="filtered")
    wordsData = remover.transform(wordsData)
    df = wordsData.map(lambda x: x['id']).zipWithUniqueId().toDF(
        ["id", "index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    qr = sqlContext.sql(
        "SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name != '':
        exportOnS3(qr, "s3a://redit-preprocessed/", name)
    qr = qr.map(lambda Row: (Row['index'], Row['id'], Row['filtered']))
def test_stop_words_remover2(self):
    data = self.spark.createDataFrame([(["a", "b", "c"],)], ["text"])
    model = StopWordsRemover(inputCol="text", outputCol="words",
                             stopWords=["b"])
    model_onnx = convert_sparkml(model, 'Sparkml StopWordsRemover',
                                 [('text', StringTensorType([None]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = numpy.array(predicted.toPandas().words.values[0])
    data_np = numpy.array(data.toPandas().text.values[0])
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStopWordsRemover")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['words'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def append_tokens(self, df):
    """
    Creates tokens from the pagename column in the dataframe, then removes
    stop words from the tokens. Adds the tokens under the columns rawTokens
    and tokens.

    Args:
        :param df: Dataframe to add token columns to.

    Returns:
        :return: Dataframe with new columns rawTokens and tokens.
    """
    # Tokenize pagename and convert tokens to their stem words.
    tokenize_udf = udf(tokenize_porter, returnType=ArrayType(StringType()))
    df = df.withColumn('rawTokens', tokenize_udf(df['pagename']))
    # Remove stop words.
    stop_words_remover = StopWordsRemover(inputCol="rawTokens",
                                          outputCol="tokens")
    df = stop_words_remover.transform(df)
    return df
def topicPredict(inputs):
    #output_path = "/user/llbui/bigdata45_500"
    output_path = "C:/Users/linhb/bigdata45_500"
    query = inputs
    n = 10  # number of similar documents to return
    feature = "abstract"  # feature to compare
    df = sc.parallelize([(0, query)]).toDF(["id", feature])
    tokenizer = RegexTokenizer(inputCol=feature, outputCol="words",
                               pattern="\\P{Alpha}+")
    df2 = tokenizer.transform(df)
    remover = StopWordsRemover(inputCol="words", outputCol="words2")
    df3 = remover.transform(df2)
    udf_remove_words = udf(lambda x: remove_words(x), ArrayType(StringType()))
    df4 = df3.withColumn("words3", udf_remove_words(df3.words2))
    # text to feature vector - TF-IDF
    countTF_model = CountVectorizerModel.load(output_path + "/tf_model")
    df_countTF = countTF_model.transform(df4)
    idf_model = IDFModel.load(output_path + "/idf_model")
    df_IDF = idf_model.transform(df_countTF)
    # LDA model
    lda_model = LocalLDAModel.load(output_path + "/lda_model")
    # output topics for document -> topicDistribution
    df_Feature = lda_model.transform(df_IDF)
    feature_vector = df_Feature.select("id",
                                       "topicDistribution").collect()[0][1]
    print("Feature Vector:", feature_vector)
    # Load existing documents
    df_Document = sqlCt.read.load(output_path + "/topicDistribution.parquet")
    udf_cosineSimilarity = udf(
        lambda x_vector: cosineSimilarity(x_vector, feature_vector),
        FloatType())
    df_Similarity = df_Document.withColumn(
        "similarity", udf_cosineSimilarity("topicDistribution"))
    df_Similarity_Sorted = df_Similarity.sort(desc("similarity"))
    return df_Similarity_Sorted.limit(n).select(
        "_id", "title", "abstract", "url", "topicDistribution").collect()
def aggregate_spark(data, features, args):
    from pyspark.ml.feature import StopWordsRemover, RegexTokenizer
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType

    regexTokenizer = RegexTokenizer(inputCol=features["col"],
                                    outputCol="token_list", pattern="\\W")
    regexTokenized = regexTokenizer.transform(data)
    remover = StopWordsRemover(inputCol="token_list",
                               outputCol="filtered_word_list")
    max_review_length_row = (remover.transform(regexTokenized).select(
        F.size(F.col("filtered_word_list")).alias("word_count")).agg(
        F.max(F.col("word_count")).alias("max_review_length")).collect())
    return max_review_length_row[0]["max_review_length"]
def convertToVec(df, sc, ss, outputName, inputCol='tokens'):
    print('\n\n\n Removing Stopwords... \n\n\n')
    remover = StopWordsRemover(
        inputCol=inputCol, outputCol='nostops',
        stopWords=StopWordsRemover.loadDefaultStopWords('english'))
    df = remover.transform(df)
    cv = CountVectorizer(inputCol='nostops', outputCol='vectors', minTF=1.0)
    vecModel = cv.fit(df)
    new = False
    if new:
        print('\n\n\n Get Vocab... \n\n\n')
        inv_voc = vecModel.vocabulary
        f = codecs.open(outputName + '_vocab.txt', encoding='utf-8', mode='w')
        for item in inv_voc:
            f.write(u'{0}\n'.format(item))
        f.close()
    vectors = vecModel.transform(df).select('id', 'subreddit', 'vectors')
    return vectors
def preprocessDF(self, df, cols):
    # concatenation
    df_concat = df.withColumn("concat", concat_ws(' ', *cols))
    # Split at whitespace and characters that are not letters
    tokenizer = RegexTokenizer(inputCol="concat", outputCol="words",
                               pattern=r'\W+')
    df_tokenizer = tokenizer.transform(df_concat)
    # stop-word remover
    remover = StopWordsRemover(inputCol="words", outputCol="joinKey",
                               stopWords=self.stopWordsBC.value)
    df_remover = remover.transform(df_tokenizer) \
        .drop("concat").drop("words")
    return df_remover
def sentiment_validate(lrModel):
    rdd = spark_context.textFile("/user/SentimentalData/Subset100k.csv")
    header = rdd.first()
    rdd = rdd.filter(lambda row: row != header)
    spark = getSparkSessionInstance(rdd.context.getConf())
    r = rdd.mapPartitions(lambda x: csv.reader(x))
    parts = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))
    partsDF = spark.createDataFrame(parts)
    partsDF.show(truncate=False)
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    tokenized = tokenizer.transform(partsDF)
    tokenized.show(truncate=False)
    remover = StopWordsRemover(inputCol="words", outputCol="base_words")
    base_words = remover.transform(tokenized)
    base_words.show(truncate=False)
    train_data_raw = base_words.select("base_words", "label")
    train_data_raw.show(truncate=False)
    base_words = train_data_raw.select("base_words")
    base_words.show(truncate=False)
    word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="base_words",
                        outputCol="features")
    model = word2Vec.fit(train_data_raw)
    final_train_data2 = model.transform(train_data_raw)
    final_train_data2.show()
    final_train_data2 = final_train_data2.select("label", "features")
def remove_l_c_words(df, least, most):
    # Let's find out which words we keep
    vocabulary = df.map(lambda row: row.tweets).reduce(lambda x, y: x + y)
    count = sc.parallelize(vocabulary).map(
        lambda word: (word, 1)).reduceByKey(add)
    count = count.sortBy(lambda wc: wc[1], ascending=False)
    # Add to the list of stopwords
    stop_words_lc = count.filter(lambda wc: wc[1] == least).map(
        lambda wc: wc[0]).collect()
    if most < 1:
        stop_words = stop_words_lc
    else:
        stop_words_mc = count.map(lambda wc: wc[0]).take(most)
        stop_words = stop_words_lc + stop_words_mc
    remover = StopWordsRemover(inputCol="tweets", outputCol='cleaned_tweets',
                               stopWords=stop_words)
    return remover.transform(df)
def preprocess_text(df):
    df_select = df.dropna(subset=["raw_tweet_text"]).select(cols_select)
    # 1. clean text
    df_select_clean = (
        df_select
        .withColumn("tweet_text",
                    F.regexp_replace("raw_tweet_text",
                                     r"[@#&][A-Za-z0-9_-]+", " "))
        .withColumn("tweet_text",
                    F.regexp_replace("tweet_text", r"\w+:\/\/\S+", " "))
        .withColumn("tweet_text",
                    F.regexp_replace("tweet_text", r"[^A-Za-z]", " "))
        .withColumn("tweet_text",
                    F.regexp_replace("tweet_text", r"\s+", " "))
        .withColumn("tweet_text", F.lower(F.col("tweet_text")))
        .withColumn("tweet_text", F.trim(F.col("tweet_text"))))
    # 2.1. tokenize
    tokenizer = Tokenizer(inputCol="tweet_text", outputCol="tokens")
    # 2.2. remove stopwords
    stopword_remover = StopWordsRemover(inputCol="tokens",
                                        outputCol="remove_stop")
    stopword_remover.setStopWords(stopwords_list)
    # 2.3. stemming
    # TODO: how to modify the stemming function into a transformer?
    stemmer = PorterStemmer()
    # more straightforward to use a lambda
    stem_udf = F.udf(lambda l: [stemmer.stem(word) for word in l],
                     returnType=ArrayType(StringType()))
    df_tokenized = tokenizer.transform(df_select_clean)
    df_rmstop = stopword_remover.transform(df_tokenized)
    df_stemmed = df_rmstop.withColumn("stemmed",
                                      stem_udf(F.col("remove_stop")))
    return df_stemmed
def tokenize_df(df):
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stopwords = remover.getStopWords()
    stemmer = PorterStemmer()
    stemmer_udf = udf(lambda x: stem(x), ArrayType(StringType()))
    df = df.select(clean_text(col("text")).alias("text"))
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
          .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
          .select("vector_stemmed"))
    return df
def bayes_cv(business_id):
    """ Cross-validation of bayes model """
    spark = yelp_lib.spark
    review = yelp_lib.get_parq('review')
    business_df = review.filter(review['business_id'] == business_id)
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                    pattern="\\W")
    wordsDataFrame = regexTokenizer.transform(business_df)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    cleaned = remover.transform(wordsDataFrame)
    star_mapping = {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 1.0, 5: 1.0}
    cleaned = cleaned.replace(star_mapping, 'stars')
    cleaned = cleaned.withColumn("stars", cleaned["stars"].cast("double"))
    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(cleaned)
    vectorized = model.transform(cleaned)
    vectorized = vectorized.select(col('stars').alias('label'),
                                   col('features'))
    splits = vectorized.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    test = splits[1]
    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0)
    # train the model
    nb_model = nb.fit(train)
    # compute accuracy on the test set
    result = nb_model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    return "Accuracy: " + str(evaluator.evaluate(predictionAndLabels))
def pre_process_data(df):
    df_collumn = df.withColumn(
        "text",
        regexp_replace(lower(df["text"]), "[$&+,:;=?@#|'<>.-^*()%!]", ""))
    df_without = df_collumn.withColumn(
        "text", regexp_replace(lower(df_collumn["text"]), "-", " "))
    df_read = df_without.select('*').withColumn("id",
                                                monotonically_increasing_id())
    # Tokenize data
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    df_tokenized = tokenizer.transform(df_read)
    # Remove stop words
    language = "portuguese"
    remover = StopWordsRemover(
        inputCol="words", outputCol="filtered",
        stopWords=StopWordsRemover.loadDefaultStopWords(language))
    df_clean = remover.transform(df_tokenized)
    # Return dataframe
    return df_clean
def init_base_df(file_path=default_file_path):
    # Set legacy parsing as Spark 3.0+ cannot use 'E' for timestamp
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
    print("Loading", default_file_path)
    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )
    # Parse string to timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )
    df = time_parsed_df.drop("query").drop("datetime")
    # Shift polarity from a range of [0:4] to [-1:1]
    scaled_polarity_df = df.withColumn(
        "sentiment", (col("polarity") / 2) - 1
    ).drop("polarity")
    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")
    remover = StopWordsRemover()
    stopwords = remover.getStopWords()
    remover.setInputCol("vector")
    remover.setOutputCol("tokens")
    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")
    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df,
                                                    on=["tweet_id"])
    return tweets_with_tokens_df
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by
    # http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft
    # Translate API.
    sentenceData = dataset.filter(
        dataset['user_comments'].isNotNull() &
        (dataset['useragent_locale'].isNull() |
         (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(
        lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '',
                      stem(w.lower()))

    wordsData = wordsData.rdd.map(
        lambda p: (p['signature'],
                   [clean_word(w) for w in p['words']])).toDF(
        ['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(
        rescaledData.signature.isin(signatures)).rdd.map(
        lambda p: (p['signature'],
                   sorted(zip(p['tfidf_features'].indices,
                              p['tfidf_features'].values),
                          key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests])
                 for signature, bests in bests_per_doc])
)

strip_tags_udf = udf(strip_tags)
tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
        tokenizer.transform(
            comments.withColumn("comment_clean",
                                strip_tags_udf(comments["comment_text"]))
        )
    )\
    .select(explode("tokens").alias("token"))\
    .groupBy("token")\
    .count()\
    .orderBy("count", ascending=False)\
    .select("count")\
    .limit(1000)

# Switch to Pandas
tokens_pdf = tokens.toPandas()
tokens_pdf = tokens_pdf.ix[1:]
tokens_pdf["rank"] = range(1, tokens_pdf.shape[0] + 1)
print(tokens_pdf.head())

# Make a graph
fig = sns.jointplot(x="rank", y="count", data=tokens_pdf)
fig.savefig('temp.png')
.setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import StopWordsRemover
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("StopWordsRemoverExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)
    # $example off$

    spark.stop()
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *

from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))

    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)

    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(
        ['subreddit', explode(filteredDataFrame.filtered).alias("term")])

    # group by subreddit and term, then count occurrences of each term
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter(
        'count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
def preprocess_tweets(tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    tweets = remover.transform(tweets)
    return tweets