import numpy as np

from pyspark.ml import Pipeline
from pyspark.ml import feature as spark_ft


def feature_extract(train_t):
    stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')

    # Stages for the first text column: stop-word removal plus a 50-d Word2Vec
    # embedding (note: Word2Vec reads the raw 'ntokens1' column, as in the original).
    sw_remover1 = spark_ft.StopWordsRemover(inputCol='ntokens1', outputCol='clean_tokens1',
                                            stopWords=stopWords)
    text2vec1 = spark_ft.Word2Vec(vectorSize=50, minCount=1, seed=123,
                                  inputCol='ntokens1', outputCol='text_vec1',
                                  windowSize=1, maxSentenceLength=100)
    assembler1 = spark_ft.VectorAssembler(inputCols=['text_vec1'], outputCol='features1')

    # Same stages for the second text column.
    sw_remover2 = spark_ft.StopWordsRemover(inputCol='ntokens2', outputCol='clean_tokens2',
                                            stopWords=stopWords)
    text2vec2 = spark_ft.Word2Vec(vectorSize=50, minCount=1, seed=123,
                                  inputCol='ntokens2', outputCol='text_vec2',
                                  windowSize=1, maxSentenceLength=100)
    assembler2 = spark_ft.VectorAssembler(inputCols=['text_vec2'], outputCol='features2')

    feature_pipeline = Pipeline(stages=[
        sw_remover1, text2vec1, assembler1,
        sw_remover2, text2vec2, assembler2
    ])
    feature_model = feature_pipeline.fit(train_t)
    train_featurized = feature_model.transform(train_t).persist()

    # Collect the two Word2Vec columns and convert the DenseVectors to numpy rows.
    tA = train_featurized.select('text_vec1').collect()
    tA_array = np.array([row.text_vec1.toArray() for row in tA])
    tB = train_featurized.select('text_vec2').collect()
    tB_array = np.array([row.text_vec2.toArray() for row in tB])
    return tA_array, tB_array
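A minimal usage sketch for the helper above, assuming an active SparkSession named `spark`; the sample token rows are illustrative only:

# Hypothetical example: two rows of pre-tokenized text pairs.
sample = spark.createDataFrame(
    [(['spark', 'is', 'fast'], ['hadoop', 'is', 'older']),
     (['python', 'api', 'works'], ['scala', 'api', 'works'])],
    ['ntokens1', 'ntokens2'])
vecs1, vecs2 = feature_extract(sample)
print(vecs1.shape, vecs2.shape)  # (2, 50) (2, 50)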
import pyspark.sql.functions as f
from pyspark.ml import feature as mlf


def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn the input column of strings into canonical format and return the
    dataframe with the result added as an output column of tokens.
    """
    # Trim the string, then collapse runs of spaced-out single characters
    # (e.g. "I B M" -> "IBM") before tokenizing.
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")
    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")
    return canonicalname
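A hypothetical call pattern for `canonicaltokens`; the `spark` session and the sample company names are assumed for illustration:

names_df = spark.createDataFrame(
    [(' I B M Corporation ',), ('The Coca Cola Company',)], ['raw_name'])
canonical_df = canonicaltokens(names_df, 'raw_name', 'name_tokens')
canonical_df.show(truncate=False)
# 'I B M Corporation' comes back roughly as ['ibm', 'corporation'];
# stop words such as 'the' are dropped from the second row.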
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside a Spark DF into an array of n-grams.

    :param df: PySpark dataframe to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Spark DataFrame with n-grams calculated, plus the fitted model.
    """
    is_dataframe(df)

    # The "|" composition below relies on this project's pipeline wrapper around
    # pyspark.ml.feature (imported here as `feature`), not on the stock PySpark API.
    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
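Assuming the wrapper composes like a standard estimator, a call to the helper might look like this (the `spark` session, the DataFrame, and the column name are illustrative):

# Hypothetical usage: bigram TF-IDF features for a free-text column.
reviews_df = spark.createDataFrame(
    [('the food was great',), ('the service was slow',)], ['review'])
featurized_df, fitted_model = n_gram(reviews_df, input_col='review', n=2)
featurized_df.select('features').show(truncate=False)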
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.ml import feature as ft
from pyspark.ml.clustering import LDA

spark = SparkSession(sc)  # `sc` is an existing SparkContext

# Load two 20 Newsgroups documents into a single DataFrame, one text line per row.
schema = StructType([StructField('documents', StringType(), True)])
text_1 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/49960.txt')
text_2 = spark.read.format('text').schema(schema).load(
    '20news-19997/20_newsgroups/alt.atheism/51060.txt')
text_data = text_1.union(text_2)

# Tokenize on whitespace and basic punctuation, then drop stop words.
tokenizer = ft.RegexTokenizer(inputCol='documents', outputCol='input_arr',
                              pattern=r'\s+|[,.\"]')
df1 = tokenizer.transform(text_data)
stopwords = ft.StopWordsRemover(inputCol='input_arr', outputCol='input_stop')
df2 = stopwords.transform(df1)

# Turn the cleaned tokens into term-count vectors for LDA.
count_vectorizer = ft.CountVectorizer(inputCol='input_stop', outputCol='input_indexed')
cv_model = count_vectorizer.fit(df2)
df3 = cv_model.transform(df2)
df3.select('input_stop', 'input_indexed').show(truncate=False)

lda = LDA(k=2, maxIter=10, optimizer='em', featuresCol='input_indexed')
model = lda.fit(df3)
print("vocab size", model.vocabSize())
print(model.topicsMatrix())
topics = model.describeTopics()
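To make the topics readable, the term indices returned by `describeTopics()` can be mapped back through the CountVectorizer vocabulary; a short sketch of that step using the `cv_model` fitted above:

# Translate each topic's term indices into actual vocabulary words.
vocab = cv_model.vocabulary
for row in topics.collect():
    terms = [vocab[i] for i in row.termIndices]
    print('topic', row.topic, ':', list(zip(terms, row.termWeights)))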
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

# COMMAND ----------

# MAGIC %md #### Train Classifier

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml import feature as spark_ft

# Featurization: drop English stop words, then build capped-vocabulary TF-IDF features.
stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens',
                                       stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500, inputCol='clean_tokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)
train_featurized = feature_model.transform(train).persist()

# COMMAND ----------

display(train_featurized.groupBy("label").count())
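The cells above stop at featurization; a plausible continuation (not part of the original notebook) would fit a classifier on the `idf` features, for example a logistic regression:

# COMMAND ----------

# Hypothetical next cell: train and apply a simple classifier on the TF-IDF features.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='idf', labelCol='label', maxIter=20)
lr_model = lr.fit(train_featurized)
test_featurized = feature_model.transform(test)
predictions = lr_model.transform(test_featurized)
display(predictions.select('label', 'prediction').limit(10))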
# Keep reviews for the top restaurants and sample 10,000 review texts.
top_restaurants_list = [i.name for i in top_restaurants.collect()]
df_review_top_rest = df_yelp_review.filter(
    df_yelp_review["name"].isin(top_restaurants_list))
df_review_top_rest = df_review_top_rest.select("text").limit(10000)

# Tokenize, remove stop words, and build bigrams.
tokenizer = ft.RegexTokenizer(inputCol='text', outputCol='word', pattern=r'\s+|[,.\"]')
tok = tokenizer \
    .transform(df_review_top_rest) \
    .select('word')
stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='input_stop')
ngram = ft.NGram(n=2, inputCol=stopwords.getOutputCol(), outputCol="nGrams")

pipeline = Pipeline(stages=[tokenizer, stopwords, ngram])
data_ngram = pipeline \
    .fit(df_review_top_rest) \
    .transform(df_review_top_rest)
data_ngram = data_ngram.select('nGrams')

# Count bigram frequencies and keep the 400 most common ones.
# `once` is a helper defined elsewhere; it presumably emits (bigram, 1) pairs.
FWords = data_ngram.rdd.flatMap(once)
WCount = FWords.reduceByKey(operator.add)
FreqWords = WCount.sortBy(lambda t: t[1], ascending=False).take(400)
FreqWordDict = dict(FreqWords)
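`once` is referenced but not defined in this snippet; a hypothetical implementation consistent with the reduceByKey/sortBy usage above would flatten each row's bigram array into (bigram, 1) pairs:

# Hypothetical helper (the original definition lives elsewhere in the project).
def once(row):
    return [(gram, 1) for gram in row.nGrams]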