def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext \
        .parallelize([['this is the best sentence ever'],
                      ['this is however the worst sentence available']]) \
        .toDF(schema=types.StructType().add('sentence', types.StringType()))

    # Stop word list from the Glasgow IR resources page
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    # Tokenize, drop stop words, then build unigram/bigram/trigram counts
    tokenizer = feature.Tokenizer().setInputCol('sentence') \
        | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()

    # Assemble the three term-frequency vectors and apply IDF
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)

    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
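
# For comparison, a minimal sketch of the unigram TF-IDF path above written
# with the stock pyspark.ml Pipeline API instead of the pipe (`|`) composition.
# The intermediate column names ('tokens', 'filtered', 'tf') are illustrative
# choices, and a DataFrame with a 'sentence' column is assumed.
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF

def build_unigram_tfidf():
    tokenizer = Tokenizer(inputCol='sentence', outputCol='tokens')
    remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')
    counter = CountVectorizer(inputCol='filtered', outputCol='tf')
    idf = IDF(inputCol='tf', outputCol='features')
    return Pipeline(stages=[tokenizer, remover, counter, idf])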
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.

    :param df: PySpark DataFrame to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram (>= 1).
    :return: Tuple of (Spark DataFrame with n-grams calculated,
        fitted TF-IDF pipeline model).
    """
    is_dataframe(df)
    tokenizer = feature.Tokenizer().setInputCol(input_col) \
        | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')
    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
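
# A minimal usage sketch for n_gram, assuming an active SparkSession named
# `spark`; the sample sentences and variable names are illustrative.
demo_df = spark.createDataFrame(
    [('this is the best sentence ever',),
     ('this is however the worst sentence available',)],
    ['sentence'])
demo_model_df, demo_model = n_gram(demo_df, input_col='sentence', n=2)
demo_model_df.select('features').show(truncate=False)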
def main():
    sc, sqlContext, spark = spark_init()
    df = spark.read.csv(os.path.join(data_dir, 'tcat*.csv'), header=True)

    # Removing the retweet indicator RT from tweets
    df = df.withColumn('text', functions.regexp_replace('text', 'RT', ''))
    df = extract_basics(df)

    # Top 1000 most active accounts, scored with Botometer; scores >= 3 are
    # treated as likely bots
    from_users_df = pd.DataFrame(
        df.groupBy('from_user_name').count()
          .sort(functions.desc('count')).take(1000),
        columns=['from_user_name', 'count'])
    from_users_df.to_csv(os.path.join(data_dir, 'frequent_users.csv'),
                         index=False)
    from_users_df['botometer_scores'] = from_users_df['from_user_name'].apply(
        get_botometer_score)
    bots_df = from_users_df[from_users_df['botometer_scores'] >= 3]
    bots_df.to_csv(os.path.join(data_dir, 'bots.csv'), index=False)

    user_mentions_df = get_topN_of_col(df, 'user_mentions', 50)
    user_mentions_df.to_csv(os.path.join(data_dir, 'frequent_user_mentions.csv'),
                            index=False)

    # Restrict the remaining analysis to tweets from flagged bot accounts
    bot_set = set(bots_df['from_user_name'].to_list())
    df = df.where(functions.col('from_user_name').isin(bot_set))

    hashtags_df = get_topN_of_col(df, 'hashtags', 50)
    hashtags_df.to_csv(os.path.join(data_dir, 'frequent_hashtags.csv'),
                       index=False)
    emojis_df = get_topN_of_col(df, 'emojis', 50)
    emojis_df.to_csv(os.path.join(data_dir, 'frequent_emojis.csv'), index=False)
    emoji_sequence_df = get_topN_of_col(df, 'emoji_sequence', 50)
    emoji_sequence_df.to_csv(os.path.join(data_dir, 'frequent_emoji_sequences.csv'),
                             index=False)
    words_df = get_topN_of_col(df, 'words', 50)
    words_df.to_csv(os.path.join(data_dir, 'frequent_words.csv'), index=False)

    bigram = feature.NGram(n=2, inputCol='words', outputCol='BiGrams') \
        .transform(df.select(df.words))
    trigram = feature.NGram(n=3, inputCol='words', outputCol='TriGrams') \
        .transform(df.select(df.words))
    bigram_pd_df = get_topN_of_col(bigram, 'BiGrams', 20)
    trigram_pd_df = get_topN_of_col(trigram, 'TriGrams', 20)
    bigram_pd_df.to_csv(os.path.join(data_dir, 'frequent_bigrams.csv'),
                        index=False)
    trigram_pd_df.to_csv(os.path.join(data_dir, 'frequent_trigrams.csv'),
                         index=False)

    # To retrieve ngrams containing specific search terms
    get_frequent_ngrams_containing('home', bigram, 'BiGrams', 20)
    get_frequent_ngrams_containing('safe', trigram, 'TriGrams', 20)

    # Descriptive statistics of sentiment scores
    stats_df = df.select(
        functions.mean(df.sentiment_score).alias('mean'),
        functions.min(df.sentiment_score).alias('min'),
        functions.max(df.sentiment_score).alias('max'),
        functions.stddev(df.sentiment_score).alias('stddev'),
        functions.variance(df.sentiment_score).alias('variance'))
    stats_pd_df = stats_df.toPandas()
    stats_pd_df.to_csv(os.path.join(data_dir, 'sentiment_stats.csv'),
                       index=False)

    quantiles = df.approxQuantile(col='sentiment_score',
                                  probabilities=[0.0, 0.25, 0.5, 0.75, 1.0],
                                  relativeError=0.05)

    sns.set(style='darkgrid', palette='pastel')
    plt.figure(figsize=(16, 6))
    sns.boxplot(palette=['m'], data=quantiles, orient='h')
    plt.savefig(os.path.join(data_dir, 'sentiment_score_boxplot.png'))
    plt.close()
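
# `get_topN_of_col` is called throughout main() but not defined in this
# section. A plausible sketch, assuming the target column holds an array of
# strings and callers expect a pandas DataFrame of the N most frequent values
# (an assumption, not the project's actual implementation):
import pandas as pd
from pyspark.sql import functions

def get_topN_of_col(df, col_name, n):
    # Explode the array column so every element becomes its own row, then
    # count occurrences and keep the n most frequent values.
    rows = (df.select(functions.explode(functions.col(col_name)).alias(col_name))
              .groupBy(col_name)
              .count()
              .sort(functions.desc('count'))
              .take(n))
    return pd.DataFrame(rows, columns=[col_name, 'count'])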
df_yelp_review["name"].isin(top_restaurants_list)) df_review_top_rest = df_review_top_rest.select("text").limit(10000) tokenizer = ft.RegexTokenizer(inputCol='text', outputCol='word', pattern='\s+|[,.\"]') tok = tokenizer \ .transform(df_review_top_rest) \ .select('word') stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='input_stop') ngram = ft.NGram(n=2, inputCol=stopwords.getOutputCol(), outputCol="nGrams") pipeline = Pipeline(stages=[tokenizer, stopwords, ngram]) data_ngram = pipeline \ .fit(df_review_top_rest) \ .transform(df_review_top_rest) data_ngram = data_ngram.select('nGrams') FWords = data_ngram.rdd.flatMap(once) WCount = FWords.reduceByKey(operator.add) FreqWords = WCount.sortBy(lambda t: t[1], ascending=False).take(400) FreqWordDict = dict(FreqWords) #print(FreqWordDict)