Example #1
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    # Fetch an English stop word list for the StopWordsRemover stage
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
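
The | composition in this example appears to come from a project-specific wrapper around pyspark.ml rather than from Spark itself. As a rough, hedged equivalent, the unigram TF-IDF branch could be written with a plain pyspark.ml Pipeline as sketched below; the intermediate column names (tokens, filtered, tf) are illustrative assumptions, not taken from the original code.

# Plain-pyspark.ml sketch of the unigram TF-IDF branch above (assumption: the
# wrapper's | composition behaves like chaining stages in an ordinary Pipeline).
from pyspark.ml import Pipeline
from pyspark.ml import feature as mlf

tokenizer = mlf.Tokenizer(inputCol='sentence', outputCol='tokens')
remover = mlf.StopWordsRemover(inputCol='tokens', outputCol='filtered',
                               stopWords=stop_words)
counter = mlf.CountVectorizer(inputCol='filtered', outputCol='tf')
idf = mlf.IDF(inputCol='tf', outputCol='features')

model = Pipeline(stages=[tokenizer, remover, counter, idf]).fit(df)
model.transform(df).select('sentence', 'features').show(truncate=False)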
Example #2
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.
    :param df: Pyspark dataframe to analyze
    :param input_col: Column to analyzer.
    :param n: number of elements per n-gram >=1.
    :return: Spark DataFrame with n-grams calculated.
    """

    is_dataframe(df)  # validate that the input is a Spark DataFrame (helper not shown here)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
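
A minimal usage sketch for n_gram; the SparkSession setup and sample sentences below are illustrative assumptions, not part of the original code.

# Illustrative call to n_gram() on a tiny, made-up DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample_df = spark.createDataFrame(
    [('this is the best sentence ever',),
     ('this is however the worst sentence available',)],
    ['sentence'])

# Returns the transformed DataFrame and the fitted pipeline model.
df_model, tfidf_model = n_gram(sample_df, input_col='sentence', n=2)
df_model.select('sentence', 'features').show(truncate=False)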
Example #3
def main():
    sc, sqlContext, spark = spark_init()
    df = spark.read.csv(os.path.join(data_dir, 'tcat*.csv'), header=True)
    # Removing the retweet indicator RT from tweets
    df = df.withColumn('text', functions.regexp_replace('text', 'RT', ''))
    df = extract_basics(df)

    from_users_df = pd.DataFrame(
        df.groupBy('from_user_name').count()
          .sort(functions.desc('count')).take(1000),
        columns=['from_user_name', 'count'])
    from_users_df.to_csv(os.path.join(data_dir, 'frequent_users.csv'),
                         index=False)

    from_users_df['botometer_scores'] = from_users_df['from_user_name'].apply(
        get_botometer_score)
    bots_df = from_users_df[from_users_df['botometer_scores'] >= 3]
    bots_df.to_csv(os.path.join(data_dir, 'bots.csv'), index=False)

    user_mentions_df = get_topN_of_col(df, 'user_mentions', 50)
    user_mentions_df.to_csv(os.path.join(data_dir,
                                         'frequent_user_mentions.csv'),
                            index=False)

    # Keep only tweets posted by the accounts flagged as likely bots above
    bot_set = set(bots_df['from_user_name'].to_list())
    df = df.where(functions.col('from_user_name').isin(bot_set))

    hashtags_df = get_topN_of_col(df, 'hashtags', 50)
    hashtags_df.to_csv(os.path.join(data_dir, 'frequent_hashtags.csv'),
                       index=False)

    emojis_df = get_topN_of_col(df, 'emojis', 50)
    emojis_df.to_csv(os.path.join(data_dir, 'frequent_emojis.csv'),
                     index=False)

    emoji_sequence_df = get_topN_of_col(df, 'emoji_sequence', 50)
    emoji_sequence_df.to_csv(os.path.join(data_dir,
                                          'frequent_emoji_sequences.csv'),
                             index=False)

    words_df = get_topN_of_col(df, 'words', 50)
    words_df.to_csv(os.path.join(data_dir, 'frequent_words.csv'), index=False)

    bigram = feature.NGram(n=2, inputCol='words',
                           outputCol='BiGrams').transform(df.select(df.words))
    trigram = feature.NGram(n=3,
                            inputCol='words', outputCol='TriGrams').transform(
                                df.select(df.words))
    bigram_pd_df = get_topN_of_col(bigram, 'BiGrams', 20)
    trigram_pd_df = get_topN_of_col(trigram, 'TriGrams', 20)
    bigram_pd_df.to_csv(os.path.join(data_dir, 'frequent_bigrams.csv'),
                        index=False)
    trigram_pd_df.to_csv(os.path.join(data_dir, 'frequent_trigrams.csv'),
                         index=False)

    # To retrieve ngrams containing specific search term
    get_frequent_ngrams_containing('home', bigram, 'BiGrams', 20)
    get_frequent_ngrams_containing('safe', trigram, 'TriGrams', 20)

    # Descriptive statistics of sentiment scores
    stats_df = df.select(functions.mean(df.sentiment_score).alias('mean'),
                         functions.min(df.sentiment_score).alias('min'),
                         functions.max(df.sentiment_score).alias('max'),
                         functions.stddev(df.sentiment_score).alias('stddev'),
                         functions.variance(df.sentiment_score).alias('variance'))
    stats_pd_df = stats_df.toPandas()
    stats_pd_df.to_csv(os.path.join(data_dir, 'sentiment_stats.csv'),
                       index=False)

    # Approximate quartiles of the sentiment scores; note that the boxplot below
    # is drawn from these five summary values rather than the raw distribution.
    quantiles = df.approxQuantile(col='sentiment_score',
                                  probabilities=[0.0, 0.25, 0.5, 0.75, 1.0],
                                  relativeError=0.05)
    sns.set(style='darkgrid', palette='pastel')
    plt.figure(figsize=(16, 6))
    sns.boxplot(palette=['m'], data=quantiles, orient='h')
    plt.savefig(os.path.join(data_dir, 'sentiment_score_boxplot.png'))
    plt.close()
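
The get_topN_of_col helper used throughout main() is not included in this excerpt. A hedged sketch of what such a helper presumably does (explode an array column, count value frequencies, and return the N most frequent rows as pandas) is shown below; the exact signature and behaviour in the original project may differ.

# Hypothetical sketch of get_topN_of_col; the column handling is an assumption.
def get_topN_of_col(df, col_name, n):
    counts = (df.select(functions.explode(functions.col(col_name)).alias(col_name))
                .groupBy(col_name)
                .count()
                .sort(functions.desc('count'))
                .limit(n))
    return counts.toPandas()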
    df_yelp_review["name"].isin(top_restaurants_list))

df_review_top_rest = df_review_top_rest.select("text").limit(10000)

tokenizer = ft.RegexTokenizer(inputCol='text',
                              outputCol='word',
                              pattern=r'\s+|[,."]')

tok = tokenizer \
    .transform(df_review_top_rest) \
    .select('word')

stopwords = ft.StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                outputCol='input_stop')

ngram = ft.NGram(n=2, inputCol=stopwords.getOutputCol(), outputCol="nGrams")

pipeline = Pipeline(stages=[tokenizer, stopwords, ngram])

data_ngram = pipeline \
    .fit(df_review_top_rest) \
    .transform(df_review_top_rest)

data_ngram = data_ngram.select('nGrams')

FWords = data_ngram.rdd.flatMap(once)
WCount = FWords.reduceByKey(operator.add)
FreqWords = WCount.sortBy(lambda t: t[1], ascending=False).take(400)
FreqWordDict = dict(FreqWords)

#print(FreqWordDict)
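
The once helper used in the flatMap above is also not part of this excerpt. Assuming each row of data_ngram carries a list of bigrams in its nGrams column, a minimal sketch of such a helper could look like this:

import operator

# Hypothetical definition of once: emit a (ngram, 1) pair for every n-gram in
# the row, so that reduceByKey(operator.add) yields per-n-gram counts.
def once(row):
    return [(ngram, 1) for ngram in row['nGrams']]

An equivalent DataFrame-only approach would explode the nGrams column and use groupBy().count(), as in the get_topN_of_col sketch above.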