Example #1
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    #reviews_df.cache()
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(
            reviews_df.helpful_votes > 0).cache()

    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.select('helpfulness_percentage', 'star_rating')

    helpful_df = helpful_df.groupBy('star_rating').agg(
        functions.mean(helpful_df.helpfulness_percentage).alias(
            'mean_helpfulness_percent')).select(
                'star_rating', 'mean_helpfulness_percent').cache()
    helpful_df.repartition(1).write.mode('overwrite').csv(
        'helpfulness_vs_rating')

    avg_helpfulness_dict = helpful_df.coalesce(1).rdd.collectAsMap()

    x_array = list(avg_helpfulness_dict.keys())
    y_array = list(avg_helpfulness_dict.values())

    plt.bar(x_array, y_array, color='g')
    plt.xlabel('Ratings')
    plt.ylabel('Average helpfulness')
    plt.title('Average helpfulness vs rating')
    plt.show()
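All of these snippets reference spark, sc, functions, plt, collections, and a project-local utilities module without defining them. Something like the following preamble is assumed; the app name and entry point below are guesses, not part of the original:

# Hypothetical shared preamble (names inferred from usage in the snippets).
import sys
import collections
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, functions

import utilities  # project-local: REVIEWS_SCHEMA, get_completereviews_dataframe, ...

spark = SparkSession.builder.appName('amazon reviews analysis').getOrCreate()
sc = spark.sparkContext

# typical entry point at the bottom of each script:
# if __name__ == '__main__':
#     main(sys.argv[1])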
Example #2
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA, encoding='utf-8')

    reviews_filtered = reviews_df.filter(
        reviews_df.product_id.isNotNull()).filter(
            reviews_df.total_votes > 1).filter(reviews_df.helpful_votes > 0)

    helpful_df = reviews_filtered.withColumn(
        'helpfulness_percentage',
        (reviews_filtered.helpful_votes / reviews_filtered.total_votes) *
        100).select('customer_id', 'helpfulness_percentage').cache()

    aggregations = [
        functions.count('*').alias('num_reviews'),
        functions.mean('helpfulness_percentage').alias(
            'mean_helpfulness_percentage')
    ]

    reviews_count_df = helpful_df.groupBy('customer_id').agg(*aggregations)

    reviews_count_df = reviews_count_df.filter(
        reviews_count_df.num_reviews > 150).filter(
            reviews_count_df.mean_helpfulness_percentage > 90)

    reviews_count_df.sort(
        'num_reviews',
        ascending=False).write.mode('overwrite').csv('influential_reviewers')
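The commented-out spark.read.csv alternatives hint at what utilities.get_completereviews_dataframe does. A sketch under that assumption, with the schema trimmed to the columns these snippets actually touch (the real REVIEWS_SCHEMA presumably covers the full Amazon Customer Reviews column set, and the default path below is made up):

from pyspark.sql import types

# Illustrative subset of the schema; the real one likely has more fields.
REVIEWS_SCHEMA = types.StructType([
    types.StructField('customer_id', types.StringType()),
    types.StructField('product_id', types.StringType()),
    types.StructField('product_category', types.StringType()),
    types.StructField('star_rating', types.IntegerType()),
    types.StructField('helpful_votes', types.IntegerType()),
    types.StructField('total_votes', types.IntegerType()),
    types.StructField('review_body', types.StringType()),
])

def get_completereviews_dataframe(spark, path='reviews/'):
    # Read the tab-separated review dump with an explicit schema.
    return spark.read.csv(path, sep='\t', schema=REVIEWS_SCHEMA, encoding='utf-8')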
Example #3
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull())

    aggregated_df = reviews_df.groupBy(reviews_df.product_category).agg(
        functions.avg('star_rating').alias('average')).cache()
    aggregated_df.write.mode('overwrite').csv('category_wise_average')

    aggregated_dict_high = aggregated_df.sort(aggregated_df.average, ascending=False).rdd.collectAsMap()
    x_values_high = list(aggregated_dict_high.keys())
    labels_high = list(aggregated_dict_high.values())
    y_values_high = range(len(labels_high))

    plt.barh(y_values_high, labels_high)
    plt.yticks(y_values_high, x_values_high)
    plt.xlabel('Average star rating')
    plt.ylabel('Categories')
    plt.title('Category-wise average ratings')

    # aggregated_dict_low = aggregated_df.sort(aggregated_df.average).rdd.collectAsMap()
    # x_values_low = list(aggregated_dict_low.keys())
    # labels_low = list(aggregated_dict_low.values())
    # y_values_low = range(len(labels_low))
    #
    # plt.subplot(2, 1, 2)
    # plt.barh(y_values_low, labels_low)
    # plt.yticks(y_values_low, x_values_low, rotation='30')
    # plt.xlabel('Star Rating of an average product')
    # plt.ylabel('Categories')
    # plt.title('Categories with lowest average rating')

    plt.show()
Example #4
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA).dropna()
    reviews_df.cache()

    reviews_df = reviews_df.filter(
        reviews_df.product_id.isNotNull()).withColumn(
            'length_words', word_count(reviews_df.review_body))
    avg_length_df = reviews_df.groupBy('product_category').agg(
        functions.avg('length_words').alias('avg_length_words')).select(
            'product_category', 'avg_length_words').cache()
    avg_length_df.repartition(1).write.mode('overwrite').csv(
        'avg_length_per_category')

    avg_length_dict = avg_length_df.rdd.collectAsMap()
    avg_length_ddict = collections.OrderedDict(
        sorted(avg_length_dict.items(), reverse=True))

    x_values_high = list(avg_length_ddict.keys())
    labels_high = list(avg_length_ddict.values())
    y_values_high = range(len(labels_high))

    plt.barh(y_values_high, labels_high, color='g')
    plt.yticks(y_values_high, x_values_high)

    plt.xlabel('Average Review Length in words')
    plt.ylabel('Category')
    plt.title('Average Length across categories')
    plt.show()
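word_count is used but never defined in these snippets; from its call sites it is a UDF that counts the words in review_body. A minimal sketch, assuming plain whitespace tokenization:

from pyspark.sql import functions, types

@functions.udf(returnType=types.IntegerType())
def word_count(review_body):
    # Number of whitespace-separated tokens; 0 for null/empty bodies.
    return len(review_body.split()) if review_body else 0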
Example #5
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA, encoding='utf-8')

    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(
            reviews_df.helpful_votes > 0).cache()

    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.withColumn(
        'length_words',
        word_count(reviews_df.review_body)).select('helpfulness_percentage',
                                                   'star_rating',
                                                   'length_words')

    # DataFrame.corr computes the Pearson correlation coefficient.
    helpful_star_rating_corr = helpful_df.corr('helpfulness_percentage',
                                               'star_rating')
    length_rating_corr = helpful_df.corr('length_words', 'star_rating')
    helpfulness_length_corr = helpful_df.corr('helpfulness_percentage',
                                              'length_words')

    print('helpfulness vs star rating:', helpful_star_rating_corr)
    print('review length vs star rating:', length_rating_corr)
    print('helpfulness vs review length:', helpfulness_length_corr)
Example #6
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(reviews_df.helpful_votes > 0).cache()
    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    # keep only the most helpful reviews
    helpful_df = helpful_df.filter(
        helpful_df.helpfulness_percentage > 90).select('review_body')

    helpful_df = helpful_df.withColumn('word_count', word_count(helpful_df.review_body)).cache()

    ## median of most helpful reviews
    median = helpful_df.approxQuantile('word_count', [0.5], 0.25)
    sc.parallelize(median).repartition(1).saveAsTextFile('median_words_in_helpfulreviews.csv')
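    # Caveat: relativeError=0.25 above is a loose bound, so the reported
    # median can be noticeably off. relativeError=0.0 computes the exact
    # quantile at higher cost (documented Spark behavior):
    #exact_median = helpful_df.approxQuantile('word_count', [0.5], 0.0)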

    ## average, std of most helpful reviews
    aggregations = [
        functions.avg('word_count').alias('average_wordcount'),
        functions.stddev('word_count').alias('std_wordcount')
    ]

    stats_df = helpful_df.agg(*aggregations)
    stats_df.repartition(1).write.mode('overwrite').csv('average_stddev_wordcount_mosthelpful')


    count = reviews_df.filter(reviews_df.customer_id.isNotNull()).select(
        'customer_id').agg(functions.countDistinct('customer_id'))
    count.repartition(1).write.mode('overwrite').csv('total_customersin_dataset')

    product_count = reviews_df.filter(reviews_df.product_id.isNotNull()).select(
        'product_id').agg(functions.countDistinct('product_id'))
    product_count.repartition(1).write.mode('overwrite').csv('total_productsin_dataset')
Example #7
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    # Keep only reviews with more than one total vote and at least one helpful
    # vote; this also avoids division by zero in the percentage below.
    helpful_df = reviews_df.filter(reviews_df.product_id.isNotNull()).filter(
        reviews_df.total_votes > 1).filter(
            reviews_df.helpful_votes > 0).cache()

    helpful_df = helpful_df.withColumn(
        'helpfulness_percentage',
        (helpful_df.helpful_votes / helpful_df.total_votes) * 100)
    helpful_df = helpful_df.select('helpfulness_percentage',
                                   'product_category')

    helpful_df = helpful_df.groupBy('product_category').agg(
        functions.mean(helpful_df.helpfulness_percentage).alias(
            'mean_helpfulness_percent'))
    helpful_df.write.mode('overwrite').csv('category_mean_helpfulness')

    # Collecting is safe here: the result is tiny, one row per product category.
    aggregated_dict_high = helpful_df.sort(helpful_df.mean_helpfulness_percent,
                                           ascending=False).rdd.collectAsMap()

    x_values_high = list(aggregated_dict_high.keys())
    labels_high = list(aggregated_dict_high.values())
    y_values_high = range(len(labels_high))

    plt.barh(y_values_high, labels_high)
    plt.yticks(y_values_high, x_values_high)
    plt.xlabel('Helpfulness percentage of an average review')
    plt.ylabel('Categories')
    plt.title('Categories with most helpful reviews')

    plt.show()
Example #8
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    # nltk.data.path.append('/home/youruserid/nltk_data')
    # reviews_df = spark.read.parquet(inputs)
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull())

    positive_reviews_df = reviews_df.filter(reviews_df.star_rating > 3)
    negative_reviews_df = reviews_df.filter(reviews_df.star_rating < 3)
    neutral_reviews_df = reviews_df.filter(reviews_df.star_rating == 3)

    save_adjectives(positive_reviews_df, 'positive_adjectives')
    save_adjectives(negative_reviews_df, 'negative_adjectives')
    save_adjectives(neutral_reviews_df, 'neutral_adjectives')
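save_adjectives is defined elsewhere; the commented nltk.data.path line suggests it relies on NLTK part-of-speech tagging. A rough sketch of what it might look like, assuming Penn Treebank tags (the 'JJ' prefix marks adjectives) and CSV output of adjective counts; the real helper's logic and output format are unknown:

import nltk
from pyspark.sql import functions, types

@functions.udf(returnType=types.ArrayType(types.StringType()))
def extract_adjectives(review_body):
    # POS-tag the review body and keep adjectives (JJ, JJR, JJS).
    # NLTK tokenizer and tagger data must be available on the workers.
    tokens = nltk.word_tokenize(review_body) if review_body else []
    return [word for word, tag in nltk.pos_tag(tokens) if tag.startswith('JJ')]

def save_adjectives(df, output):
    adjectives = df.select(
        functions.explode(extract_adjectives(df.review_body)).alias('adjective'))
    counts = adjectives.groupBy('adjective').count()
    counts.sort('count', ascending=False).write.mode('overwrite').csv(output)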
Example #9
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull()).select(
        reviews_df.star_rating).cache()
    ratings_dict = reviews_df.groupBy('star_rating').agg(
        functions.count('star_rating').alias('count')).rdd.collectAsMap()

    x_array = list(ratings_dict.keys())
    y_array = list(ratings_dict.values())

    plt.bar(x_array, y_array)
    plt.xlabel('Ratings of products on Amazon')
    plt.ylabel('count')
    plt.title('Count of each rating')
    # Save before show(): once the window is closed, savefig may write an
    # empty figure.
    plt.savefig('../../figures/ratings_distribution.png')
    plt.show()
Example #10
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)

    #find average rating score over all reviews
    reviews_df = reviews_df.filter(reviews_df.product_id.isNotNull()).select(reviews_df.star_rating).cache()
    mean = reviews_df.agg(functions.avg(reviews_df.star_rating).alias('mean'))
    mean.write.mode('overwrite').csv('average')

    #find median
    median = reviews_df.approxQuantile('star_rating', [0.5], 0.25)
    sc.parallelize(median).saveAsTextFile('median.csv')
    print("The median of the dataset is : " + str(median))

    #find mode
    mode_dict = reviews_df.groupBy('star_rating').agg(
        functions.count('star_rating').alias('count')).rdd.collectAsMap()
    mode = keyWithMaxValue(mode_dict)
    sc.parallelize([mode]).saveAsTextFile('mode.csv')
    print("The mode of the dataset: " + str(mode))
Example #11
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark)
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA)
    reviews_df.cache()

    product_count_df = reviews_df.filter(
        reviews_df.customer_id.isNotNull()).groupBy('product_category').agg(
            functions.countDistinct('product_id').alias('product_count'))
    product_count_df.write.mode('overwrite').csv('product_count_categorywise')
    product_count = product_count_df.rdd.collectAsMap()
    aggregated_dict = collections.OrderedDict(sorted(product_count.items(), reverse=True))

    x_values_high = list(aggregated_dict.keys())
    labels_high = list(aggregated_dict.values())
    y_values_high = range(len(labels_high))

    plt.barh(y_values_high, labels_high, color='g')
    plt.yticks(y_values_high, x_values_high)

    plt.xlabel('Number of distinct products')
    plt.ylabel('Category')
    plt.title('Product count across categories')
    plt.show()
Example #12
def main(inputs):
    reviews_df = utilities.get_completereviews_dataframe(spark).dropna()
    #reviews_df = spark.read.csv(sep='\t', path=inputs, schema=utilities.REVIEWS_SCHEMA).dropna()
    reviews_df.cache()

    reviews_df = reviews_df.filter(
        reviews_df.product_id.isNotNull()).withColumn(
            'length_words', word_count(reviews_df.review_body))
    avg_length_df = reviews_df.groupBy('star_rating').agg(
        functions.avg('length_words').alias('avg_length_words')).select(
            'star_rating', 'avg_length_words').cache()
    avg_length_df.write.mode('overwrite').csv('avg_length_per_rating')

    avg_length_dict = avg_length_df.rdd.collectAsMap()

    x_array = list(avg_length_dict.keys())
    y_array = list(avg_length_dict.values())

    plt.bar(x_array, y_array, color='g')
    plt.xlabel('Ratings of products on Amazon')
    plt.ylabel('Avg length in words')
    plt.title('Average length vs ratings')
    plt.show()