Пример #1
0
def main():
    """Print basic movie statistics and plot a histogram of movie ages.

    Loads the movie records through ``get_movie_data()``, prints the first
    record and the total record count, then parses the third
    ``|``-separated field of every record with ``convert_year``.  Each
    remaining year is turned into an age (``1998 - year``), the ages are
    counted, and the counts are passed to ``plt.hist`` with the distinct
    ages used as bin edges.
    """
    movie_data = get_movie_data()
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(movie_data.first())
    num_movies = movie_data.count()
    print("Movies: %d" % num_movies)

    movie_fields = movie_data.map(lambda line: line.split("|"))
    years = movie_fields.map(lambda fields: convert_year(fields[2]))
    # 1900 is treated as the marker of a 'bad' data point and dropped
    # (presumably the value convert_year yields for unparseable dates --
    # confirm against its definition).
    years_filtered = years.filter(lambda year: year != 1900)

    # The result is used as a mapping {age: number of movies}.
    movie_ages = years_filtered.map(lambda year: 1998 - year).countByValue()
    # Materialise plain lists: on Python 3 .values()/.keys() are views.
    # The bin edges are sorted because hist() needs them in increasing order.
    values = list(movie_ages.values())
    bins = sorted(movie_ages)
    # 'density' is the replacement for the 'normed' keyword, which current
    # matplotlib releases no longer accept.
    plt.hist(values, bins=bins, color='lightblue', density=True)
    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)
    plt.show()
Пример #2
0
def main():
    """Print basic movie statistics and plot a histogram of movie ages.

    Loads the movie records through ``get_movie_data()``, prints the first
    record, the record count and the number of ``|``-separated fields in
    the first record.  The third field of every record is parsed with
    ``convert_year``, turned into an age (``1998 - year``) and counted; the
    sorted distinct ages are printed and used as the bin edges of the
    histogram drawn from the per-age counts.
    """
    movie_data = get_movie_data()

    print(movie_data.first())
    num_movies = movie_data.count()
    print("Movies: %d" % num_movies)

    # The UDF is registered for a DataFrame/SQL variant of this example.
    # That variant is not used here because of a pyspark 2.0.0 bug
    # (https://issues.apache.org/jira/browse/SPARK-17538), so the rest of
    # the function stays on the RDD API.
    spark.udf.register("convert_year", convert_year)

    movie_fields = movie_data.map(lambda line: line.split("|"))
    print(len(movie_fields.first()))
    years = movie_fields.map(lambda fields: convert_year(fields[2]))
    # 1900 is treated as the marker of a 'bad' data point and dropped.
    years_filtered = years.filter(lambda year: year != 1900)

    # The result is used as a mapping {age: number of movies}.
    movie_ages = years_filtered.map(lambda year: 1998 - year).countByValue()
    values = list(movie_ages.values())
    # Sort the handful of distinct ages locally instead of sorting the whole
    # RDD and relying on the insertion order of the returned dict; hist()
    # needs the bin edges in increasing order.
    bins = sorted(movie_ages)
    print("")
    print(bins)
    plt.hist(values, bins=bins, color='lightblue', density=True)
    plt.xticks(fontsize=12)
    plt.show()
def main():
    """Print summary statistics for the ratings data set.

    Loads the raw ratings through ``get_rating_data()``, prints the first
    record and the number of ratings, then parses the third tab-separated
    field of every record as an integer rating.  Reports the minimum,
    maximum, mean and median rating as well as the average number of
    ratings per user and per movie (using the record counts of
    ``get_user_data()`` and ``get_movie_data()``).
    """
    rating_data_raw = get_rating_data()
    print(rating_data_raw.first())
    num_ratings = rating_data_raw.count()
    print("Ratings: %d" % num_ratings)
    num_movies = get_movie_data().count()
    num_users = get_user_data().count()

    ratings = rating_data_raw.map(lambda line: int(line.split("\t")[2]))
    max_rating = ratings.reduce(max)
    min_rating = ratings.reduce(min)
    mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
    median_rating = np.median(ratings.collect())
    # Force true division, as for mean_rating, so the averages are not
    # truncated to whole numbers when run under Python 2.
    ratings_per_user = num_ratings / float(num_users)
    ratings_per_movie = num_ratings / float(num_movies)

    print("Min rating: %d" % min_rating)
    print("Max rating: %d" % max_rating)
    print("Average rating: %2.2f" % mean_rating)
    print("Median rating: %d" % median_rating)
    print("Average # of ratings per user: %2.2f" % ratings_per_user)
    print("Average # of ratings per movie: %2.2f" % ratings_per_movie)