예제 #1
0
def main():
    """Show a bar chart of the number of users per occupation, ascending.

    Groups the data returned by ``get_user_data()`` by its ``occupation``
    column, sorts the groups by their row count and displays the chart with
    ``plt.show()``. Returns nothing.
    """
    user_data = get_user_data()
    occupation_rows = user_data.groupby("occupation").count().collect()

    # The 'count' column is read through an explicit __getattr__ call.
    # NOTE(review): presumably plain ``row.count`` would resolve to the
    # sequence method of the row object rather than the column - confirm.
    user_occ_list = [
        (row.occupation, row.__getattr__('count')) for row in occupation_rows
    ]
    occupations = np.array([name for name, _ in user_occ_list])
    counts = np.array([total for _, total in user_occ_list])

    # One permutation reorders labels and counts together.
    order = np.argsort(counts)
    x_axis = occupations[order]
    y_axis = counts[order]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    # Ticks go to the middle of each bar: left edge plus half the bar width.
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # align='edge' anchors every bar's left edge at ``pos`` so the tick
    # offsets above hit the bar centres; the library default alignment is
    # 'center' in matplotlib >= 2.0, which would shift the labels.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=45, fontsize='9')
    # Extra bottom margin for the rotated tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)

    plt.show()
예제 #2
0
def main():
    """Show a bar chart of the number of users per occupation, ascending.

    Groups the data returned by ``get_user_data()`` by its ``occupation``
    column, sorts the groups by their row count and displays the chart with
    ``plt.show()``. Returns nothing.
    """
    user_data = get_user_data()
    user_occ = user_data.groupby("occupation").count().collect()

    # Iterate over every collected row. The previous index loop stopped at
    # len - 2 and silently dropped the last occupation group.
    # NOTE(review): 'count' is read through an explicit __getattr__ call;
    # presumably plain ``row.count`` would resolve to the sequence method of
    # the row object rather than the column - confirm.
    user_occ_list = [
        (element.occupation, element.__getattr__('count'))
        for element in user_occ
    ]
    x_axis1 = np.array([c[0] for c in user_occ_list])
    y_axis1 = np.array([c[1] for c in user_occ_list])

    # Compute the sort permutation once and apply it to both arrays.
    sorted_ind = np.argsort(y_axis1)
    x_axis = x_axis1[sorted_ind]
    y_axis = y_axis1[sorted_ind]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    # Ticks go to the middle of each bar: left edge plus half the bar width.
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # align='edge' anchors every bar's left edge at ``pos`` so the tick
    # offsets above hit the bar centres; the library default alignment is
    # 'center' in matplotlib >= 2.0, which would shift the labels.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=45, fontsize='9')
    # Extra bottom margin for the rotated tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)

    plt.show()
예제 #3
0
def main():
    """Show a normalised 20-bin histogram of user ages.

    Splits each record returned by ``get_user_data()`` on ``"|"``, converts
    the second field to ``int`` (used here as the age), collects the values
    and displays the histogram with ``plt.show()``. Returns nothing.
    """
    user_data = get_user_data()
    user_fields = user_data.map(lambda line: line.split("|"))
    ages = user_fields.map(lambda fields: int(fields[1])).collect()

    # ``density`` replaces the ``normed`` keyword, which was deprecated and
    # later removed from ``hist``; both normalise the bars to unit area.
    plt.hist(ages, bins=20, color='lightblue', density=True)

    fig = plt.gcf()
    fig.set_size_inches(16, 10)
    plt.show()
예제 #4
0
def main():
    """Print how many users, genders, occupations and zip codes the data has.

    The user total comes from ``count()`` on the data returned by
    ``get_user_data()``; each of the other figures is the number of groups
    collected after grouping by the corresponding column.
    """
    users = get_user_data()
    total_users = users.count()

    # One collected row per distinct value of the grouping column.
    gender_groups = users.groupBy("gender").count().collect()
    occupation_groups = users.groupBy("occupation").count().collect()
    zipcode_groups = users.groupby("zipCode").count().collect()

    summary = [
        ("Users: ", total_users),
        ("Genders: ", len(gender_groups)),
        ("Occupation: ", len(occupation_groups)),
        ("ZipCodes: ", len(zipcode_groups)),
    ]
    for label, value in summary:
        print(label + str(value))
예제 #5
0
def main():
    """Show a normalised 20-bin histogram of the ``age`` column.

    Selects ``age`` from the data returned by ``get_user_data()``, collects
    the rows and displays the histogram with ``plt.show()``. Returns nothing.
    """
    user_data = get_user_data()
    user_ages = user_data.select('age').collect()
    user_ages_list = [row.age for row in user_ages]

    # ``density`` replaces the ``normed`` keyword, which was deprecated and
    # later removed from ``hist``; both normalise the bars to unit area.
    plt.hist(user_ages_list, bins=20, color='lightblue', density=True)

    fig = plt.gcf()
    fig.set_size_inches(16, 10)
    plt.show()
def main():
    """Print the number of users and of distinct genders, occupations, ZIP codes.

    Each record returned by ``get_user_data()`` is split on ``"|"``; fields
    0, 2, 3 and 4 are used as user, gender, occupation and ZIP code
    respectively.
    """
    user_data = get_user_data()
    # NOTE(review): the result of first() is discarded; kept as in the
    # original in case the call is wanted for its side effect - confirm
    # whether it can be removed.
    user_data.first()

    user_fields = user_data.map(lambda line: line.split("|"))
    num_users = user_fields.map(lambda fields: fields[0]).count()
    num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
    num_occupations = user_fields.map(
        lambda fields: fields[3]).distinct().count()
    num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()

    # Function-call form of print: the statement form is a syntax error on
    # Python 3, while a single parenthesised argument prints identically on
    # Python 2.
    print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d" %
          (num_users, num_genders, num_occupations, num_zipcodes))
def main():
    """Print summary statistics of the rating data.

    Prints the first raw record and the number of ratings, then the minimum,
    maximum, mean and median rating, and the average number of ratings per
    user and per movie. Each raw record is split on a tab and its third
    field is converted to ``int`` as the rating value.
    """
    rating_data_raw = get_rating_data()
    print(rating_data_raw.first())
    num_ratings = rating_data_raw.count()
    print("Ratings: %d" % num_ratings)
    num_movies = get_movie_data().count()
    num_users = get_user_data().count()

    rating_data = rating_data_raw.map(lambda line: line.split("\t"))
    ratings = rating_data.map(lambda fields: int(fields[2]))
    max_rating = ratings.reduce(lambda x, y: max(x, y))
    min_rating = ratings.reduce(lambda x, y: min(x, y))
    mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
    median_rating = np.median(ratings.collect())

    # Divide by floats: with integer operands Python 2 truncates the
    # quotient, which made the "%2.2f" averages below always end in ".00".
    ratings_per_user = num_ratings / float(num_users)
    ratings_per_movie = num_ratings / float(num_movies)

    print("Min rating: %d" % min_rating)
    print("Max rating: %d" % max_rating)
    print("Average rating: %2.2f" % mean_rating)
    print("Median rating: %d" % median_rating)
    print("Average # of ratings per user: %2.2f" % ratings_per_user)
    print("Average # of ratings per movie: %2.2f" % ratings_per_movie)
예제 #8
0
def main():
    """Show a bar chart of the number of users per occupation, ascending.

    Splits each record returned by ``get_user_data()`` on ``"|"``, counts
    records per value of the fourth field (used here as the occupation) with
    ``reduceByKey``, sorts by count and displays the chart with
    ``plt.show()``. Returns nothing.
    """
    user_data = get_user_data()
    user_fields = user_data.map(lambda line: line.split("|"))
    count_by_occupation = (
        user_fields
        .map(lambda fields: (fields[3], 1))
        .reduceByKey(lambda x, y: x + y)
        .collect()
    )
    x_axis1 = np.array([c[0] for c in count_by_occupation])
    y_axis1 = np.array([c[1] for c in count_by_occupation])

    # Compute the sort permutation once and apply it to both arrays.
    sorted_ind = np.argsort(y_axis1)
    x_axis = x_axis1[sorted_ind]
    y_axis = y_axis1[sorted_ind]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    # Ticks go to the middle of each bar: left edge plus half the bar width.
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # align='edge' anchors every bar's left edge at ``pos`` so the tick
    # offsets above hit the bar centres; the library default alignment is
    # 'center' in matplotlib >= 2.0, which would shift the labels.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=30)

    fig = plt.gcf()
    fig.set_size_inches(20, 10)
    plt.show()