def main():
    """Plot a bar chart of user counts per occupation, in ascending order.

    Groups the dataset returned by ``get_user_data()`` by its
    ``occupation`` column, collects the per-group counts and displays them
    with matplotlib.
    """
    user_data = get_user_data()
    occupation_rows = user_data.groupby("occupation").count().collect()

    # Read the aggregated column by key: assuming these are pyspark Row
    # objects (tuple subclasses), ``row.count`` would resolve to the
    # ``tuple.count`` method rather than to the "count" field.
    occupations = np.array([row.occupation for row in occupation_rows])
    counts = np.array([row['count'] for row in occupation_rows])

    # Order the bars from the smallest to the largest count.
    order = np.argsort(counts)
    x_axis = occupations[order]
    y_axis = counts[order]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # The ticks sit at ``pos + width / 2``, which is the middle of a bar
    # only when its left edge is at ``pos``.  Pin ``align='edge'`` so this
    # also holds on matplotlib >= 2.0, where the default is 'center'.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=45, fontsize='9')
    # Extra bottom margin for the rotated tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.show()
def main():
    """Show the number of users in each occupation as a sorted bar chart.

    The dataset from ``get_user_data()`` is grouped by ``occupation``; the
    resulting (occupation, count) pairs are sorted by count and drawn with
    matplotlib.
    """
    user_data = get_user_data()
    user_occ = user_data.groupby("occupation").count().collect()

    # Iterate over every collected row.  An index loop bounded by
    # ``len(user_occ) - 1`` would silently drop the last occupation.
    # The "count" field is read by key because, assuming pyspark Row
    # objects, ``row.count`` is the inherited ``tuple.count`` method.
    user_occ_list = [(row.occupation, row['count']) for row in user_occ]

    x_axis1 = np.array([pair[0] for pair in user_occ_list])
    y_axis1 = np.array([pair[1] for pair in user_occ_list])

    # Compute the ascending-count permutation once and reuse it for both
    # the labels and the values.
    order = np.argsort(y_axis1)
    x_axis = x_axis1[order]
    y_axis = y_axis1[order]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # ``align='edge'`` keeps each bar spanning [pos, pos + width] so that
    # the ticks placed at ``pos + width / 2`` are centred under the bars
    # regardless of the matplotlib default alignment.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=45, fontsize='9')
    # Extra bottom margin for the rotated tick labels.
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.show()
def main():
    """Display a 20-bin normalised histogram of user ages.

    Each record returned by ``get_user_data()`` is split on ``|`` and the
    field at index 1 is parsed as an integer age.
    """
    raw_records = get_user_data()
    split_records = raw_records.map(lambda record: record.split("|"))
    age_values = split_records.map(lambda cols: int(cols[1])).collect()

    # NOTE(review): newer matplotlib releases replace ``normed`` with
    # ``density`` -- confirm the installed version before upgrading.
    plt.hist(age_values, bins=20, color='lightblue', normed=True)

    figure = matplotlib.pyplot.gcf()
    figure.set_size_inches(16, 10)
    plt.show()
def main():
    """Print the user count and the distinct gender/occupation/zip counts.

    Works on the dataset returned by ``get_user_data()``, reading its
    ``gender``, ``occupation`` and ``zipCode`` columns.
    """
    user_data = get_user_data()

    num_users = user_data.count()

    # Count the distinct values where the data lives instead of collecting
    # one row per group to the driver just to take ``len()`` of the list.
    num_genders = user_data.select("gender").distinct().count()
    num_occupation = user_data.select("occupation").distinct().count()
    num_zipcodes = user_data.select("zipCode").distinct().count()

    print("Users: " + str(num_users))
    print("Genders: " + str(num_genders))
    print("Occupation: " + str(num_occupation))
    print("ZipCodes: " + str(num_zipcodes))
def main():
    """Display a 20-bin normalised histogram of the ``age`` column.

    Selects ``age`` from the dataset returned by ``get_user_data()``,
    collects the rows and plots their values with matplotlib.
    """
    user_data = get_user_data()
    user_ages = user_data.select('age').collect()

    # Pull the value out of each collected row directly; no index
    # bookkeeping is needed.
    user_ages_list = [row.age for row in user_ages]

    # NOTE(review): newer matplotlib releases replace ``normed`` with
    # ``density`` -- confirm the installed version before upgrading.
    plt.hist(user_ages_list, bins=20, color='lightblue', normed=True)

    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(16, 10)
    plt.show()
def main():
    """Print the number of users and distinct genders/occupations/ZIP codes.

    Each record returned by ``get_user_data()`` is split on ``|``; the
    fields at indices 2, 3 and 4 are counted as gender, occupation and
    ZIP code respectively.
    """
    user_data = get_user_data()
    user_fields = user_data.map(lambda line: line.split("|"))

    num_users = user_fields.map(lambda fields: fields[0]).count()
    num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
    num_occupations = user_fields.map(
        lambda fields: fields[3]).distinct().count()
    num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()

    # A single parenthesised argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print("Users: %d, genders: %d, occupations: %d, ZIP codes: %d" %
          (num_users, num_genders, num_occupations, num_zipcodes))
def main():
    """Print summary statistics for the rating data.

    Prints the first raw record, the total number of ratings, the
    min/max/mean/median rating, and the average number of ratings per user
    and per movie.  Each rating record is split on tabs and the field at
    index 2 is parsed as the integer rating.
    """
    rating_data_raw = get_rating_data()
    print(rating_data_raw.first())

    num_ratings = rating_data_raw.count()
    print("Ratings: %d" % num_ratings)

    num_movies = get_movie_data().count()
    num_users = get_user_data().count()

    rating_data = rating_data_raw.map(lambda line: line.split("\t"))
    ratings = rating_data.map(lambda fields: int(fields[2]))

    max_rating = ratings.reduce(lambda x, y: max(x, y))
    min_rating = ratings.reduce(lambda x, y: min(x, y))
    mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
    median_rating = np.median(ratings.collect())

    # Force true division: with two ints, ``/`` truncates on Python 2 and
    # the "%2.2f" averages below would lose their fractional part.
    ratings_per_user = num_ratings / float(num_users)
    ratings_per_movie = num_ratings / float(num_movies)

    print("Min rating: %d" % min_rating)
    print("Max rating: %d" % max_rating)
    print("Average rating: %2.2f" % mean_rating)
    print("Median rating: %d" % median_rating)
    print("Average # of ratings per user: %2.2f" % ratings_per_user)
    print("Average # of ratings per movie: %2.2f" % ratings_per_movie)
def main():
    """Plot a sorted bar chart of how many users share each occupation.

    Each record returned by ``get_user_data()`` is split on ``|``; the
    field at index 3 is used as the occupation key and its occurrences are
    summed with ``reduceByKey``.
    """
    user_data = get_user_data()
    user_fields = user_data.map(lambda line: line.split("|"))
    count_by_occupation = (
        user_fields
        .map(lambda fields: (fields[3], 1))
        .reduceByKey(lambda x, y: x + y)
        .collect()
    )

    x_axis1 = np.array([pair[0] for pair in count_by_occupation])
    y_axis1 = np.array([pair[1] for pair in count_by_occupation])

    # Compute the ascending-count permutation once and reuse it for both
    # the labels and the values.
    order = np.argsort(y_axis1)
    x_axis = x_axis1[order]
    y_axis = y_axis1[order]

    pos = np.arange(len(x_axis))
    width = 1.0

    ax = plt.axes()
    ax.set_xticks(pos + (width / 2))
    ax.set_xticklabels(x_axis)

    # ``align='edge'`` keeps each bar spanning [pos, pos + width] so that
    # the ticks placed at ``pos + width / 2`` are centred under the bars
    # regardless of the matplotlib default alignment.
    plt.bar(pos, y_axis, width, color='lightblue', align='edge')
    plt.xticks(rotation=30)

    fig = matplotlib.pyplot.gcf()
    fig.set_size_inches(20, 10)
    plt.show()