def smarter_topkstems_feature_vectors(video_ids, k=50):
    """Build k binary feature vectors from the top-k title stems.

    For each of the k most common title stems (with each video's own
    channel-title stems excluded), produces one vector over video_ids
    with 1 where the video's title contains that stem, else 0.

    Args:
        video_ids: iterable of video ids; its order defines the columns.
        k: number of top stems / feature vectors to produce.

    Returns:
        List of k lists, each len(video_ids) long, of 0/1 ints.
    """
    custom_ignore_stems = ["com"]  # stems to ignore across all channels
    titles = query_videos("SELECT id, channelTitle, title FROM videos;")
    wanted = set(video_ids)  # O(1) membership instead of O(n) list scans
    vid_to_stems = {}
    for vid, channel_title, title in titles:
        if vid in wanted:
            channel_stems = process_title(channel_title)
            vid_to_stems[vid] = process_title(
                title, stems_to_ignore=custom_ignore_stems + channel_stems)
    # collapse all values in vid_to_stems to a single list of all stems
    stems_list = [
        stem for stemlist in vid_to_stems.values() for stem in stemlist
    ]
    top_stems = Counter(stems_list).most_common(k)
    # create the vectors!
    feature_vecs = []
    # BUG FIX: most_common() yields (stem, count) pairs; the original
    # tested the whole tuple for membership, so every feature came out 0.
    for stem, _count in top_stems:
        # binary feature of whether a given stem is in the video's stems;
        # .get(v, ()) guards ids that never appeared in the query results
        feature_vec = [1 if stem in vid_to_stems.get(v, ()) else 0
                       for v in video_ids]
        feature_vecs.append(feature_vec)
    return feature_vecs
def feature_vector__plain_duration(video_ids=None):
    """Return a single feature vector of raw video durations.

    Only videos with a non-null viewCount are queried; the vector is
    wrapped in a list to match the list-of-vectors convention used by
    the other feature_vector__* helpers.
    """
    rows = query_videos(
        "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    videos = filter_durations(interpret_query_results(rows),
                              video_ids=video_ids)
    return [[video.duration for video in videos]]
def category_dict():
    """Return a dict mapping video id -> YouTube categoryId.

    (The query selects categoryId; the original comment claiming
    "duration in seconds" was stale.)
    """
    rows = query_videos("SELECT id, categoryId FROM videos;")
    return {vid: cat for vid, cat in rows}
def title_topkstems(video_ids, k=10):
    """Return the top k (stem, count) pairs from the titles of video_ids.

    Args:
        video_ids: iterable of video ids whose titles are considered.
        k: number of most-common stems to return.

    Returns:
        List of (stem, count) tuples, most common first.
    """
    titles = query_videos("SELECT id, title FROM videos;")
    # PERF: build the membership set once — `vid in video_ids` on a list
    # was an O(n) scan for every row in the table.
    wanted = set(video_ids)
    stems = Counter()
    for vid, title in titles:
        if vid in wanted:
            stems.update(process_title(title))
    return stems.most_common(k)
def feature_vector__distance_to_peak(video_ids=None):
    """Return one feature vector: |peak duration - video duration| per video.

    The "peak" is the x-coordinate produced by peak_point() over the
    filtered videos (only rows with a non-null viewCount are queried).
    """
    rows = query_videos(
        "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    videos = filter_durations(interpret_query_results(rows),
                              video_ids=video_ids)
    peak_x, _ = peak_point(videos)
    distances = [abs(peak_x - video.duration) for video in videos]
    return [distances]
def feature_vector(video_ids):
    """Return one-hot category feature vectors, one vector per category id.

    For every row in the videos table, each category's vector receives a
    1 if that row's categoryId matches the category, else 0.

    NOTE(review): video_ids is accepted but never used to filter rows —
    every video in the table contributes a column; confirm with callers.

    Returns:
        List of lists of 0/1 ints, one per known category id, in the
        order the ids are listed below.
    """
    # All category ids we track; each maps to its growing 0/1 column.
    known_ids = [1, 2, 10, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
                 41, 42, 43, 44]
    id_name = {cid: [] for cid in known_ids}
    category_results = query_videos("SELECT id, categoryId FROM videos;")
    for vid, cat in category_results:
        # BUG FIX: the original looped over the undefined name `id_names`
        # (NameError at runtime); it should walk the id_name dict itself.
        # (Loop variable renamed from `id`, which shadowed the builtin.)
        for cid in id_name:
            id_name[cid].append(1 if cat == cid else 0)
    return list(id_name.values())
def channeltitle_topkstems(video_ids, k=10, remove_single_occ=True):
    """Return up to k (stem, count) pairs from distinct channel titles.

    Distinct titles are used so the result isn't dominated by stems from
    the channels with the most uploads. When remove_single_occ is True
    (default), stems occurring only once are dropped, so fewer than k
    pairs may be returned.
    """
    rows = query_videos("SELECT DISTINCT channelTitle FROM videos;")
    counts = Counter()
    for (channel_title,) in rows:
        counts.update(process_title(channel_title))
    top = counts.most_common(k)
    if remove_single_occ:
        top = [(stem, n) for stem, n in top if n > 1]
    return top
def smarter_topkstems(video_ids, k=10):
    """Return top k title stems for video_ids, omitting channel-title stems.

    Like title_topkstems, but each video's channel-title stems (plus a
    small global ignore list) are excluded from its title's stems.

    Returns:
        List of (stem, count) tuples, most common first.
    """
    custom_ignore_stems = ["com"]  # stems to ignore across all channels
    rows = query_videos("SELECT id, channelTitle, title FROM videos;")
    counts = Counter()
    for vid, channel_title, title in rows:
        if vid in video_ids:
            ignore = custom_ignore_stems + process_title(channel_title)
            counts.update(process_title(title, stems_to_ignore=ignore))
    return counts.most_common(k)
def published_dict():
    """Return a dict mapping publish-hour bucket -> video count.

    Keys look like "13:00-13:59"; only hours with at least one video
    appear in the result.

    Returns:
        dict[str, int]: bucket label -> number of videos published in
        that hour of the day.
    """
    counts = {}
    published_results = query_videos("SELECT id, publishedAt FROM videos;")
    for vid, published in published_results:
        # The hour alone determines the bucket, replacing the original
        # 24-branch if/elif chain. This also fixes an edge case: times
        # with fractional seconds past 23:59:59 matched no branch in the
        # original chain and were silently dropped.
        hour = parser.parse(published).time().hour
        key = "%02d:00-%02d:59" % (hour, hour)
        counts[key] = counts.get(key, 0) + 1
    return counts
def duration_plot_averages():
    """Plot the average view count per publish-hour bucket as a bar chart.

    No params or return. Buckets videos (with non-null viewCount) by the
    hour of publishedAt ("HH:00-HH:59"), averages viewCount within each
    bucket, and shows a bar chart sorted by bucket label.

    NOTE(review): despite the name, this plots publish *time*, not
    duration — confirm the intended name with callers before renaming.
    """
    rows = query_videos(
        "SELECT id, publishedAt, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    # hour-bucket label -> list of view counts; the hour arithmetic
    # replaces the original 24-branch if/elif chain (and also counts
    # fractional seconds past 23:59:59, which the chain silently dropped).
    views_by_bucket = {}
    for _vid, published_at, view_count in rows:
        hour = parser.parse(published_at).time().hour
        key = "%02d:00-%02d:59" % (hour, hour)
        views_by_bucket.setdefault(key, []).append(view_count)
    published_to_viewavg = {
        bucket: sum(views) / len(views)
        for bucket, views in views_by_bucket.items()
    }
    x_avg = []
    y_avg = []
    # "HH:00-HH:59" labels sort lexicographically in hour order.
    for bucket, avg in sorted(published_to_viewavg.items()):
        x_avg.append(bucket)
        y_avg.append(avg)
    plt.bar(np.arange(len(x_avg)), np.array(y_avg), align='center',
            color='#2B8CBF')
    plt.xticks(np.arange(len(x_avg)), np.array(x_avg), rotation='vertical')
    plt.title("Average Views per Video for Each Published Time")
    plt.xlabel('Published Time')
    plt.ylabel('Average View Counts')
    plt.show()
) duration_objects = interpret_query_results(duration_results) video_id_objects = filter_durations(duration_objects, video_ids=video_ids) # print("testing feature vector,len(duration_objects), video_id_objects) peak_x, _ = peak_point(video_id_objects) feature_vector = [ abs(peak_x - video.duration) for video in video_id_objects ] return [feature_vector] if __name__ == '__main__': duration_results = query_videos( "SELECT id, duration, viewCount FROM videos WHERE viewCount IS NOT NULL;" ) duration_objects = interpret_query_results(duration_results) filtered_objects = filter_durations(duration_objects, dur_cutoff=4600, views_cutoff=None) grouped_durations = grouped_durations(filtered_objects, max_duration=4600) # Generate Points: x_scatter, y_scatter = points_from_durations(filtered_objects) x_grouped, y_grouped = points_from_durations(grouped_durations) x_poly, y_poly = points_for_polynomial_curve(grouped_durations) peak_x, peak_y = peak_point(grouped_durations) # Plot Points plt.title('Video Duration x Average Number of Views')
return feature_vecs def channeltitle_topkstems(video_ids, k=10, remove_single_occ=True): # returns the top stems from all (unique) channel titles; returns no more than k stems. # by default, returns only stems that have occurence greater than 1, so there may be less than k stems # > has to be unique bc then that'll just return stems of channel names w most uploads titles = query_videos("SELECT DISTINCT channelTitle FROM videos;") # assumes titles is a list of tuples, where each tuple contains the title at index 0 stems_list = [] for t_tuple in titles: t = t_tuple[0] stems_list.extend(process_title(t)) kmostcommon = Counter(stems_list).most_common(k) if not remove_single_occ: return kmostcommon else: return [(stem, count) for stem, count in kmostcommon if count > 1] if __name__ == '__main__': video_ids = [tup[0] for tup in query_videos("SELECT id FROM videos;")] stem_to_words, common_stem_counter = smarter_topkstems(video_ids, 50) print(common_stem_counter) plot_smarter_topkstems(stem_to_words, common_stem_counter) # print(channeltitle_topkstems(video_ids,50))
def category_plot_all():
    """Plot the distribution of videos across categories as a pie chart.

    No params or return. Counts videos (with non-null viewCount) per
    categoryId, drops a handful of high-volume categories, and shows a
    pie chart whose legend labels each remaining category with its share
    (rounded to one significant figure).
    """
    duration_results = query_videos(
        "SELECT id, categoryId, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    # TODO: get this info from youtube data API
    id_name = {
        1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music',
        15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies',
        19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging',
        22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment',
        25: 'News & Politics', 26: 'Howto & Style', 27: 'Education',
        28: 'Science & Technology', 29: 'Nonprofits & Activism',
        30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure',
        33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama',
        37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy',
        41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'
    }
    category_count = {}
    for vid, cat, views in duration_results:
        category_count[cat] = category_count.get(cat, 0) + 1
    print(category_count.get(2, 0))
    # Drop the excluded categories so the pie stays readable.
    # pop(..., None) avoids the KeyError the bare `del` raised whenever
    # one of these categories had no videos in the table.
    for excluded in (2, 29, 15, 10, 1, 17, 28, 25):
        category_count.pop(excluded, None)
    num_videos = sum(category_count.values())

    def round_to_1(x):
        # round to one significant figure for the legend percentages
        return round(x, -int(floor(log10(abs(x)))))

    categories = [
        id_name[key] + " " + str(round_to_1((value / num_videos) * 100)) + "%"
        for key, value in category_count.items()
    ]
    # BUG FIX: `vals` was only assigned inside commented-out bar-chart
    # code, so plt.pie raised NameError; compute it explicitly here.
    vals = list(category_count.values())
    patches, texts = plt.pie(vals, startangle=90)
    plt.legend(patches, categories, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
def category_plot_averages():
    """Plot the average view count per category as a labelled bar chart.

    No params or return. Groups viewCounts (non-null only) by category
    display name, averages each group, and shows a bar chart.
    """
    category_results = query_videos(
        "SELECT id, categoryId, viewCount FROM videos WHERE viewCount IS NOT NULL;"
    )
    # TODO: get this info from youtube data API
    id_name = {
        1: 'Film & Animation', 2: 'Autos & Vehicles', 10: 'Music',
        15: 'Pets & Animals', 17: 'Sports', 18: 'Short Movies',
        19: 'Travel & Events', 20: 'Gaming', 21: 'Videoblogging',
        22: 'People & Blogs', 23: 'Comedy', 24: 'Entertainment',
        25: 'News & Politics', 26: 'Howto & Style', 27: 'Education',
        28: 'Science & Technology', 29: 'Nonprofits & Activism',
        30: 'Movies', 31: 'Anime/Animation', 32: 'Action/Adventure',
        33: 'Classics', 34: 'Comedy', 35: 'Documentary', 36: 'Drama',
        37: 'Family', 38: 'Foreign', 39: 'Horror', 40: 'Sci-Fi/Fantasy',
        41: 'Thriller', 42: 'Shorts', 43: 'Shows', 44: 'Trailers'
    }
    # BUG FIX: ids 23 and 34 both map to 'Comedy'; keying the averages
    # dict by name made one category silently overwrite the other's
    # average. Grouping the raw view lists by name first averages their
    # combined views instead. Unknown categoryIds fall back to a
    # synthetic label rather than raising KeyError.
    name_to_viewlist = {}
    for vid, category, views in category_results:
        name = id_name.get(category, 'Category %s' % category)
        name_to_viewlist.setdefault(name, []).append(views)
    category_to_viewavg = {
        name: sum(viewlist) / len(viewlist)
        for name, viewlist in name_to_viewlist.items()
    }
    x = list(category_to_viewavg.keys())
    y_avg = [category_to_viewavg[name] for name in x]
    y_pos = np.arange(len(y_avg))
    plt.bar(y_pos, y_avg, align='center')
    plt.xticks(y_pos, x, rotation='vertical')
    plt.xlabel('Category')
    plt.ylabel('Average Number of Views')
    plt.title('Average Number of Views per Video for Each Category')
    plt.show()