import json
import os
import pickle
import subprocess as sp
from getpass import getpass

import numpy as np
import pandas as pd

# The wordcloud dependency is optional; start_analysis() checks whether it loaded.
try:
    from wordcloud import WordCloud
except ImportError:
    WordCloud = None

# Grapher is this project's plotting helper; the import path here is an assumption.
from grapher import Grapher


class Analysis:
    """Main class responsible for downloading and analyzing data.

    Parameters
    ----------
    path : str (default='data')
        The path to the directory where both raw and computed results
        should be stored.
    delay : float (default=0)
        Amount of time in seconds to wait between requests.

    Attributes
    ----------
    raw : str
        Path to the 'raw' directory inside self.path.
    ran : str
        Path to the 'ran' directory inside self.path.
    df : DataFrame
        Pandas DataFrame used to store compiled results.
    tags : list of list of str
        A list of tags for each downloaded video.
    grapher : Grapher
        Creates the interactive graphs portion of the analysis.
    seconds : int
        The sum of video durations.
    formatted_time : str
        Seconds converted to weeks/days/hours/minutes/seconds format.
    all_likes : Series
        Video that has the most likes without a single dislike.
    most_liked : Series
        Video with the most total likes.
    most_viewed : Series
        Video with the most total views.
    lowest_rating : Series
        Video with the lowest ratio of likes to total votes.
    most_disliked : Series
        Video with the most total dislikes.
    oldest_videos : DataFrame
        First 10 videos watched on the user's account.
    oldest_upload : Series
        Video with the oldest upload date to YouTube.
    HD : int
        The number of videos with high-definition resolution.
    UHD : int
        The number of videos with ultra-high-definition resolution.
    top_uploaders : Series
        The most-watched channel names with corresponding video counts.
    funny_counts : int
        The max number of times a video's description says the word 'funny'.
    funny : Series
        The 'funniest' video as determined by funny_counts.
    """

    def __init__(self, path='data', delay=0):
        self.path = path
        self.delay = delay
        self.raw = os.path.join(self.path, 'raw')
        self.ran = os.path.join(self.path, 'ran')
        self.df = None
        self.tags = None
        self.grapher = None
        self.seconds = None
        self.formatted_time = None
        self.all_likes = None
        self.most_liked = None
        self.most_viewed = None
        self.lowest_rating = None
        self.most_disliked = None
        self.oldest_videos = None
        self.oldest_upload = None
        self.HD = None
        self.UHD = None
        self.top_uploaders = None
        self.funny = None
        self.funny_counts = None

    def download_data(self):
        """Uses youtube-dl to download an individual json file for each video."""
        print("There's no data in this folder. Let's download some.")
        successful_login = False
        while not successful_login:
            successful_login = True
            user = input('Google username: ')
            pw = getpass('Google password: ')  # Prompt without echoing.
            files = os.path.join(self.raw, '%(autonumber)s')
            if not os.path.exists(self.raw):
                os.makedirs(self.raw)
            template = ('youtube-dl -u "{}" -p "{}" '
                        '-o "{}" --sleep-interval {} '
                        '--skip-download --write-info-json -i '
                        'https://www.youtube.com/feed/history')
            # Echo the command with the password masked before running the real one.
            fake = template.format(user, '[$PASSWORD]', files, self.delay)
            print(f'Executing youtube-dl command:\n\n{fake}\n')
            cmd = template.format(user, pw, files, self.delay)
            p = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.STDOUT, shell=True)
            while True:
                line = p.stdout.readline().decode('utf-8').strip()
                print(line)
                if line == 'WARNING: unable to log in: bad username or password':
                    successful_login = False
                if not line:
                    break

    def df_from_files(self):
        """Constructs a DataFrame from the downloaded json files.

        All json keys whose values are not lists are compiled into the
        dataframe, which is then saved as a csv file in the self.ran
        directory.
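
    # Resulting layout on disk (derived from the code above and from run() below):
    #   <path>/raw/00001.info.json, 00002.info.json, ...   one json file per video
    #   <path>/ran/df.csv                                   the compiled dataframe
    #   <path>/ran/tags.txt                                 pickled list of tag lists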
        The tags of each video are pickled and saved as tags.txt.
        """
        print('Creating dataframe...')
        num = len([name for name in os.listdir(self.raw)
                   if not name[0] == '.'])
        files = os.path.join(self.raw, '~.info.json')  # This is a weird hack:
        files = files.replace('~', '{:05d}')  # it lets path joining work on Windows.
        data = [json.load(open(files.format(i))) for i in range(1, num + 1)]
        # Pull the list-valued keys out of each record before building the frame.
        columns = ['formats', 'tags', 'categories', 'thumbnails']
        lists = [[], [], [], []]
        deletes = {k: v for k, v in zip(columns, lists)}
        for dt in data:
            for col, ls in deletes.items():
                ls.append(dt[col])
                del dt[col]
        self.df = pd.DataFrame(data)
        self.df['upload_date'] = pd.to_datetime(self.df['upload_date'],
                                                format='%Y%m%d')
        self.df.to_csv(os.path.join(self.ran, 'df.csv'))
        self.tags = deletes['tags']
        pickle.dump(self.tags, open(os.path.join(self.ran, 'tags.txt'), 'wb'))

    def make_wordcloud(self):
        """Generate the wordcloud file and save it to static/images/."""
        #plt.rcParams['figure.figsize'] = [24.0, 18.0]
        print('Creating wordcloud')
        flat_tags = [item for sublist in self.tags for item in sublist]
        wordcloud = WordCloud(width=1920, height=1080, relative_scaling=.5)
        wordcloud.generate(' '.join(flat_tags))
        wordcloud.to_file(os.path.join('static', 'images', 'wordcloud.png'))

    def check_df(self):
        """Load the dataframe and tags from disk, creating them if they don't exist."""
        if not os.path.exists(self.ran):
            os.makedirs(self.ran)
        df_file = os.path.join(self.ran, 'df.csv')
        if os.path.isfile(df_file):
            self.df = pd.read_csv(df_file, index_col=0, parse_dates=[-11])
            self.tags = pickle.load(
                open(os.path.join(self.ran, 'tags.txt'), 'rb'))
            self.df['upload_date'] = pd.to_datetime(self.df['upload_date'])
        else:
            self.df_from_files()

    def total_time(self):
        """Compute the total amount of time spent watching videos."""
        self.seconds = self.df.duration.sum()
        seconds = self.seconds
        intervals = (
            ('weeks', 604800),   # 60 * 60 * 24 * 7
            ('days', 86400),     # 60 * 60 * 24
            ('hours', 3600),     # 60 * 60
            ('minutes', 60),
            ('seconds', 1),
        )
        result = []
        for name, count in intervals:
            value = seconds // count
            if value:
                seconds -= value * count
                if value == 1:
                    name = name.rstrip('s')  # Singularize, e.g. '1 week'.
                result.append('{} {}'.format(value, name))
        self.formatted_time = ', '.join(result)
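
    # Illustration (the numbers are hypothetical, not from the original project):
    # a duration sum of 694861 seconds decomposes to
    # 1*604800 + 1*86400 + 1*3600 + 1*60 + 1, so formatted_time becomes
    # '1 week, 1 day, 1 hour, 1 minute, 1 second'.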

    def worst_videos(self):
        """Finds the lowest-rated and most-disliked videos."""
        self.df['total_votes'] = self.df['like_count'] + self.df['dislike_count']
        self.df['average_rating'] = self.df['like_count'] / self.df['total_votes']
        # Only consider videos that received at least one vote.
        df_voted = self.df[self.df['total_votes'] > 0]
        self.lowest_rating = df_voted.loc[df_voted['average_rating'].idxmin()]
        self.most_disliked = self.df.loc[self.df['dislike_count'].idxmax()]

    def best_videos(self):
        """Finds well-liked and highly viewed videos."""
        all_likes = self.df[(self.df['like_count'] > 0)
                            & (self.df['dislike_count'] == 0)]
        all_likes = all_likes.sort_values('like_count', ascending=False)
        try:
            self.all_likes = all_likes.iloc[0]
        except IndexError:
            # No video has likes without a single dislike.
            pass
        self.most_liked = self.df.loc[self.df['like_count'].idxmax()]
        self.most_viewed = self.df.loc[self.df['view_count'].idxmax()]

    def funniest_description(self):
        """Counts the number of times 'funny' appears in each description.

        Saves the top result.
        """
        funny_counts = []
        descriptions = []
        index = []
        for i, d in enumerate(self.df.description):
            try:
                funny_counts.append(d.lower().count('funny'))
                descriptions.append(d)
                index.append(i)
            except AttributeError:
                # Some videos have no description (NaN); skip them.
                pass
        funny_counts = np.array(funny_counts)
        funny_counts_idx = funny_counts.argmax()
        self.funny_counts = funny_counts[funny_counts_idx]
        if self.funny_counts > 0:
            self.funny = self.df.iloc[index[funny_counts_idx]]
        else:
            self.funny = ("Wait, 0? You're too cool to watch "
                          'funny videos on youtube?')

    def three_randoms(self):
        """Finds results for video resolutions, most popular channels,
        and the funniest video.
        """
        self.HD = self.df[(720 <= self.df.height)
                          & (self.df.height <= 1080)].shape[0]
        self.UHD = self.df[self.df.height > 1080].shape[0]
        self.top_uploaders = self.df.uploader.value_counts().head(n=15)
        self.funniest_description()

    def compute(self):
        print('Computing...')
        self.total_time()
        self.worst_videos()
        self.best_videos()
        self.oldest_videos = self.df[['title', 'webpage_url']].tail(n=10)
        self.oldest_upload = self.df.loc[self.df['upload_date'].idxmin()]
        self.three_randoms()

    def graph(self):
        self.grapher = Grapher(self.df, self.tags)
        self.grapher.average_rating()
        self.grapher.duration()
        self.grapher.views()
        self.grapher.gen_tags_plot()

    def start_analysis(self):
        self.check_df()
        if WordCloud is not None:
            self.make_wordcloud()
        self.compute()
        self.graph()

    def run(self):
        """Main function for downloading and analyzing data."""
        file1 = os.path.join(self.raw, '00001.info.json')
        some_data = os.path.isfile(file1)
        if not some_data:
            self.download_data()
            some_data = os.path.isfile(file1)
        if some_data:
            self.start_analysis()
        else:
            print('No data was downloaded.')
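
# A minimal usage sketch (assumes youtube-dl is installed and on the PATH, and
# that the script runs from the project root so the relative paths resolve):
if __name__ == '__main__':
    analysis = Analysis(path='data', delay=1)
    analysis.run()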