import datetime

from youtube_api import YouTubeDataAPI


def main():
    api_key = "YOUR_API_KEY"  # replace with your own key; never hard-code a real key
    yt = YouTubeDataAPI(api_key)

    # basic engagement statistics for one video
    zout = yt.get_video_metadata(video_id='k7DGeWlKu0Q')
    Num_likes = zout['video_like_count']
    Num_dislikes = zout['video_dislike_count']
    Num_comments = zout['video_comment_count']

    # top-level comments on the video
    zcomment = yt.get_video_comments(video_id='k7DGeWlKu0Q')
    Commentlist = [item['text'] for item in zcomment]

    # caption track (transcript) of the video
    zcaption = yt.get_captions(video_id='k7DGeWlKu0Q')
    Transcript = zcaption['caption']

    Result1 = {
        'Num_likes': Num_likes,
        'Num_dislikes': Num_dislikes,
        'Num_comments': Num_comments,
        'Transcript': Transcript,
        'Commentlist': Commentlist
    }

    # keyword search restricted to calendar year 2018
    Result2 = yt.search(q='vaccine',
                        published_after=datetime.datetime(2018, 1, 1),
                        published_before=datetime.datetime(2019, 1, 1))
    return Result1, Result2
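# A minimal usage sketch (not part of the original snippet): the keys read
# here match the Result1 dict built in main() above.
if __name__ == "__main__":
    video_stats, vaccine_videos = main()
    print(video_stats['Num_likes'], 'likes,', video_stats['Num_comments'], 'comments')
    print(len(vaccine_videos), "search results for 'vaccine'")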
datetime.datetime(2019, 6, 30)))  # tail of a truncated yt.search(...) call that produced searches_2019

# convert the search results into a pandas DataFrame
df_searches_2019 = pd.DataFrame(searches_2019)

# print data for the videos
pd.options.display.max_rows = 100
df_searches_2019.to_string(index=False)
df_searches_2019.iloc[:, 0].tolist()
df_searches_2019

# get more information about each video based on its video_id
searches_2019_more_info = []
video_id_2019 = df_searches_2019.iloc[:, 0]
for i in range(0, number_of_results):
    searches_2019_more_info_singular = yt.get_video_metadata(
        video_id_2019[i], part=['statistics', 'snippet'])
    searches_2019_more_info_singular['number'] = i
    searches_2019_more_info.append(searches_2019_more_info_singular)

# convert the results into a pandas DataFrame and display it
searches_2019_more_info = pd.DataFrame(searches_2019_more_info)
searches_2019_more_info

# Count how many times the description is empty, or contains one of the words
# "Para, Paralympic, Adaptive, Adapted, Disabled, Disability, Differently abled,
# Disability friendly, Wheelchair Accessible, Inclusive". The substring search
# is case-sensitive, so each term is also counted in lowercase.
None_2019 = 0
Para_2019 = 0
para_2019 = 0
Paralympic_2019 = 0
paralympic_2019 = 0
Adaptive_2019 = 0
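# A sketch of the case-insensitive counting the comment above describes:
# lowercase each description once instead of keeping two counters per word
# (the 'video_description' column name is assumed from the default parser).
keywords = ['para', 'paralympic', 'adaptive', 'adapted', 'disabled', 'disability',
            'differently abled', 'disability friendly', 'wheelchair accessible', 'inclusive']
keyword_counts = {kw: 0 for kw in keywords}
empty_count = 0
for desc in searches_2019_more_info['video_description'].fillna(''):
    text = desc.lower()
    if not text:
        empty_count += 1
    for kw in keywords:
        if kw in text:
            keyword_counts[kw] += 1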
import asyncio
import concurrent.futures
import json
import os
import pathlib
from datetime import datetime, timezone

import isodate
import pytz
from youtube_api import YouTubeDataAPI
from pytchat import LiveChatAsync
# Assumed imports: LiveChatAsync comes from pytchat; SuperChatLogProcessor,
# SuperchatCalculator, recount_money and superchat_wordcloud are project-local
# helpers that are not shown in this snippet.


class SuperchatArchiver:
    def __init__(self, vid_id, api_key, gen_WC=False, loop=None):
        self.cancelled = False
        self.loop = loop
        self.t_pool = concurrent.futures.ThreadPoolExecutor(max_workers=5)
        self.api_points_used = 1.0
        self.api = YouTubeDataAPI(api_key)  # uses 1 quota point to check the key
        self.videoid = vid_id
        self.channel_id = ""
        self.metadata = {}
        self.videoinfo = {}
        self.stats = []
        self.dict_list = []
        self.gen_wc = gen_WC
        self.clean_currency = {"¥": "JPY", "NT$": "TWD", "$": "USD", "CA$": "CAD",
                               "MX$": "MXN", "HK$": "HKD", "A$": "AUD", "£": "GBP",
                               "€": "EUR", "R$": "BRL", "₹": "INR", "\u20b1": "PHP"}
        self.metadata = self.get_video_info(self.videoid)
        self.api_points_used += 1.0
        self.running = True
        if self.metadata is not None:
            self.videoinfo = self.metadata
            self.channel_id = self.metadata["channelId"]
        else:
            exit(-1)
        pathlib.Path('./' + self.channel_id + '/sc_logs').mkdir(parents=True, exist_ok=True)
        pathlib.Path('./' + self.channel_id + '/vid_stats').mkdir(parents=True, exist_ok=True)

    def get_video_info(self, video_ID: str):
        try:
            response = self.api.get_video_metadata(video_id=video_ID, parser=None,
                                                   part=["liveStreamingDetails", "contentDetails", "snippet"])
            api_metadata = {"channel": response["snippet"]["channelTitle"],
                            "channelId": response["snippet"]["channelId"],
                            "id": video_ID,
                            "title": response["snippet"]["title"],
                            "live": response["snippet"]["liveBroadcastContent"],
                            "caught_while": response["snippet"]["liveBroadcastContent"],
                            "publishDateTime": datetime.strptime(response["snippet"]["publishedAt"] + " +0000",
                                                                 "%Y-%m-%dT%H:%M:%SZ %z").timestamp()}
            delta = isodate.parse_duration(response["contentDetails"]["duration"])
            api_metadata["length"] = delta.total_seconds()
            if 'liveStreamingDetails' in response.keys():
                api_metadata["liveStreamingDetails"] = {}
                for d in response["liveStreamingDetails"].keys():
                    if "Time" in d or "time" in d:
                        api_metadata["liveStreamingDetails"][d] = datetime.strptime(
                            response["liveStreamingDetails"][d] + " +0000",
                            "%Y-%m-%dT%H:%M:%SZ %z").timestamp()
            return api_metadata
        except Exception as e:
            print(self.videoid)
            print(e)
            return None

    async def async_get_video_info(self, video_ID: str):
        api_metadata = await self.loop.run_in_executor(self.t_pool, self.get_video_info, video_ID)
        self.api_points_used += 1.0
        self.channel_id = api_metadata["channelId"]
        return api_metadata

    def cancel(self):
        self.cancelled = True

    async def main(self):
        if not self.loop:
            self.loop = asyncio.get_running_loop()
        self.chat_err = True
        retries = 0
        while self.chat_err and not self.cancelled:
            if "liveStreamingDetails" in self.videoinfo.keys() or self.videoinfo["live"] != "none":
                self.stats.clear()
                # self.dict_list.clear()
                self.chat_err = False
                test_file = pathlib.Path(self.channel_id + "/sc_logs/" + self.videoid + ".txt")
                if test_file.is_file():
                    if test_file.stat().st_size > 2:
                        await self.log_output(self.videoinfo["channel"] + " - " + self.videoinfo["title"]
                                              + " already analyzed, skipping. Existing file size: "
                                              + str(test_file.stat().st_size) + " bytes")
                        continue
                f = open(self.channel_id + "/sc_logs/" + self.videoid + ".txt", "w")
                f_stats = open(self.channel_id + "/vid_stats/" + self.videoid + "_stats.txt", "w")
                analysis_ts = datetime.now(tz=pytz.timezone('Europe/Berlin'))
                await self.log_output("Started Analysis at: " + analysis_ts.isoformat())
                await self.log_output("Analyzing Video "
                                      + datetime.fromtimestamp(self.videoinfo["publishDateTime"], timezone.utc).isoformat()
                                      + " " + self.videoinfo["channel"] + " - " + self.videoinfo["title"]
                                      + " [" + self.videoid + "]")
                chat = LiveChatAsync(self.videoid, callback=self.display,
                                     processor=(SuperChatLogProcessor(), SuperchatCalculator()))
                while chat.is_alive() and not self.cancelled:
                    await asyncio.sleep(3)
                # when livestream chat parsing ends, fetch fresh metadata
                newmetadata = await self.async_get_video_info(self.videoid)
                if newmetadata["live"] in ["upcoming", "live"]:  # the livestream has not actually ended yet
                    await self.log_output(datetime.now(tz=pytz.timezone('Europe/Berlin')).isoformat()
                                          + ": Error! Chat monitor ended prematurely!")
                    await self.log_output(chat.is_alive())
                    self.chat_err = True
                if self.videoinfo["caught_while"] in ["upcoming", "live"]:
                    # use the newer metadata while rescuing certain fields from the old metadata
                    createdDateTime = self.videoinfo["publishDateTime"]
                    caught_while = self.videoinfo["caught_while"]
                    old_title = self.videoinfo["title"]
                    if newmetadata is not None:
                        self.videoinfo = newmetadata
                        self.videoinfo["createdDateTime"] = createdDateTime
                        self.videoinfo["caught_while"] = caught_while
                        if self.videoinfo["title"] != old_title:
                            self.videoinfo["old_title"] = old_title
                    else:
                        exit(-1)
                self.videoinfo["startedLogAt"] = analysis_ts.timestamp()
                await self.log_output("writing to files")
                f.write(json.dumps(self.dict_list))
                f.close()
                if self.chat_err:
                    self.stats.append(await self.loop.run_in_executor(self.t_pool, recount_money, self.dict_list))
                f_stats.write(json.dumps([self.videoinfo, self.stats[-1:]]))
                f_stats.close()
                if self.chat_err:
                    os.rename(f.name, f.name + ".err" + str(retries))
                    os.rename(f_stats.name, f_stats.name + ".err" + str(retries))
                    retries += 1
                if not self.chat_err and retries == 0 and self.gen_wc and len(self.dict_list) > 0:
                    await self.loop.run_in_executor(self.t_pool, self.generate_wordcloud, f.name)
            else:
                await self.log_output(self.videoinfo["title"] + " is not a broadcast recording or premiere")
                return  # avoid looping forever on non-broadcast videos

    async def display(self, data, amount):
        if len(data.items) > 0:
            for c in data.items:
                # data.items contains superchat messages - save them in a list
                # while also saving the calculated sums in a list
                if c.type == "superChat" or c.type == "superSticker":
                    if c.currency in self.clean_currency.keys():
                        c.currency = self.clean_currency[c.currency]
                    sc_datetime = datetime.fromtimestamp(c.timestamp / 1000.0, timezone.utc)
                    sc_weekday = sc_datetime.weekday()
                    sc_hour = sc_datetime.hour
                    sc_minute = sc_datetime.minute
                    sc_user = c.author.name
                    sc_userid = c.author.channelId
                    sc_message = c.message
                    sc_color = c.bgColor
                    sc_currency = c.currency.replace(u'\xa0', '')
                    sc_info = {"time": c.timestamp, "currency": sc_currency, "value": c.amountValue,
                               "weekday": sc_weekday, "hour": sc_hour, "minute": sc_minute,
                               "user": sc_user, "userid": sc_userid, "message": sc_message,
                               "color": sc_color, "debugtime": sc_datetime.isoformat()}
                    self.stats.append(amount)
                    self.dict_list.append(sc_info)
            await self.log_output(self.videoinfo["channel"] + " " + self.videoinfo["title"] + " "
                                  + data.items[-1].elapsedTime + " " + str(amount["amount_sc"]))

    def generate_wordcloud(self, filepath):
        wordcloudmake = superchat_wordcloud(filepath)
        wordcloudmake.generate()

    async def log_output(self, logmsg):
        await self.loop.run_in_executor(self.t_pool, print, logmsg)
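# A minimal driver sketch (an assumption, not part of the original file):
# run one archiver to completion with asyncio.
if __name__ == "__main__":
    archiver = SuperchatArchiver("VIDEO_ID", "YOUR_API_KEY", gen_WC=False)  # placeholder id/key
    asyncio.run(archiver.main())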
from youtube_api import YouTubeDataAPI

# topic information
TOPIC = 'Coronavirus'
SEARCH_TERMS = ['coronavirus']
# ['Judy Mikovits', 'Bill Gates coronavirus', 'QAnon coronavirus', 'Coronavirus Vaccination',
#  'Wuhan lab', 'bioweapon coronavirus', '5G coronavirus', 'coronavirus flu',
#  'dean koontz darkness']
## add developed search terms here
VIDEOS_PER_QUERY = 10

# check the YouTube API key
API_KEY = 'your api key'
yt_api = YouTubeDataAPI(API_KEY)
yt_api.verify_key()

Metadata = lambda vid_list: yt_api.get_video_metadata(vid_list)

'''
collect videos' metadata
'''
def get_metadata(vid_list, VID_SEEN):
    if not vid_list:
        return [], [0]
    try:
        metadata_list = Metadata(vid_list)
    except Exception as exc:
        print('>>> cannot retrieve the metadata_list for [{}...] as {}'.format(vid_list[0], exc))
        return [], [0]
    VID_SEEN = set(list(VID_SEEN))
    return metadata_list, [len(vid_list)]
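# A sketch of how get_metadata() above might be fed from a search (assumed
# flow; 'video_id' is the key used by youtube-data-api's parsed search results).
vid_seen = set()
search_results = yt_api.search(q=SEARCH_TERMS[0], max_results=VIDEOS_PER_QUERY)
vid_ids = [r['video_id'] for r in search_results]
metadata_list, counts = get_metadata(vid_ids, vid_seen)
print('retrieved metadata for {} of {} videos'.format(len(metadata_list), counts[0]))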
# For each playlist in the array, create a directory inside downloads and
# download each video into that directory
for playlist in playlists:
    playlist_dir = playlist['title']
    if not os.path.exists(playlist_dir):
        os.mkdir(playlist_dir)

    # Change directory so that we can save the videos in the proper place
    os.chdir(playlist_dir)

    # Download all episodes for this playlist
    playlist = yt.get_videos_from_playlist_id(playlist['playlist_id'])
    for videoix, v in enumerate(playlist):
        try:
            video = yt.get_video_metadata(v['video_id'])
            title = str(videoix) + '. ' + video['video_title']
            link = 'http://youtube.com/watch?v=' + video['video_id']
            try:
                with youtube_dl.YoutubeDL(
                        {'outtmpl': str(videoix + 1) + '. %(title)s.%(ext)s'}) as ydl:
                    result = ydl.download([link])
            except Exception as e:
                print("Exception on download: %s" % e)
        except Exception as e:
            print("Exception on get_video_metadata: %s" % e)

    # Switch back to the root dir of the project before the next playlist
    os.chdir(root_dir)
import asyncio
import concurrent.futures
import copy
import json
import logging
import os
import pathlib
from datetime import datetime, timezone
from decimal import Decimal

import asyncpg
import httpx
import isodate
import pytz
from youtube_api import YouTubeDataAPI
from pytchat import LiveChatAsync, exceptions
# Assumed imports: LiveChatAsync and exceptions come from pytchat (apparently a
# build exposing the client/exception_handler keyword arguments and a chat
# .exception attribute). SuperChatLogProcessor, SuperchatCalculator,
# recount_money, superchat_wordcloud and config.mylogger are project-local
# helpers not shown in this snippet; 'args' is a module-level argparse
# namespace from the original script.


class SuperchatArchiver:
    def __init__(self, vid_id, api_key, gen_WC=False, loop=None, file_suffix=".standalone.txt",
                 minutes_wait=30, retry_attempts=72, min_successful_attempts=2, logger=None):
        self.total_counted_msgs = 0
        self.total_new_members = 0
        self.max_retry_attempts = retry_attempts
        self.min_successful_attempts = min_successful_attempts
        self.file_suffix = file_suffix
        self.minutes_wait = minutes_wait
        self.started_at = None
        self.ended_at = None
        self.cancelled = False
        self.loop = loop
        self.t_pool = concurrent.futures.ThreadPoolExecutor(max_workers=100)
        self.api_points_used = 1.0
        self.api = YouTubeDataAPI(api_key)  # uses 1 quota point to check the key
        self.videoid = vid_id
        self.channel_id = ""
        self.metadata = {}
        self.videoinfo = {}
        self.donors = {}
        self.stats = []
        self.sc_msgs = set()
        self.sc_logs_list = []
        self.metadata_list = []
        self.gen_wc = gen_WC
        self.unique_donors = {}
        self.clean_currency = {"¥": "JPY", "NT$": "TWD", "$": "USD", "CA$": "CAD",
                               "MX$": "MXN", "HK$": "HKD", "A$": "AUD", "£": "GBP",
                               "€": "EUR", "R$": "BRL", "₹": "INR", "\u20b1": "PHP",
                               "\u20aa": "ILS"}
        self.metadata = self.get_video_info(self.videoid)
        self.api_points_used += 1.0
        self.total_member_msgs = 0
        self.running = True
        self.running_chat = None
        if self.metadata is not None:
            self.videoinfo = self.metadata
            self.videoinfo["retries_of_rerecording_had_scs"] = 0
            self.videoinfo["retries_of_rerecording"] = 0
            self.videoPostedAt = copy.deepcopy(self.videoinfo["publishDateTime"])
            self.channel_id = self.metadata["channelId"]
        else:
            self.videoPostedAt = 0
            self.channel_id = "privatted-deleted-memebershipped"
        self.skeleton_dict = {"channel": None, "channelId": None, "id": None, "title": None,
                              "live": None, "caught_while": None, "publishDateTime": None,
                              "length": None, "endedLogAt": None, "retries_of_rerecording": None,
                              "retries_of_rerecording_had_scs": None, "createdDateTime": None,
                              "liveStreamingDetails": {"scheduledStartTime": None,
                                                       "actualStartTime": None,
                                                       "actualEndTime": None}}
        self.sc_file = self.channel_id + "/sc_logs/" + self.videoid + ".txt" + self.file_suffix
        self.donor_file = self.channel_id + "/vid_stats/donors/" + self.videoid + ".txt" + self.file_suffix
        self.stats_file = self.channel_id + "/vid_stats/" + self.videoid + "_stats.txt" + self.file_suffix
        # await self.log_output((self.metadata, self.channel_id, self.videoid, self.file_suffix))
        pathlib.Path('./' + self.channel_id + '/vid_stats/donors').mkdir(parents=True, exist_ok=True)
        pathlib.Path('./' + self.channel_id + '/sc_logs').mkdir(parents=True, exist_ok=True)
        self.placeholders = 0
        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger(__name__)
            self.logger.setLevel(logging.DEBUG)
            fh = logging.FileHandler('./' + self.channel_id + "/" + args.yt_vid_id + '.applog')
            fh.setLevel(logging.DEBUG)
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            dbg_formatter = config.mylogger.MyFormatter()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            fh.setFormatter(dbg_formatter)
            ch.setFormatter(formatter)
            self.logger.addHandler(fh)
            self.logger.addHandler(ch)

    def __str__(self):
        return ("[" + self.videoid + "] " + self.videoinfo["channel"] + " - " + self.videoinfo["title"]
                + " - Running: " + str(self.running) + " Live: " + self.videoinfo["live"])

    def __repr__(self):
        return self.__str__()

    def get_video_info(self, video_ID: str):
        response = None
        try:
            response = self.api.get_video_metadata(video_id=video_ID, parser=None,
                                                   part=["liveStreamingDetails", "contentDetails", "snippet"])
            api_metadata = {"channel": response["snippet"]["channelTitle"],
                            "channelId": response["snippet"]["channelId"],
                            "id": video_ID,
                            "title": response["snippet"]["title"],
                            "live": response["snippet"]["liveBroadcastContent"],
                            "caught_while": response["snippet"]["liveBroadcastContent"],
                            "publishDateTime": datetime.strptime(response["snippet"]["publishedAt"] + " +0000",
                                                                 "%Y-%m-%dT%H:%M:%SZ %z").timestamp()}
            delta = isodate.parse_duration(response["contentDetails"]["duration"])
            api_metadata["length"] = delta.total_seconds()
            if 'liveStreamingDetails' in response.keys():
                api_metadata["liveStreamingDetails"] = {}
                for d in response["liveStreamingDetails"].keys():
                    if "Time" in d or "time" in d:
                        api_metadata["liveStreamingDetails"][d] = datetime.strptime(
                            response["liveStreamingDetails"][d] + " +0000",
                            "%Y-%m-%dT%H:%M:%SZ %z").timestamp()
            return api_metadata
        except Exception as e:
            print(self.videoid)
            print(e)
            print(response)
            return None

    async def async_get_video_info(self, video_ID: str):
        self.api_points_used += 1.0
        api_metadata = await self.loop.run_in_executor(self.t_pool, self.get_video_info, video_ID)
        return api_metadata

    def cancel(self):
        self.cancelled = True
        if self.running_chat:
            self.running_chat.terminate()

    async def update_psql_metadata(self):
        async with self.conn.transaction():
            await self.conn.execute(
                "UPDATE video SET caught_while = $2, live = $3, title = $4, "
                "retries_of_rerecording = $5, retries_of_rerecording_had_scs = $6 WHERE video_id = $1",
                self.videoid, self.videoinfo["caught_while"], self.videoinfo["live"],
                self.videoinfo["title"], self.videoinfo["retries_of_rerecording"],
                self.videoinfo["retries_of_rerecording_had_scs"])
            if "scheduledStartTime" in self.videoinfo["liveStreamingDetails"].keys():
                await self.conn.execute("UPDATE video SET scheduledstarttime = $2 WHERE video_id = $1",
                                        self.videoid,
                                        datetime.fromtimestamp(
                                            self.videoinfo["liveStreamingDetails"]["scheduledStartTime"],
                                            timezone.utc))
            if "actualStartTime" in self.videoinfo["liveStreamingDetails"].keys():
                await self.conn.execute("UPDATE video SET actualstarttime = $2 WHERE video_id = $1",
                                        self.videoid,
                                        datetime.fromtimestamp(
                                            self.videoinfo["liveStreamingDetails"]["actualStartTime"],
                                            timezone.utc))
            if "actualEndTime" in self.videoinfo["liveStreamingDetails"].keys():
                await self.conn.execute("UPDATE video SET actualendtime = $2 WHERE video_id = $1",
                                        self.videoid,
                                        datetime.fromtimestamp(
                                            self.videoinfo["liveStreamingDetails"]["actualEndTime"],
                                            timezone.utc))
            if "old_title" in self.videoinfo.keys():
                await self.conn.execute("UPDATE video SET old_title = $2 WHERE video_id = $1",
                                        self.videoid, self.videoinfo["old_title"])
            if "length" in self.videoinfo.keys():
                await self.conn.execute("UPDATE video SET length = $2 WHERE video_id = $1",
                                        self.videoid, self.videoinfo["length"])
            if "publishDateTime" in self.videoinfo.keys():
                await self.conn.execute("UPDATE video SET publishDateTime = $2 WHERE video_id = $1",
                                        self.videoid,
                                        datetime.fromtimestamp(self.videoinfo["publishDateTime"], timezone.utc))
            if "endedLogAt" in self.videoinfo.keys():
                await self.conn.execute("UPDATE video SET endedLogAt = $2 WHERE video_id = $1",
                                        self.videoid, self.ended_at)

    async def already_done(self, conn):
        row = await conn.fetchrow('SELECT retries_of_rerecording_had_scs, retries_of_rerecording '
                                  'FROM video WHERE video_id = $1', self.videoid)
        successful_sc_recordings = 0
        repeats = 0
        if row:
            successful_sc_recordings = row["retries_of_rerecording_had_scs"] if row["retries_of_rerecording_had_scs"] else 0
            repeats = row["retries_of_rerecording"] if row["retries_of_rerecording"] else 0
        test_file = pathlib.Path(self.sc_file)
        file_has_content = False
        if test_file.is_file():
            if test_file.stat().st_size > 2:
                file_has_content = True
        if successful_sc_recordings >= 2 and file_has_content:
            return True, test_file.stat().st_size, successful_sc_recordings, repeats
        else:
            return False, 0, successful_sc_recordings, repeats

    async def main(self):
        if not self.loop:
            self.loop = asyncio.get_running_loop()
        await self.log_output(self.videoinfo, 10)
        pgsql_config_file = open("postgres-config.json")
        pgsql_creds = json.load(pgsql_config_file)
        self.conn = await asyncpg.connect(user=pgsql_creds["username"], password=pgsql_creds["password"],
                                          host=pgsql_creds["host"], database=pgsql_creds["database"])
        old_meta_row = await self.conn.fetchrow(
            'SELECT c.name, channel_id, title, caught_while, live, old_title, length, createdDateTime, '
            'publishDateTime, startedLogAt, endedLogAt, scheduledStartTime, actualStartTime, actualEndTime, '
            'retries_of_rerecording, retries_of_rerecording_had_scs '
            'FROM video INNER JOIN channel c on channel_id = c.id WHERE video_id = $1', self.videoid)
        old_meta = dict(old_meta_row) if old_meta_row else None
        if old_meta:
            old_time_meta = {"scheduledStartTime": old_meta["scheduledstarttime"].timestamp() if old_meta["scheduledstarttime"] else 0,
                             "actualStartTime": old_meta["actualstarttime"].timestamp() if old_meta["actualstarttime"] else 0,
                             "actualEndTime": old_meta["actualendtime"].timestamp() if old_meta["actualendtime"] else 0}
            if self.videoinfo:
                for time in old_time_meta.keys():
                    if "liveStreamingDetails" in self.videoinfo.keys():
                        if time in self.videoinfo["liveStreamingDetails"].keys():
                            if not old_time_meta[time] and self.videoinfo["liveStreamingDetails"][time]:
                                old_time_meta[time] = self.videoinfo["liveStreamingDetails"][time]
            time_meta_keys = list(old_time_meta.keys())
            for timekey in time_meta_keys:
                if not old_time_meta[timekey]:
                    old_time_meta.pop(timekey)
            old_meta["liveStreamingDetails"] = old_time_meta
            if not self.videoinfo:
                self.videoinfo = copy.deepcopy(self.skeleton_dict)
                await self.log_output(self.videoinfo, 10)
            if self.videoinfo["title"] != old_meta["title"] and self.videoinfo["title"]:
                old_meta["old_title"] = old_meta["title"]
                old_meta["title"] = self.videoinfo["title"]
            old_meta_keys_l = [k.lower() for k in old_meta.keys()]
            old_meta_keys_n = [k for k in old_meta.keys()]
            old_meta_keys = dict(zip(old_meta_keys_l, old_meta_keys_n))
            # await self.log_output(old_meta_keys, 10)
            for info in self.skeleton_dict.keys():
                if info.lower() in old_meta_keys_l:
                    if type(old_meta[old_meta_keys[info.lower()]]) is datetime:
                        self.videoinfo[info] = old_meta[old_meta_keys[info.lower()]].timestamp()
                    elif old_meta[old_meta_keys[info.lower()]]:
                        self.videoinfo[info] = old_meta[old_meta_keys[info.lower()]]
                    elif old_meta[old_meta_keys[info.lower()]] is None and "time" in info.lower():
                        if info in self.videoinfo.keys():
                            await self.log_output((info, "key found", self.videoinfo[info], self.videoinfo.keys()))
                            self.videoinfo[info] = self.videoinfo[info] if self.videoinfo[info] else 0
                        else:
                            await self.log_output((info, "key not found", self.videoinfo[info], self.videoinfo.keys()))
                            self.videoinfo[info] = 0
                else:
                    await self.log_output("else case", 10)
            self.channel_id = old_meta["channel_id"]
            self.videoinfo["channel"] = old_meta["name"]
            self.videoinfo["channelId"] = self.channel_id
            self.videoinfo["id"] = self.videoid
            self.videoPostedAt = self.videoinfo['publishDateTime']
            self.metadata_list.append(self.videoinfo)
            self.ended_at = old_meta["endedlogat"] if old_meta["endedlogat"] else None
            self.videoinfo["endedLogAt"] = self.ended_at.timestamp() if self.ended_at else None
            if self.metadata:
                self.videoinfo["live"] = self.metadata["live"]
            await self.log_output(self.videoinfo)
        if not self.videoinfo:
            await self.conn.close()
            return
        self.insert_channels = await self.conn.prepare("INSERT INTO channel(id, name, tracked) VALUES ($1,$2,$3) "
                                                       "ON CONFLICT DO NOTHING")
        self.channel_name_history = await self.conn.prepare(
            "INSERT INTO chan_names(id, name, time_discovered, time_used) "
            "VALUES ($1,$2,$3,$4) ON CONFLICT (id,name) DO UPDATE SET time_used = $4")
        self.insert_messages = await self.conn.prepare(
            "INSERT INTO messages(video_id, chat_id, user_id, message_txt, "
            "time_sent, currency, value, color) "
            "VALUES ($1,$2,$3,$4,$5,$6,$7,$8) ON CONFLICT DO NOTHING")
        async with self.conn.transaction():
            if self.channel_id and self.videoinfo["channel"]:
                await self.conn.execute("INSERT INTO channel VALUES($1,$2,$3) ON CONFLICT (id) DO UPDATE SET tracked = $3",
                                        self.channel_id, self.videoinfo["channel"], True)
                await self.conn.execute("INSERT INTO chan_names VALUES($1,$2,$3) ON CONFLICT DO NOTHING",
                                        self.channel_id, self.videoinfo["channel"],
                                        datetime.now(tz=pytz.timezone('Europe/Berlin')))
        self.chat_err = True
        repeats = 0
        log_exist_test, filesize, db_retries_had_scs, repeats = await self.already_done(self.conn)
        self.videoinfo["retries_of_rerecording_had_scs"] = db_retries_had_scs
        self.videoinfo["retries_of_rerecording"] = repeats
        if log_exist_test:
            await self.log_output(self.videoinfo["channel"] + " - " + self.videoinfo["title"]
                                  + " already analyzed, skipping. Existing file size: " + str(filesize) + " bytes")
            return
        had_scs = db_retries_had_scs if db_retries_had_scs else 0
        self.msg_counter = 0
        islive = True
        while (repeats < self.max_retry_attempts and had_scs < self.min_successful_attempts
               and not self.cancelled and islive):
            self.msg_counter = 0
            self.chat_err = True
            if self.metadata:
                islive = self.metadata["live"] in ["upcoming", "live"]
            while self.chat_err and not self.cancelled:
                if "liveStreamingDetails" in self.videoinfo.keys() or self.videoinfo["live"] != "none" or repeats >= 1:
                    self.stats.clear()
                    self.chat_err = False
                    self.started_at = datetime.now(tz=pytz.timezone('Europe/Berlin'))
                    publishtime = datetime.fromtimestamp(self.videoPostedAt, timezone.utc)
                    async with self.conn.transaction():
                        await self.conn.execute(
                            "INSERT INTO video (video_id,channel_id,title,startedlogat,createddatetime) "
                            "VALUES($1,$2,$3,$4,$5) ON CONFLICT DO NOTHING",
                            self.videoid, self.videoinfo["channelId"], self.videoinfo["title"],
                            self.started_at, publishtime)
                    await self.update_psql_metadata()
                    await self.log_output("Started Analysis #" + str(repeats + 1) + " at: "
                                          + self.started_at.isoformat())
                    await self.log_output("of video " + publishtime.isoformat() + " " + self.videoinfo["channel"]
                                          + " - " + self.videoinfo["title"] + " [" + self.videoid + "]")
                    if repeats >= 1:
                        await self.log_output("Recording the YouTube-archived chat after livestream finished")
                    self.httpclient = httpx.AsyncClient(http2=True)
                    self.running_chat = LiveChatAsync(self.videoid, callback=self.display,
                                                      processor=(SuperChatLogProcessor(), SuperchatCalculator()),
                                                      logger=self.logger, client=self.httpclient,
                                                      exception_handler=self.exception_handling)
                    while self.running_chat.is_alive() and not self.cancelled:
                        await asyncio.sleep(3)
                    if (type(self.running_chat.exception) is exceptions.InvalidVideoIdException
                            or type(self.running_chat.exception) is exceptions.ChatParseException):
                        # InvalidVideoIdException: private, membership-only or deleted video - treat as cancelled
                        # ChatParseException: no chat found
                        self.cancelled = True
                    if repeats == 0 and not self.chat_err and not self.cancelled and islive:
                        self.ended_at = datetime.now(tz=pytz.timezone('Europe/Berlin'))
                        self.videoinfo["endedLogAt"] = self.ended_at.timestamp()
                    await self.httpclient.aclose()
                    # when livestream chat parsing ends, fetch fresh metadata
                    newmetadata = await self.async_get_video_info(self.videoid)
                    if newmetadata is not None:
                        if newmetadata["live"] in ["upcoming", "live"]:  # the livestream has not actually ended yet
                            await self.log_output(("Error! Chat monitor ended prematurely!",
                                                   self.running_chat.is_alive()))
                            self.chat_err = True
                        else:
                            islive = False
                    if self.videoinfo["caught_while"] in ["upcoming", "live"]:
                        # use the newer metadata while rescuing certain fields from the old metadata
                        createdDateTime = self.videoPostedAt
                        caught_while = self.videoinfo["caught_while"]
                        old_title = self.videoinfo["title"]
                        retries_w_scs = self.videoinfo["retries_of_rerecording_had_scs"]
                        retries_total = self.videoinfo["retries_of_rerecording"]
                        if newmetadata is not None:
                            self.videoinfo = newmetadata
                            self.videoinfo["endedLogAt"] = self.ended_at.timestamp() if self.ended_at else None
                            self.videoinfo["retries_of_rerecording_had_scs"] = retries_w_scs
                            self.videoinfo["retries_of_rerecording"] = retries_total
                            self.videoinfo["createdDateTime"] = createdDateTime
                            self.videoinfo["caught_while"] = caught_while
                            if self.videoinfo["title"] != old_title:
                                self.videoinfo["old_title"] = old_title
                        else:
                            await self.log_output(("couldn't retrieve new metadata for", self.videoid, old_title))
                    else:
                        islive = False
                    if self.msg_counter > 0 and not self.chat_err:
                        had_scs += 1
                        self.videoinfo["retries_of_rerecording_had_scs"] = had_scs
                    self.total_counted_msgs = 0
                    self.total_member_msgs = 0
                    self.total_new_members = 0
                    self.videoinfo["startedLogAt"] = self.started_at.timestamp()
                    self.videoinfo["retries_of_rerecording"] = repeats
                    await self.update_psql_metadata()
                    self.metadata_list.append(self.videoinfo)
                else:
                    await self.log_output(self.videoinfo["title"] + " is not a broadcast recording or premiere")
                    return
            repeats += 1
            await self.log_output((repeats, self.cancelled, had_scs, self.videoinfo["live"]))
            if repeats >= 1 and not self.cancelled and had_scs < 2 and islive:
                await self.log_output("Waiting " + str(self.minutes_wait) + " minutes before re-recording sc-logs")
                await asyncio.sleep(self.minutes_wait * 60)
        self.running = False
        await self.log_output("writing to files")
        proper_sc_list = []
        unique_currency_donors = {}
        count_scs = 0
        for msg in self.sc_msgs:
            msg_loaded = json.loads(msg)
            if msg_loaded["type"] not in ["newSponsor", "sponsorMessage"]:
                count_scs += 1
                donations = self.donors[msg_loaded["userid"]]["donations"].setdefault(msg_loaded["currency"], [0, 0])
                self.donors[msg_loaded["userid"]]["donations"][msg_loaded["currency"]][0] = donations[0] + 1  # number of donations
                self.donors[msg_loaded["userid"]]["donations"][msg_loaded["currency"]][1] = donations[1] + msg_loaded["value"]  # total amount of money donated
                self.unique_donors.setdefault(msg_loaded["currency"], set())
                self.unique_donors[msg_loaded["currency"]].add(msg_loaded["userid"])
            proper_sc_list.append(msg_loaded)
        for currency in self.unique_donors.keys():
            unique_currency_donors[currency] = len(self.unique_donors[currency])
        f = open(self.sc_file, "w")
        f_stats = open(self.stats_file, "w")
        f.write(json.dumps(proper_sc_list))
        await self.log_output((len(proper_sc_list), "unique messages written", count_scs, "are superchats"))
        f.close()
        self.stats.append(await self.loop.run_in_executor(self.t_pool, recount_money, proper_sc_list))
        f_stats.write(json.dumps([self.metadata_list[-1], self.stats[-1], unique_currency_donors]))
        f_stats.close()
        f_donors = open(self.donor_file, "w")
        f_donors.write(json.dumps(self.donors))
        f_donors.close()
        await self.conn.close()
        if self.cancelled:
            os.rename(f.name, f.name + ".cancelled")
            os.rename(f_stats.name, f_stats.name + ".cancelled")
            os.rename(f_donors.name, f_donors.name + ".cancelled")
        if not self.chat_err and self.gen_wc and len(self.sc_msgs) > 0 and repeats >= 1 and not self.cancelled:
            await self.loop.run_in_executor(self.t_pool, self.generate_wordcloud, proper_sc_list)

    async def display(self, data, amount):
        if len(data.items) > 0:
            start = datetime.now(timezone.utc)
            chatters = []
            channels = []
            messages = []
            for c in data.items:
                # data.items contains superchat messages - save them in a list
                # while also saving the calculated sums in a list
                if c.type == "placeholder":
                    self.placeholders += 1
                if c.type == "newSponsor":
                    sc_datetime = datetime.fromtimestamp(c.timestamp / 1000.0, timezone.utc)
                    sc_info = {"type": c.type, "id": c.id, "time": c.timestamp,
                               "userid": c.author.channelId, "member_level": c.member_level,
                               "debugtime": sc_datetime.isoformat()}
                    self.total_new_members += 1
                    self.sc_msgs.add(json.dumps(sc_info))
                if c.type in ["superChat", "superSticker", "sponsorMessage"]:
                    if c.currency in self.clean_currency.keys():
                        c.currency = self.clean_currency[c.currency]
                    sc_datetime = datetime.fromtimestamp(c.timestamp / 1000.0, timezone.utc)
                    name_used_datetime = start if self.videoinfo["live"] == "none" else sc_datetime
                    sc_weekday = sc_datetime.weekday()
                    sc_hour = sc_datetime.hour
                    sc_minute = sc_datetime.minute
                    sc_user = c.author.name
                    sc_userid = c.author.channelId
                    chat_id = c.id
                    chatters.append((sc_userid, sc_user, sc_datetime, name_used_datetime))
                    channels.append((sc_userid, sc_user, False))
                    if sc_userid not in self.donors.keys():
                        self.donors[sc_userid] = {"names": [sc_user], "donations": {}}
                    else:
                        if sc_user not in self.donors[sc_userid]["names"]:
                            self.donors[sc_userid]["names"].append(sc_user)
                    sc_message = c.message
                    sc_color = c.bgColor
                    sc_currency = c.currency.replace(u'\xa0', '')
                    sc_info = {"type": c.type, "id": chat_id, "time": c.timestamp,
                               "currency": sc_currency, "value": c.amountValue,
                               "weekday": sc_weekday, "hour": sc_hour, "minute": sc_minute,
                               "userid": sc_userid, "message": sc_message,
                               "color": sc_color, "debugtime": sc_datetime.isoformat()}
                    if c.type == "sponsorMessage":
                        self.total_member_msgs += 1
                        sc_info["member_level"] = c.member_level
                        # await self.log_output(sc_info)
                    else:
                        self.total_counted_msgs += 1
                        messages.append((self.videoid, chat_id, sc_userid, sc_message, sc_datetime,
                                         sc_currency, Decimal(c.amountValue), sc_color))
                    self.stats.append(amount)
                    self.sc_msgs.add(json.dumps(sc_info))
            self.msg_counter = amount["amount_sc"]
            async with self.conn.transaction():
                await self.insert_channels.executemany(channels)
                await self.channel_name_history.executemany(chatters)
                await self.insert_messages.executemany(messages)
            end = datetime.now(timezone.utc)
            await self.log_output(self.videoinfo["channel"] + " " + self.videoinfo["title"] + " "
                                  + data.items[-1].elapsedTime + " " + str(self.msg_counter) + "/"
                                  + str(self.total_counted_msgs) + " superchats, "
                                  + str(self.total_new_members) + " new members, "
                                  + str(self.total_member_msgs) + " member anniversary scs took "
                                  + str((end - start).total_seconds() * 1000) + " ms, placeholders: "
                                  + str(self.placeholders))

    def generate_wordcloud(self, log):
        wordcloudmake = superchat_wordcloud(log, logname=self.videoid)
        wordcloudmake.generate()

    async def log_output(self, logmsg, level=20):
        msg_string = ""
        if isinstance(logmsg, tuple):
            msg_len = len(logmsg)
            part_count = 0
            for msg_part in logmsg:
                part_count += 1
                msg_string += str(msg_part)
                if msg_len > part_count:
                    msg_string += " "
        elif isinstance(logmsg, str):
            msg_string = logmsg
        else:
            msg_string = str(logmsg)
        await self.loop.run_in_executor(self.t_pool, self.logger.log, level, msg_string)

    def exception_handling(self, loop, context):
        ex_time = datetime.now(timezone.utc)
        self.logger.log(40, "Exception caught")
        self.logger.log(40, context)
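# A driver sketch (an assumption, not from the source): record one stream,
# passing an explicit logger so the argparse-dependent default logging setup
# in __init__ is bypassed. Video id and key are placeholders.
async def run_one(vid_id, key):
    archiver = SuperchatArchiver(vid_id, key, logger=logging.getLogger("superchat-archiver"))
    try:
        await archiver.main()
    except asyncio.CancelledError:
        archiver.cancel()
        raise

if __name__ == "__main__":
    asyncio.run(run_one("VIDEO_ID", "YOUR_API_KEY"))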
from youtube_api import YouTubeDataAPI


def get_channel_name(video_id, api_key):
    yt = YouTubeDataAPI(api_key)
    return yt.get_video_metadata(video_id)["channel_title"]
def get_video_title(video_id, api_key):
    yt = YouTubeDataAPI(api_key)
    return yt.get_video_metadata(video_id)["video_title"]
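# Example use of the two helpers above (hypothetical video id and key):
if __name__ == "__main__":
    API_KEY = "YOUR_API_KEY"
    VIDEO_ID = "dQw4w9WgXcQ"  # any public video id
    print(get_channel_name(VIDEO_ID, API_KEY))
    print(get_video_title(VIDEO_ID, API_KEY))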
def convert_time(row):
    new_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(row['publish_date']))
    return new_time

videos_list = videos_list.reset_index()

# convert epoch to timestamp
videos_list['time_pub'] = videos_list.apply(convert_time, axis=1)

# extract video IDs to a list
videos = videos_list['video_id'].tolist()

# extract all the video metadata into a DataFrame
query = yt.get_video_metadata(videos)
vids = pd.DataFrame(query)

# sum columns to work out totals
likes = pd.to_numeric(vids['video_like_count']).sum()
dislikes = pd.to_numeric(vids['video_dislike_count']).sum()
comments = pd.to_numeric(vids['video_comment_count']).sum()

# apply those totals to columns in our original channel DataFrame
channel['likes'] = likes
channel['comments'] = comments
channel['dislikes'] = dislikes

# restrict the output to only useful columns
channel = channel[['description', 'video_count', 'view_count', 'subscription_count',
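# A vectorized alternative to the row-wise apply() used with convert_time
# above (a sketch): pandas can parse epoch seconds directly. Note this yields
# UTC, while time.localtime above uses the machine's local timezone.
videos_list['time_pub_utc'] = pd.to_datetime(videos_list['publish_date'], unit='s')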
parser=lambda x: x):  # tail of a truncated loop header over raw playlist items
    v["minerva_collected"] = time.time()
    out_file.write("%s\n" % json.dumps(v))
    out_file.flush()
    video_ids.append(v["snippet"]["resourceId"]["videoId"])
    video_count += 1

print("\t Captured playlist videos:", video_count)

# Now we test our cache to see which videos we've already pulled
video_ids_to_capture = []
for v in video_ids:
    this_v_file = os.path.join(target_path, "%s.json" % v)
    if not os.path.exists(this_v_file):
        video_ids_to_capture.append(v)

# For all videos in this playlist, download their metadata in batches of 50
# (the videos.list endpoint accepts at most 50 ids per call)
slice_size = 50
for i in range(0, len(video_ids_to_capture), slice_size):
    v_ids = video_ids_to_capture[i:i + slice_size]
    vid_data = yt.get_video_metadata(v_ids,
                                     part=['statistics', 'snippet', 'contentDetails'],
                                     parser=lambda x: x)
    for v in vid_data:
        v["minerva_collected"] = time.time()
        this_v_file = os.path.join(target_path, "%s.json" % v["id"])
        with open(this_v_file, "w") as out_file:
            out_file.write("%s" % json.dumps(v))
import pandas as pd
from youtube_api import YouTubeDataAPI
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
api_key = 'YOUR_API_KEY'  # replace with your own key; never hard-code a real key
yt = YouTubeDataAPI(api_key)

# video parts
video_parts = ['statistics', 'snippet', 'contentDetails', 'topicDetails']

# check the API key is valid
if yt.verify_key():
    sonic = yt.get_video_metadata('szby7ZHLnkA', parser=None, part=video_parts)
    sonic_comments = yt.get_video_comments('szby7ZHLnkA', max_results=100)
    df_comments = pd.DataFrame(sonic_comments)
    df_graph_data = pd.DataFrame(columns=[
        'comment_id', 'commenter_channel_id', 'channel_country', 'text',
        'date', 'neg', 'neu', 'pos', 'compound'
    ])
    for index, row in df_comments.iterrows():
        # look up the channel of the commenter for this row
        channel_id = row['commenter_channel_id']
        channel_data = yt.get_channel_metadata(channel_id)
        score = analyser.polarity_scores(row['text'])
        graph_row = {
            'comment_id': row['comment_id'],