def request_videoComment(key:str,videoID:str,channelTitle:str) -> pd.DataFrame:
    '''
    Download every comment-thread page of one video and save the result as CSV.

    Pages are requested from ``commentThreadsAPI`` until a response no longer
    carries a ``nextPageToken``. Each decoded page is handed to
    ``get_video_comment_list`` and the rows it returns are accumulated.

    Args:
        key: API key sent as the ``key`` query parameter.
        videoID: ID of the video whose comments are requested.
        channelTitle: Channel folder name used in the output CSV path.

    Returns:
        DataFrame built from the accumulated rows. When it has no
        ``publishedAt`` column (no comments were collected) it is returned
        as-is and no CSV is written; otherwise it is sorted by ``likeCount``
        in descending order and written to disk first.
    '''
    query = {
        'key': key,
        'part': 'snippet,replies',
        'videoId': videoID,
        'maxResults': 100,  # max
    }

    rows = []
    while True:
        response = requests.get(commentThreadsAPI, query)
        payload = json.loads(response.text)
        rows.extend(get_video_comment_list(payload))
        if payload.get('nextPageToken', -1) == -1:
            break
        # Advance to the next page.
        query['pageToken'] = payload['nextPageToken']

    comment_df = pd.DataFrame(rows)

    if 'publishedAt' in comment_df.columns:
        # Parse the timestamps, then drop the timezone information.
        published = pd.to_datetime(comment_df['publishedAt'])
        comment_df['publishedAt'] = published.dt.tz_localize(None)
        comment_df.sort_values('likeCount', ascending=False, inplace=True)
        target = '頻道列表/%s/影片留言/%s.csv' % (channelTitle, videoID)
        comment_df.to_csv(target, index=False, encoding='utf-8-sig')
    else:
        log.processLog(f"=== {videoID}:沒有任何留言 ===")

    return comment_df
def get_videoComment(key:str,titleList:list,startDate:str,endDate:str,force:bool)-> pd.DataFrame:
    '''
    Crawl the comments of every video in a date range, channel by channel.

    For each channel title the function reads
    ``頻道列表/<title>/<title>_影片列表.csv``, keeps the rows whose
    ``publishedAt`` lies between ``startDate`` and the day after ``endDate``,
    and calls ``request_videoComment`` for each remaining video.

    When ``log/stopRecord.log`` exists and ``force`` is false, the crawl
    resumes from the last record line found in that file: channels before
    the recorded one are skipped, and inside a channel that contains the
    recorded video ID the crawl restarts with the video AFTER it. If the
    file holds no record line, a full crawl is performed.

    Args:
        key: API key forwarded to ``request_videoComment``.
        titleList: Channel titles to process, in order.
        startDate: Inclusive lower bound, compared as a string against the
            ``publishedAt`` column.
        endDate: Last day to include, formatted ``YYYY-MM-DD``.
        force: When true, ignore any stop record and start from scratch.

    Returns:
        The DataFrame returned for the last crawled video, or an empty
        DataFrame when no video matched. ``None`` when an error was caught
        and logged.

    Raises:
        BaseException: Non-``Exception`` interruptions such as
            ``KeyboardInterrupt`` are re-raised after progress is recorded.
    '''
    log.processLog('========================================================')
    log.processLog('========================================================')
    log.processLog('開始執行get_videoComment()')
    dd = {'Auth_key':key,'查詢頻道':titleList,'查詢開始日期':startDate,'查詢終止日期':endDate,'強制重來':force}
    log.processLog(f" 本次查詢參數:{dd}")
    log.processLog(f" Step1: 判斷前次是否有遇到流量限制的問題:{os.path.exists('log/stopRecord.log')}")
    log.processLog(f" Step2: 是否從前次停止的地方開始:{force}")
    titleList = list(titleList)

    # Bound up front so the error handler can always report them, even when
    # the failure happens before the first channel or video is reached.
    channelTitle = None
    videoId = None
    comment_df = pd.DataFrame()

    try:
        # Upper bound of the date filter: the day after endDate, as
        # 'YYYY-MM-DD'. Loop-invariant, so it is computed once.
        endBound = str(datetime.strptime(endDate,'%Y-%m-%d')+timedelta(days=1))[:10]

        # Load the stop record, if any; the last matching line wins.
        s = None
        if os.path.exists('log/stopRecord.log') and not force:
            with open('log/stopRecord.log','r',encoding='utf-8') as f:
                for readline in f:
                    if '頻道名稱' in readline:
                        s = json.loads(readline)

        if s is not None:
            log.processLog(f"Step3:取得前次進度儲存進度 {s}")
            log.processLog(f"Step4:進入VideoID比對程序")
            titles = titleList[titleList.index(s['頻道名稱']):]
            prefix = ''
        else:
            titles = titleList
            prefix = ' '

        for title in titles:
            df = pd.read_csv('頻道列表/'+title+'/'+title+'_影片列表.csv')
            channelTitle = df['channelTitle'].values[0]
            data = df[(df.publishedAt >= startDate) & (df.publishedAt <= endBound)]

            # Position of the first video to crawl in this channel. It only
            # moves past 0 when the recorded video ID occurs exactly once
            # here; otherwise the whole channel is crawled, including its
            # first video.
            firstIndex = 0
            if s is not None:
                videoIds = data['videoId'].values.tolist()
                if videoIds.count(s['影片ID']) == 1:
                    firstIndex = videoIds.index(s['影片ID']) + 1

            log.processLog(f'{prefix}[{channelTitle}] 頻道影片總數: {len(df)}')
            log.processLog(f'{prefix}[{channelTitle}] 指定時間內影片總數: {len(data)}')
            log.processLog(f'{prefix}[{channelTitle}] 本次需爬影片數: {len(data)-firstIndex}')
            log.processLog(f'{prefix}[{channelTitle}] 開始爬取影片留言')

            for index in trange(firstIndex, len(data)):
                videoId = data.iloc[index,:]['videoId']
                comment_df = request_videoComment(key,videoId,channelTitle)
                log.processLog(f'{prefix}[{channelTitle}] 第{index}支影片留言:{videoId} Done')
            log.processLog('--------------------------------------------------------')
        return comment_df
    except BaseException as exc:
        log.processLog(f'[{channelTitle}] 執行 get_videoComment() 發生錯誤,請查看錯誤LOG檔')
        log.errorLog('====================================================')
        log.errorLog(f'[{channelTitle}] 錯誤函式:get_videoComment()')
        log.errorLog(f'【錯誤影片ID】{videoId}')
        log.errorLog('【錯誤訊息】')
        log.errorLog(traceback.format_exc())
        log.errorLog('--------------------------------------------------------')
        # A stop record is only meaningful once a video has been reached.
        if channelTitle is not None and videoId is not None:
            stopRecord(channelTitle,videoId,startDate,endDate)
        traceback.print_exc()
        # Progress is recorded for interruptions too, but KeyboardInterrupt
        # and SystemExit must still reach the caller.
        if not isinstance(exc, Exception):
            raise
def requset_playlistItems(chaneel_name:str,channel_uploadId:str,key:str) -> pd.DataFrame:
    '''
    Use a playlist ID to fetch all videos under that playlist.

    The playlist is paged through ``playlistItemsAPI`` 50 items at a time and
    each decoded page is converted to rows by ``get_channel_video_lists``.
    Statistics are then requested in batches of 40 video IDs through
    ``request_videoStatistic`` and stored as the ``viewCount``,
    ``likeCount``, ``dislikeCount`` and ``commentCount`` columns (``-1`` for
    a missing value). The table is written to
    ``頻道列表/<channelTitle>/<channelTitle>_影片列表.csv``.

    Args:
        chaneel_name: Channel name, used only in log messages.
        channel_uploadId: Playlist ID sent as the ``playlistId`` parameter.
        key: API key.

    Returns:
        The video DataFrame, or ``None`` when an error was caught and logged.
    '''
    log.processLog('========================================================')
    log.processLog(f'[{chaneel_name}] 開始爬取所有影片')
    log.processLog(f'[{chaneel_name}] 執行requset_playlistItems()')
    params = {
        'key': key,  # your API key
        'part': 'snippet',
        'playlistId': channel_uploadId,
        'maxResults': 50,  # max
    }
    log.processLog(f'[{chaneel_name}] 本次初始參數{json.dumps(params)}')
    try:
        resp = requests.get(playlistItemsAPI, params)
        resp_json = json.loads(resp.text)
        totalResults = resp_json['pageInfo']['totalResults']
        log.processLog(f'[{chaneel_name}] 共有{totalResults}支影片')
        res = get_channel_video_lists(resp_json)
        log.processLog(f'[{chaneel_name}] 開始爬取影片基本資訊')
        log.processLog(f'-----> 第1~{len(res)}支影片 Done <-----')

        if totalResults > 50:
            nextPageToken = resp_json['nextPageToken']
            for _ in trange(totalResults//50):
                lenBefore = len(res)
                params['pageToken'] = nextPageToken
                resp = requests.get(playlistItemsAPI, params)
                resp_json = json.loads(resp.text)
                res.extend(get_channel_video_lists(resp_json))
                log.processLog(f'-----> 第{lenBefore+1}~{len(res)}支影片 Done <-----')
                nextPageToken = resp_json.get('nextPageToken')
                if nextPageToken is None:
                    # No next page.
                    break

        video_df = pd.DataFrame(res)
        # Parse the timestamps, then remove the timezone.
        video_df['publishedAt'] = pd.to_datetime(video_df['publishedAt'])
        video_df['publishedAt'] = video_df['publishedAt'].dt.tz_localize(None)

        # Get video statistics.
        log.processLog(f'[{chaneel_name}] 開始爬取影片統計資訊')
        viewCount,likeCount,dislikeCount,commentCount = [],[],[],[]
        batchSize = 40
        videoIds = video_df['videoId'].values
        # Stepping over the IDs never yields an empty batch, so no request is
        # sent with an empty ID string when the count is a multiple of 40.
        for start in trange(0, len(videoIds), batchSize):
            string = ','.join(videoIds[start:start+batchSize])
            resp_statistic_json = request_videoStatistic(key,string)
            lenViewBefore = len(viewCount)
            for item in resp_statistic_json:
                statistics = item['statistics']
                viewCount.append(statistics.get('viewCount',-1))
                likeCount.append(statistics.get('likeCount',-1))
                dislikeCount.append(statistics.get('dislikeCount',-1))
                commentCount.append(statistics.get('commentCount',-1))
            log.processLog(f'-----> 第{lenViewBefore+1}~{len(viewCount)}支影片 Done <-----')

        # NOTE(review): this assumes one statistics item per requested ID, in
        # request order; a shorter reply makes the assignments below raise.
        video_df['viewCount'] = viewCount
        video_df['likeCount'] = likeCount
        video_df['dislikeCount'] = dislikeCount
        video_df['commentCount'] = commentCount
        video_df.to_csv('頻道列表/%s/%s_影片列表.csv' % (video_df.channelTitle[0],video_df.channelTitle[0]) ,index=False,encoding='utf-8-sig')
        log.processLog('--------------------------------------------------------')
        return video_df
    except Exception:
        log.processLog(f'[{chaneel_name}] 執行 requset_playlistItems() 發生錯誤,請查看錯誤LOG檔')
        log.errorLog('====================================================')
        log.errorLog(f'[{chaneel_name}] 錯誤函式:requset_playlistItems()')
        log.errorLog('【錯誤參數】')
        # Write the parameters themselves to the error log; the process log
        # still receives them as before.
        paramsDump = json.dumps(params,indent=4)
        log.processLog(paramsDump)
        log.errorLog(paramsDump)
        log.errorLog('【錯誤訊息】')
        log.errorLog(traceback.format_exc())
        log.errorLog('----------------------------------------------------')
        traceback.print_exc()