def getNewsView(urlQueue):
    """Thread worker: drain article URLs from *urlQueue*, fetch each
    article's page-view count from the Storm pv API, and push one record
    per article onto the module-level ``viewQueue``.

    Loops until *urlQueue* is empty, then returns.

    NOTE(review): relies on module-level names ``viewQueue``, ``urlopen``,
    ``BeautifulSoup``, ``json``, ``datetime`` and ``lineNotify`` being in
    scope — confirm against the module's import block.
    """
    from queue import Empty  # local import: keeps the module import list untouched

    # News tag -> English category slug. Hoisted out of the loop: it is a
    # constant, so there is no need to rebuild it for every URL.
    tag_dict = {"財經": "finance", "房地產": "finance", "國內": "local",
                "國際": "international", "中港澳": "international",
                "政治": "politics", "公共政策": "politics", "公民運動": "politics",
                "風生活": "life", "風攝影": "life", "品味生活": "life",
                "運動": "sports", "評論": "forum", "軍事": "military",
                "科技": "technology", "藝文": "arts", "影音": "entertainment",
                "歷史": "history", "調查": "research"}
    while True:
        try:
            # Non-blocking read; an empty queue means the work is done.
            news_url = urlQueue.get_nowait()
        except Empty:
            # Was `except Exception: break`, which also hid real errors;
            # queue.Empty is the only condition that should end the loop.
            break
        # The article id is the last path segment of the URL.
        article_number = news_url.split("/")[-1]
        # Fetch the article page (needed only to read its tag link);
        # `with` closes the HTTP response instead of leaking it.
        with urlopen(news_url) as news_response:
            news_html = BeautifulSoup(news_response)
        # The pv API returns plain JSON — parse the response directly
        # rather than routing it through an HTML parser first.
        with urlopen("https://service-pvapi.storm.mg/pvapi/get_pv/" + article_number) as view_response:
            news_view = json.load(view_response)["total_count"]
        news_tag = news_html.find("a", class_="tags_link").text
        try:
            viewQueue.put({"id": "storm-" + tag_dict[news_tag] + "-" + article_number,
                           "news_link": news_url,
                           "view": news_view,
                           "time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M")})
        except KeyError as e:
            # Unknown tag: report it rather than crashing the worker thread.
            lineNotify("Got KeyError: " + str(e))
import os

from aws_linenotify import lineNotify

# Report disk usage via LINE Notify, if a usage snapshot file is present.
USAGE_FILE = "usage_output.txt"

if os.path.exists(USAGE_FILE):
    with open(USAGE_FILE, "r", encoding="utf-8") as f:
        fields = f.read().split(" ")
    # Field 9 of the snapshot is the usage percentage (format produced by
    # the job that writes usage_output.txt).
    usage = fields[8]
    lineNotify("Disk usage " + usage + " used!")
def getNewsContent(urlQueue):
    """Thread worker: drain article URLs from *urlQueue*, scrape each
    article's title, body, keywords, tag, publish time and view count,
    and push one record per article onto the module-level ``newsQueue``.

    Loops until *urlQueue* is empty, then returns.

    NOTE(review): relies on module-level names ``newsQueue``, ``urlopen``,
    ``BeautifulSoup``, ``json`` and ``lineNotify`` being in scope — confirm
    against the module's import block.
    """
    from queue import Empty  # local import: keeps the module import list untouched

    # News tag -> English category slug. Hoisted out of the loop: it is a
    # constant, so there is no need to rebuild it for every URL.
    tag_dict = {
        "財經": "finance", "房地產": "finance", "國內": "local",
        "國際": "international", "中港澳": "international",
        "政治": "politics", "公共政策": "politics", "公民運動": "politics",
        "風生活": "life", "風攝影": "life", "品味生活": "life",
        "運動": "sports", "評論": "forum", "軍事": "military",
        "科技": "technology", "藝文": "arts", "影音": "entertainment",
        "歷史": "history", "調查": "research"
    }
    while True:
        try:
            # Non-blocking read; an empty queue means the work is done.
            news_url = urlQueue.get_nowait()
        except Empty:
            # Was `except Exception: break`, which also hid real errors;
            # queue.Empty is the only condition that should end the loop.
            break
        # Fetch and parse the article page; `with` closes the HTTP
        # response instead of leaking it.
        with urlopen(news_url) as news_response:
            news_html = BeautifulSoup(news_response)
        news_tag = news_html.find("a", class_="tags_link").text
        news_title = news_html.find("h1", id="article_title").text
        # The article id is the last path segment of the URL.
        article_number = news_url.split("/")[-1]
        # The pv API returns plain JSON — parse the response directly
        # rather than routing it through an HTML parser first.
        with urlopen("https://service-pvapi.storm.mg/pvapi/get_pv/" + article_number) as view_response:
            news_view = json.load(view_response)["total_count"]
        news_create_time = news_html.find("span", class_="info_time").text
        # Body text: concatenate every paragraph inside the article container.
        article = news_html.find("div", class_="article_content_inner")
        news_content = "".join(p.text for p in article.find_all("p"))
        # Keyword list from the article's tag links.
        news_keyword = [word.text
                        for word in news_html.find_all("a", class_="tag tags_content")]
        try:
            newsQueue.put({
                "id": "storm-" + tag_dict[news_tag] + "-" + article_number,
                "news_link": news_url,
                "news_title": news_title,
                "news_create_time": news_create_time,
                "news_content": news_content,
                "news_keyword": news_keyword,
                "news_tag": news_tag,
                "news_view": [{
                    "view": news_view,
                    "time": news_create_time
                }]
            })
        except KeyError as e:
            # Unknown tag: report it rather than crashing the worker thread.
            lineNotify("Got KeyError: " + str(e))
"news_view": [{ "view": news_view, "time": news_create_time }] }) except KeyError as e: lineNotify("Got KeyError: " + str(e)) # 爲了突出效果,設定延時 #time.sleep(1) if __name__ == "__main__": now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') lineNotify("Content of news started updating " + now) # 開啟要爬的新聞網址檔案 while True: if os.path.exists("update_storm_news_url.txt"): with open("update_storm_news_url.txt", "r", encoding="utf-8") as f: url_list = f.read().split("\n") break else: time.sleep(120) if os.path.exists("update_for_view.txt"): view_update_url_list = url_list.copy() with open("update_for_view.txt", "r", encoding="utf-8") as f: old_view_list = f.read().split("\n") old_view_list.remove("")
viewQueue.put({"id": "storm-" + tag_dict[news_tag] + "-" + artical_number, "news_link": news_url, "view": news_view, "time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}) except KeyError as e: lineNotify("Got KeyError: " + str(e)) # 爲了突出效果,設定延時 #time.sleep(1) if __name__ == "__main__": now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') lineNotify("Views of news started updating " + now) # 開啟要爬的新聞網址檔案 while True: if os.path.exists("update_for_view.txt"): with open("update_for_view.txt", "r", encoding="utf-8") as f: url_list = f.read().split("\n") break else: time.sleep(120) # 紀錄爬蟲開始時間 start_time = time.time() for url in url_list: if url == "":
# 將依照發布日期分類的新聞內容存檔 with open("./newsfolder/" + date + "_storm_news.json", "w", encoding="utf-8") as f: json.dump(news_dict, f) # 紀錄存檔結束時間 end_time = time.time() print('Done, Time cost: %s ' % (end_time - start_time)) # 檢查用 # print(len(news_list)) # print(count) # 紀錄刪除檔案開始時間 start_time = time.time() # 使用系統指令刪除檔案 os.remove("update_storm_news_url.txt.bak") path = './tmpfolder/*' r = glob.glob(path) for i in r: os.remove(i) # 紀錄刪除檔案結束時間 end_time = time.time() print('Done, Time cost: %s ' % (end_time - start_time)) now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') lineNotify("News content updated successfully " + now)