# Shared imports for the crawler functions below. crawler_tool is the
# project-local helper module providing url_retry / url_retry_json / clean_html.
import datetime
import json
import random
import re
import time

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

import crawler_tool


def chinatime_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    # print(b_time, "\n", e_time)
    loop_flag = False
    for page in range(1, 11):
        chinatime_home = "https://www.chinatimes.com/money/total?page=" + str(page) + "&chdtv"
        try:
            print("start collecting ChinaTime page..%s" % page)
            r = crawler_tool.url_retry(chinatime_home)
            soup = BeautifulSoup(r, "lxml")
            time.sleep(5)
            for i in range(len(soup.select("h3.title a"))):
                chinatime_financial_url = "https://www.chinatimes.com" + soup.select("h3.title a")[i]["href"]
                try:
                    r2 = crawler_tool.url_retry(chinatime_financial_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    r_time = datetime.datetime.strptime(
                        soup2.find("meta", attrs={"name": "pubdate"})["content"],
                        "%Y-%m-%dT%H:%M:%S+08:00")
                    if r_time > b_time:
                        continue
                    if r_time < e_time:
                        print("Web Crawler has collected ChinaTime data from {b_time} to {e_time}".format(
                            b_time=b_time, e_time=e_time))
                        loop_flag = True
                        break
                    else:
                        Publish_time.append(r_time)
                        Title.append(re.sub(r"\s+", "", soup2.find("h1").string))
                        Section.append(soup2.find("meta", attrs={"name": "section"})["content"])
                        Source.append(soup2.find("meta", attrs={"name": "source"})["content"])
                        body = soup2.select("div.article-body p")
                        Body.append(crawler_tool.clean_html("".join(str(x) for x in body)))
                        time.sleep(random.uniform(0, 2))
                        print("ChinaTime:", r_time)
                except rq.exceptions.RequestException as e:
                    print("in", e)
        except rq.exceptions.RequestException as e:
            print("home", e)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/chinatime/" + decide_time_begin + "_" + decide_time_end + "_chinatime.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
def setn_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    # print(b_time, "\n", e_time)
    loop_flag = False
    for page in range(1, 19):
        print("start collecting Setn page {page}".format(page=page))
        home_url = "https://www.setn.com/ViewAll.aspx?PageGroupID=2&p=" + str(page)
        time.sleep(5)
        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")
            for i in range(len(soup.select("h3.view-li-title a"))):
                content_url = "https://www.setn.com/" + soup.select("h3.view-li-title a")[i]["href"]
                r2 = crawler_tool.url_retry(content_url)
                soup2 = BeautifulSoup(r2, "lxml")
                r_time = datetime.datetime.strptime(
                    soup2.find("meta", attrs={"name": "pubdate"})["content"],
                    "%Y-%m-%dT%H:%M:%S")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("Web Crawler has collected Setn data from {b_time} to {e_time}".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    og_title = soup2.find("meta", attrs={"property": "og:title"})["content"]
                    Section.append(og_title.split("|")[1])
                    Title.append(re.sub(r"\s{1,}", "", og_title.split("|")[0]))
                    Source.append(og_title.split("|")[2])
                    Publish_time.append(r_time)
                    Body.append(crawler_tool.clean_html("".join(
                        str(x) for x in soup2.select("div#Content1 p"))))
                    print("Setn:", r_time)
                    time.sleep(random.uniform(0, 2))
        except rq.exceptions.RequestException as e2:
            print("home", e2)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    })
    file_name = "D:/User/Desktop/corpus/news/setn/" + decide_time_begin + "_" + decide_time_end + "_setn.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
def moneyDJ_GET_NEWS_time(decide_time_begin, decide_time_end):
    title = []
    publish_time = []
    body = []
    section = []
    source = []
    loop_flag = False
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    for page in range(1, 50):
        home_url = ("https://www.moneydj.com/KMDJ/News/NewsRealList.aspx?index1="
                    + str(page) + "&a=MB06")
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")
        print("start collecting moneyDJ page..%s" % page)
        news_links = soup.find("table", attrs={"class": "forumgrid"}).find_all("a")
        for i in range(len(news_links)):
            url = "https://www.moneydj.com" + news_links[i]["href"]
            r2 = crawler_tool.url_retry(url)
            soup2 = BeautifulSoup(r2, "lxml")
            maindata = soup2.select("article#MainContent_Contents_mainArticle")[0]
            r_time = datetime.datetime.strptime(
                soup2.find("span", attrs={"id": "MainContent_Contents_lbDate"}).text,
                "%Y/%m/%d %H:%M")
            if r_time > b_time:
                continue
            elif r_time < e_time:
                print("Web Crawler has collected moneyDJ from {b_time} to {e_time}".format(
                    b_time=b_time, e_time=e_time))
                loop_flag = True
                break
            else:
                title_temp = re.sub(r"\s+", "", soup2.select("h1 span")[0].string)
                body_temp = re.sub(r"\s+", "", crawler_tool.clean_html(str(maindata)))
                # If the body is mostly digits/symbols (e.g. quote tables), keep
                # only the title as the body text.
                if len(re.sub(r"[0-9.]", "", body_temp)) / len(body_temp) < 0.5:
                    title.append(title_temp)
                    body.append(title_temp)
                elif len(title_temp) + 100 > len(
                        re.sub(r"[a-zA-Z0-9/,=?:;.{}()#%'&-]", "", body_temp)):
                    title.append(title_temp)
                    body.append(title_temp)
                else:
                    title.append(title_temp)
                    body.append(body_temp)
                publish_time.append(r_time)
                section.append("台股")
                source.append("moneyDJ")
                print("moneyDJ:", r_time)
                time.sleep(random.uniform(0.5, 1.5))
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_moneyDJ.csv"
    df.to_csv(file_name, encoding="utf-8")
def moneyudn_GET_NEWS_time(decide_time_begin, decide_time_end):
    begin_time = datetime.datetime.today()
    title = []
    publish_time = []
    section = []
    body = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    loop_flag = False
    for page in range(1, 100):
        print("start collecting page {page}".format(page=page))
        home_url = "https://money.udn.com/rank/newest/1001/0/" + str(page)
        time.sleep(3)
        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")
            for i in range(len(soup.select("td a"))):
                url2 = soup.select("td a")[i]["href"]
                html_page = crawler_tool.url_retry(url2)
                soup2 = BeautifulSoup(html_page, "lxml")
                r_time = datetime.datetime.strptime(
                    soup2.find("meta", attrs={"name": "date"})["content"],
                    "%Y/%m/%d %H:%M:%S")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("Web Crawler has collected money_udn data from {b_time} to {e_time}".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    sub_section = soup2.select("div#nav a")[-1].string
                    # Skip lifestyle, members-only, and cross-strait sections.
                    if sub_section in ("品味", "會員專區", "兩岸"):
                        time.sleep(random.randint(1, 5))
                        continue
                    else:
                        print(r_time)
                        body.append(crawler_tool.clean_html("".join(
                            str(x) for x in soup2.select("div#article_body p"))))
                        publish_time.append(r_time)
                        title.append(re.sub(
                            r"\s+", "",
                            soup2.find("meta", attrs={"property": "og:title"})["content"].split("|")[0]))
                        section.append(sub_section)
                        source.append(soup2.select("div#nav a")[0].string)
                        time.sleep(0.2)
        except rq.exceptions.RequestException as e2:
            print("home", e2)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_moneyudn.csv"
    df.to_csv(file_name, encoding="utf-8")
    print("processing time:", datetime.datetime.today() - begin_time)
    return df
def anue_GET_NEWS_time(decide_time_begin, decide_time_end):
    # Build the Unix-timestamp window for the Anue news-list API.
    # 1611763199 corresponds to 2021-01-27 23:59:59 (UTC+8, the "1/27" offset
    # in the original note); dta is the end of today in seconds, dtb is eleven
    # days earlier.
    dt = datetime.datetime.today() - datetime.datetime.fromtimestamp(1611763199)
    dta = (dt.days + 1) * 86400 + 1611763199
    dtb = str(dta - 11 * 86400 + 1)
    dta = str(dta)
    title = []
    publish_time = []
    section = []
    body = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    loop_flag = False
    for page in range(1, 30):
        print("start collecting Anue page {page}".format(page=page))
        home_url = ("https://api.cnyes.com/media/api/v1/newslist/category/headline"
                    "?limit=30&startAt=" + dtb + "&endAt=" + dta + "&page=" + str(page))
        r = crawler_tool.url_retry_json(home_url)
        time.sleep(5)
        for i in range(len(r["items"]["data"])):
            content_url = "https://news.cnyes.com/news/id/" + str(r["items"]["data"][i]["newsId"])
            r2 = crawler_tool.url_retry(content_url)
            soup = BeautifulSoup(r2, "lxml")
            try:
                r_time = datetime.datetime.strptime(soup.find("time").string, "%Y/%m/%d %H:%M")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("---Web Crawler has collected Anue data from {b_time} to {e_time}---".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    og_title = soup.find("meta", attrs={"property": "og:title"})["content"]
                    section.append(og_title.split("|")[-1].split("-")[-1])
                    title.append(re.sub(r"\s{1,}", "", og_title.split("|")[0]))
                    source.append(og_title.split("|")[-1].split("-")[0])
                    publish_time.append(r_time)
                    body.append(crawler_tool.clean_html("".join(
                        str(x) for x in soup.select("div._2E8y p"))))
                    print("Anue:", r_time)
                    time.sleep(random.uniform(0, 1.5))
            except Exception:
                # Some pages lack the expected <time> tag or meta fields; skip them.
                pass
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_Anue.csv"
    df.to_csv(file_name, encoding="utf-8")
def ctee_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    Title = []
    Publish_time = []
    Section = []
    Body = []
    Source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    loop_flag = False
    for page in range(1, 11):
        print("start ctee collecting page {page}".format(page=page))
        home_url = "https://m.ctee.com.tw/livenews/all/page/" + str(page)
        time.sleep(5)
        try:
            r = crawler_tool.url_retry(home_url)
            soup = BeautifulSoup(r, "lxml")
            for i in range(len(soup.select("p.now-title"))):
                content_url = soup.select("p.now-title")[i].find_all("a")[-1]["href"]
                section = soup.select("p.now-title")[i].find("span").string
                # The list page only shows month/day and time, so prepend the year
                # of the requested begin time before parsing.
                r_time = datetime.datetime.strptime(
                    str(b_time.year) + "/" + crawler_tool.clean_html(
                        str(soup.select("p.now-title")[i].find_all("a")[1]).split("|")[-1]),
                    "%Y/ %m/%d %H:%M ")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("collected ctee news from {b_time} to {e_time}".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    r2 = crawler_tool.url_retry(content_url)
                    soup2 = BeautifulSoup(r2, "lxml")
                    # Skip lifestyle and politics sections.
                    if section == "生活" or section == "政治":
                        time.sleep(random.uniform(0, 1.5))
                        continue
                    else:
                        Title.append(soup2.select("span.post-title")[0].string)
                        Section.append(section)
                        Source.append("工商時報")
                        Publish_time.append(r_time)
                        Body.append(crawler_tool.clean_html("".join(
                            str(x) for x in soup2.select("div.entry-content p"))))
                        print("ctee:", r_time)
                        time.sleep(random.uniform(0, 1.5))
        except rq.exceptions.RequestException as e2:
            print("home", e2)
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": Title,
        "Time": Publish_time,
        "Section": Section,
        "Source": Source,
        "Body": Body
    }).sort_values(by=["Time"])
    file_name = "D:/User/Desktop/corpus/news/ctee/" + decide_time_begin + "_" + decide_time_end + "_ctee.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
def cna_GET_NEWS_time(decide_time_begin, decide_time_end):
    title = []
    publish_time = []
    body = []
    section = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    for category in ["aie", "asc"]:
        loop_flag = False
        for pageidx in range(1, 6):
            resp = rq.post(
                "https://www.cna.com.tw/cna2018api/api/WNewsList", {
                    "action": "0",
                    "category": category,
                    "pageidx": pageidx,
                    "pagesize": "20"
                })
            j = json.loads(resp.content)
            for i in range(len(j["ResultData"]["Items"])):
                r_time = datetime.datetime.strptime(
                    j["ResultData"]["Items"][i]["CreateTime"], "%Y/%m/%d %H:%M")
                if r_time > b_time:
                    continue
                elif r_time < e_time:
                    loop_flag = True
                    print("Web Crawler has collected 中央通訊社 from {b_time} to {e_time}".format(
                        b_time=b_time, e_time=e_time))
                    break
                else:
                    url = j["ResultData"]["Items"][i]["PageUrl"]
                    section.append(j["ResultData"]["Items"][i]["ClassName"])
                    title.append(re.sub(r"\s+", "", j["ResultData"]["Items"][i]["HeadLine"]))
                    publish_time.append(r_time)
                    soup = BeautifulSoup(rq.get(url).text, "lxml")
                    source.append("中央通訊社")
                    body.append("".join(
                        crawler_tool.clean_html(str(x)) for x in soup.select("div.paragraph p")))
                    print("中央通訊社:", category, r_time)
                    time.sleep(random.uniform(0.5, 1.5))
            if loop_flag:
                break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/temporarily/" + decide_time_begin + "_" + decide_time_end + "_cna.csv"
    df.to_csv(file_name, encoding="utf-8")
def rti_GET_NEWS_time_threading(decide_time_begin, decide_time_end, q):
    title = []
    publish_time = []
    section = []
    body = []
    source = []
    b_time = datetime.datetime.strptime(decide_time_begin, "%Y%m%d%H%M")
    e_time = datetime.datetime.strptime(decide_time_end, "%Y%m%d%H%M")
    # print(b_time, "\n", e_time)
    loop_flag = False
    for page in range(1, 100):
        print("start collecting Rti page {page}".format(page=page))
        home_url = "https://www.rti.org.tw/news/list/categoryId/2/page/" + str(page)
        r = crawler_tool.url_retry(home_url)
        soup = BeautifulSoup(r, "lxml")
        time.sleep(5)
        for i in range(len(soup.select("div.main_wrapper ul a"))):
            content_url = "https://www.rti.org.tw" + soup.select("div.main_wrapper ul a")[i]["href"]
            r2 = crawler_tool.url_retry(content_url)
            soup2 = BeautifulSoup(r2, "lxml")
            r_time = datetime.datetime.strptime(
                re.sub("[^0-9]", "", soup2.find("li", attrs={"class": "date"}).string),
                "%Y%m%d%H%M")
            if r_time > b_time:
                continue
            elif r_time < e_time:
                loop_flag = True
                print("Web Crawler has collected Rti data from {b_time} to {e_time}".format(
                    b_time=b_time, e_time=e_time))
                break
            else:
                section.append("財經")
                title.append(re.sub(r"\s{1,}", "", soup2.find("title").string.split("-")[0]))
                source.append(soup2.find("title").string.split("-")[-1])
                publish_time.append(r_time)
                body.append(crawler_tool.clean_html("".join(
                    str(x) for x in soup2.select("article p"))))
                print("Rti:", r_time)
                time.sleep(random.uniform(0, 2))
        if loop_flag:
            break
    df = pd.DataFrame({
        "Title": title,
        "Time": publish_time,
        "Section": section,
        "Source": source,
        "Body": body
    })
    file_name = "D:/User/Desktop/corpus/news/rti/" + decide_time_begin + "_" + decide_time_end + "_rti.csv"
    df.to_csv(file_name, encoding="utf-8")
    q.put(df)
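# Usage sketch (an assumption, not part of the original module): the
# *_threading crawlers above each take a queue.Queue and call q.put(df),
# so they can be fanned out with threading.Thread and the resulting
# DataFrames collected afterwards. The time window below is hypothetical,
# and the hard-coded output directories above must already exist.
if __name__ == "__main__":
    import queue
    import threading

    begin, end = "202104200830", "202104210830"  # hypothetical window
    result_q = queue.Queue()
    crawlers = [
        chinatime_GET_NEWS_time_threading,
        setn_GET_NEWS_time_threading,
        ctee_GET_NEWS_time_threading,
        rti_GET_NEWS_time_threading,
    ]
    threads = [
        threading.Thread(target=f, args=(begin, end, result_q))
        for f in crawlers
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

    # Each thread pushed one DataFrame onto the queue; combine and sort them.
    frames = [result_q.get() for _ in range(len(crawlers))]
    combined = pd.concat(frames).sort_values(by=["Time"])
    print(combined.shape)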