def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "%20".join(keywords) url = f"https://api.foxnews.com/search/web?q={keywords_str}+-filetype:amp+-filetype:xml+more:pagemap:metatags-prism.section+more:pagemap:metatags-pagetype:article+more:pagemap:metatags-dc.type:Text.Article&siteSearch=foxnews.com&siteSearchFilter=i&sort=date:r:{start_time}:{end_time}" r = requests.get(url=url, headers=get_header()) data = json.loads(r.text) total_results = int(data["searchInformation"]["totalResults"]) item_set = set() for start_index in range(1, total_results + 1, 10): time.sleep(2) url = f"https://api.foxnews.com/search/web?q={keywords}+-filetype:amp+-filetype:xml+more:pagemap:metatags-prism.section+more:pagemap:metatags-pagetype:article+more:pagemap:metatags-dc.type:Text.Article&siteSearch=foxnews.com&siteSearchFilter=i&sort=date:r:{start_time}:{end_time}&start={start_index}" try: r = requests.get(url=url, headers=get_header()) data = json.loads(r.text) # 每次查询总数会波动 total_results = int(data["searchInformation"]["totalResults"]) if start_index > total_results: break for j in data["items"]: item = j["pagemap"]["metatags"][0] article = entity.Article() article.title = item["dc.title"] article.title_cn = utils.translate_with_webdriver( article.title) article.date = item["dc.date"] article.url = item["og:url"] item_set.add(article) except Exception as exc: continue global TOTALS TOTALS += len(item_set) # 解析链接对应正文 for item in item_set: try: time.sleep(1) art = ns.Article(item.url, headers=get_header(), language='en') art.download() art.parse() item.text = art.text if art.text.strip() == "": title, publish_date, content = utils.get_title_time_content( item.url, header=get_header()) item.text = content item.text_cn = utils.translate_with_webdriver(item.text) except Exception as exc: continue finally: utils.write_xlsx_apend(file_path, [ item, ])
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "+".join(keywords) start_date = start_time[4:6] + "%2F" + start_time[ 6:8] + "%2F" + start_time[0:4] end_date = end_time[4:6] + "%2F" + end_time[6:8] + "%2F" + end_time[0:4] item_set = set() for page in range(1, 11): url = f"https://www.politico.com/search/{page}?adv=true&userInitiated=true&s=newest&q={keywords_str}&start={start_date}&end={end_date}" try: soup = driver_url(url) search_result = soup.find_all("article", class_=re.compile("story-frag")) if search_result and len(search_result) > 0: for li in search_result: article = entity.Article() a = li.find_next("header").find_next("a") article.url = a.get("href") article.title = a.string article.title_cn = utils.translate_with_webdriver( article.title) article.date = li.find_next("time").get("datetime").split( "T")[0].replace("-", "") # 解析正文 try: art = ns.Article(article.url, headers=get_header(), language='en') art.download() art.parse() article.text = art.text if art.text.strip() == "": title, publish_date, content = utils.get_title_time_content( article.url, header=get_header()) article.text = content article.text_cn = utils.translate_with_webdriver( article.text) except Exception as exc: pass time.sleep(1) item_set.add(article) else: return except: pass try: global TOTALS # 每获取一部分数据写入,避免单次过大 TOTALS += len(item_set) utils.write_xlsx_apend(file_path, item_set) item_set.clear() except: pass
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "%20".join(keywords) item_set = set() url = f"https://search.api.cnn.io/content?size=1&q={keywords_str}" r = requests.get(url=url, headers=get_header()) data = json.loads(r.text) total_results = data["meta"]["of"] for start_index in range(1, total_results + 1, 20): time.sleep(1) try: url = f"https://search.api.cnn.io/content?size=20&q={keywords_str}&from={start_index}" r = requests.get(url=url, headers=get_header()) data = json.loads(r.text) for result in data["result"]: date_str = result["lastPublishDate"] date = int(date_str[0:4] + date_str[5:7] + date_str[8:10]) if int(start_time) <= date <= int(end_time): article = entity.Article() article.url = result["url"] article.title = result["headline"] article.title_cn = utils.translate_with_webdriver(article.title) article.date = date article.text = result["body"] time.sleep(1) article.text_cn = utils.translate_with_webdriver(article.text) item_set.add(article) # 时间逆序,小于开始时间则后续文章都不满足时间要求 if date < int(start_time): return item_set except Exception as exc: continue finally: global TOTALS TOTALS += len(item_set) utils.write_xlsx_apend(file_path, item_set) item_set.clear()
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "%20".join(keywords) start_date = start_time[0: 4] + "%2F" + start_time[4: 6] + "%2F" + start_time[6: 8] end_date = end_time[0: 4] + "%2F" + end_time[4: 6] + "%2F" + end_time[6: 8] item_set = set() # 获取不到总数,限定页数上限 for page in range(1, 30): url = f"http://[email protected]:[email protected]/search?query={keywords_str}&isToggleOn=true&operator=AND&sort=date-desc&duration=1y&startDate={start_date}&endDate={end_date}&source=wsjie%2Cblog%2Cwsjsitesrch%2Cwsjpro%2Cautowire%2Capfeed&page={page}" try: # 模拟浏览器登录 options = webdriver.ChromeOptions() # 关闭可视化 options.add_argument('--headless') # 关闭图片视频加载 options.add_argument('blink-settings=imagesEnabled=false') driver = webdriver.Chrome(utils.DRIVER_PATH, options=options) driver.get(url) div = driver.find_element_by_id("root") soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser") finally: driver.quit() # 添加发布时间和url search_result = soup.find_all("div", class_=re.compile("search-result")) if len(search_result) > 0: for div in search_result: article = entity.Article() a = div.find_next("a", href=re.compile("https://www.wsj.com/articles/")) article.url = a.get("href") p = div.find_next("p", class_=re.compile("timestamp")) # April 9, 2021 04:06 pm ET article.date = p.string item_set.add(article) else: break global TOTALS TOTALS += len(item_set) # 解析正文和标题 for item in item_set: try: art = ns.Article(item.url, headers=get_header(), language='en') art.download() art.parse() item.title = art.title item.text = art.text if art.title.strip() == "" or art.text.strip() == "": title, publish_date, content = utils.get_title_time_content(item.url, header=get_header()) item.title = title item.text = content item.title_cn = utils.translate_with_webdriver(item.title) item.text_cn = utils.translate_with_webdriver(item.text) except Exception as exc: pass try: utils.write_xlsx_apend(file_path, [item, ]) except: pass time.sleep(1) item_set.clear()
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "+".join(keywords) item_set = set() url = f"https://www.bbc.co.uk/search?q={keywords_str}&page=1" r = requests.get(url=url, headers=get_header()) html_content = r.text soup = BeautifulSoup(html_content, "html.parser") match = re.search(">window.__INITIAL_DATA__=(.+);</script>", str(soup.find_all("script")[3])) if match: data = json.loads(match[1]) initial_results = jsonpath.jsonpath(data, "$..initialResults")[0] totals = initial_results["count"] for page in range(1, totals // 10): # 结果太多,限制条数 if page == 10: break try: time.sleep(1) url = f"https://www.bbc.co.uk/search?q={keywords_str}&page={page}" r = requests.get(url=url, headers=get_header()) html_content = r.text soup = BeautifulSoup(html_content, "html.parser") match = re.search(">window.__INITIAL_DATA__=(.+);</script>", str(soup.find_all("script")[3])) if match: data = json.loads(match[1]) initial_results = jsonpath.jsonpath( data, "$..initialResults")[0] for item in initial_results["items"]: # 17 April 2017 # 8 hours ago origin_date = utils.format_date( item["metadataStripItems"][0]["text"]) if origin_date != -1 and int( start_time) <= origin_date <= int(end_time): article = entity.Article() article.title = item["headline"] article.title_cn = utils.translate_with_webdriver( article.title) article.url = item["url"] article.date = str(origin_date) try: time.sleep(1) art = ns.Article(item["url"], headers=get_header()) art.download() art.parse() article.text = art.text if art.text.strip() == "": title, publish_date, content = utils.get_title_time_content( item["url"], header=get_header()) article.text = content article.text_cn = utils.translate_with_webdriver( article.text) except Exception as exc: continue item_set.add(article) except Exception as exc: continue finally: try: global TOTALS TOTALS += len(item_set) utils.write_xlsx_apend(file_path, item_set) item_set.clear() except: pass
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "+".join(keywords) item_set = set() url = f"https://olympics.com/tokyo-2020/en/search/?q={keywords_str}" try: # 模拟浏览器登录 options = webdriver.ChromeOptions() # 关闭可视化 options.add_argument('--headless') # 关闭图片视频加载 options.add_argument('blink-settings=imagesEnabled=false') driver = webdriver.Chrome(utils.DRIVER_PATH, options=options) driver.get(url) div = driver.find_element_by_xpath( "//body/main[@id='tk-main-content']/section[1]/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]" ) soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser") finally: driver.quit() # 添加发布时间和url search_result = soup.find_all("li", class_=re.compile("tk-cardsgroup")) if len(search_result) > 0: for li in search_result: article = entity.Article() a = li.find_next( "a", href=re.compile("https://olympics.com/tokyo-2020/en/news/")) article.url = a.get("href") h3 = li.find_next("h3", class_="tk-card__title") article.title = h3.get("title") article.title_cn = utils.translate_with_webdriver(article.title) origin_date = li.find_next( "time", class_="tk-card__pubdate").get("datetime") # 解析正文 try: article.date = origin_date[0:11].replace("-", "") if int(start_time) <= int(article.date) <= int(end_time): art = ns.Article(article.url, headers=get_header(), language='en') art.download() art.parse() article.text = art.text if art.text.strip() == "": title, publish_date, content = utils.get_title_time_content( article.url, header=get_header()) article.text = content article.text_cn = utils.translate_with_webdriver( article.text) else: continue except Exception as exc: pass time.sleep(1) item_set.add(article) try: global TOTALS TOTALS += len(item_set) utils.write_xlsx_apend(file_path, item_set) item_set.clear() except Exception as exc: pass
def start_crawl(file_path, keywords, start_time, end_time): keywords_str = "+".join(keywords) item_set = set() url = f"https://olympics.com/en/search/stories/?q={keywords_str}" try: # 模拟浏览器登录 options = webdriver.ChromeOptions() # 关闭可视化 options.add_argument('--headless') # 关闭图片视频加载 options.add_argument('blink-settings=imagesEnabled=false') driver = webdriver.Chrome(utils.DRIVER_PATH, options=options) driver.get(url) # driver.find_element_by_id("onetrust-accept-btn-handler").click() div = driver.find_element_by_xpath("//main[@id='content']") soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser") finally: driver.quit() # 添加发布时间和url search_result = soup.find_all("h1", class_=re.compile("article--title")) if len(search_result) > 0: for h1 in search_result: article = entity.Article() a = h1.find_next("a") href = a.get("href") if not href.startswith("https://olympics.com/"): href = "https://olympics.com" + href article.url = href article.title = a.string article.title_cn = utils.translate_with_webdriver(article.title) # 解析正文和时间 try: art = ns.Article(href, headers=get_header(), language='en') art.download() art.parse() date = art.publish_date.strftime("%Y%m%d") # 判断时间 if int(start_time) <= int(date) <= int(end_time): article.text = art.text if art.text.strip() == "": title, publish_date, content = utils.get_title_time_content( article.url, header=get_header()) article.text = content article.text_cn = utils.translate_with_webdriver( article.text) article.date = date else: continue except Exception as exc: pass time.sleep(1) item_set.add(article) try: global TOTALS TOTALS += len(item_set) utils.write_xlsx_apend(file_path, item_set) item_set.clear() except Exception as exc: pass
def save_to_excel(file_path, keywords, item_set):
    # Create an empty workbook and write the header row
    utils.create_xlsx_with_head(file_path=file_path, sheet_name='+'.join(keywords))
    # Append the data rows
    utils.write_xlsx_apend(file_path, item_set)
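

# Example usage (sketch), as it might appear at the bottom of one of the crawler modules
# above. The keywords, file name and date strings are placeholders; each start_crawl
# appends rows to the workbook itself via utils.write_xlsx_apend, so the workbook and
# header row only need to be created once up front.
if __name__ == "__main__":
    keywords = ["tokyo", "olympics"]
    file_path = "articles.xlsx"
    # Create the workbook with its header row, then let the crawler append batches.
    utils.create_xlsx_with_head(file_path=file_path, sheet_name="+".join(keywords))
    start_crawl(file_path, keywords, start_time="20210701", end_time="20210801")
    print(f"collected {TOTALS} articles into {file_path}")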