Example #1
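All six examples below appear to come from the same crawler module and share a preamble that this listing does not show. The sketch below reconstructs the imports and module-level names from how the snippets use them; entity, utils, driver_url and get_header are project-specific helpers, so the stubs here are assumptions, not the project's actual code.

import json
import re
import time

import jsonpath
import requests
import newspaper as ns
from bs4 import BeautifulSoup
from selenium import webdriver

import entity  # project module providing the Article data holder used below
import utils   # project helpers: translate_with_webdriver, write_xlsx_apend,
               # get_title_time_content, format_date, DRIVER_PATH, ...

TOTALS = 0  # running count of collected articles, updated by every crawler


def get_header():
    # Hypothetical stand-in: the real helper presumably returns request headers,
    # e.g. a rotating User-Agent string.
    return {"User-Agent": "Mozilla/5.0"}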
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "%20".join(keywords)
    url = f"https://api.foxnews.com/search/web?q={keywords_str}+-filetype:amp+-filetype:xml+more:pagemap:metatags-prism.section+more:pagemap:metatags-pagetype:article+more:pagemap:metatags-dc.type:Text.Article&siteSearch=foxnews.com&siteSearchFilter=i&sort=date:r:{start_time}:{end_time}"

    r = requests.get(url=url, headers=get_header())
    data = json.loads(r.text)
    total_results = int(data["searchInformation"]["totalResults"])
    item_set = set()

    for start_index in range(1, total_results + 1, 10):
        time.sleep(2)
        url = f"https://api.foxnews.com/search/web?q={keywords_str}+-filetype:amp+-filetype:xml+more:pagemap:metatags-prism.section+more:pagemap:metatags-pagetype:article+more:pagemap:metatags-dc.type:Text.Article&siteSearch=foxnews.com&siteSearchFilter=i&sort=date:r:{start_time}:{end_time}&start={start_index}"
        try:
            r = requests.get(url=url, headers=get_header())
            data = json.loads(r.text)

            # The reported total fluctuates between queries, so re-read it each time
            total_results = int(data["searchInformation"]["totalResults"])
            if start_index > total_results:
                break

            for j in data["items"]:
                item = j["pagemap"]["metatags"][0]
                article = entity.Article()
                article.title = item["dc.title"]
                article.title_cn = utils.translate_with_webdriver(
                    article.title)
                article.date = item["dc.date"]
                article.url = item["og:url"]
                item_set.add(article)
        except Exception as exc:
            continue

    global TOTALS
    TOTALS += len(item_set)
    # Fetch and parse the body text for each collected link
    for item in item_set:
        try:
            time.sleep(1)
            art = ns.Article(item.url, headers=get_header(), language='en')
            art.download()
            art.parse()
            item.text = art.text
            if art.text.strip() == "":
                title, publish_date, content = utils.get_title_time_content(
                    item.url, header=get_header())
                item.text = content
            item.text_cn = utils.translate_with_webdriver(item.text)

        except Exception as exc:
            continue
        finally:
            utils.write_xlsx_apend(file_path, [
                item,
            ])
Example #2
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "+".join(keywords)
    start_date = start_time[4:6] + "%2F" + start_time[
        6:8] + "%2F" + start_time[0:4]
    end_date = end_time[4:6] + "%2F" + end_time[6:8] + "%2F" + end_time[0:4]

    item_set = set()
    for page in range(1, 11):
        url = f"https://www.politico.com/search/{page}?adv=true&userInitiated=true&s=newest&q={keywords_str}&start={start_date}&end={end_date}"

        try:
            soup = driver_url(url)

            search_result = soup.find_all("article",
                                          class_=re.compile("story-frag"))
            if search_result and len(search_result) > 0:
                for li in search_result:
                    article = entity.Article()
                    a = li.find_next("header").find_next("a")
                    article.url = a.get("href")
                    article.title = a.string
                    article.title_cn = utils.translate_with_webdriver(
                        article.title)
                    article.date = li.find_next("time").get("datetime").split(
                        "T")[0].replace("-", "")
                    # Parse the article body
                    try:
                        art = ns.Article(article.url,
                                         headers=get_header(),
                                         language='en')
                        art.download()
                        art.parse()
                        article.text = art.text
                        if art.text.strip() == "":
                            title, publish_date, content = utils.get_title_time_content(
                                article.url, header=get_header())
                            article.text = content
                        article.text_cn = utils.translate_with_webdriver(
                            article.text)
                    except Exception as exc:
                        pass
                    time.sleep(1)
                    item_set.add(article)
            else:
                return
        except Exception:
            pass
        try:
            global TOTALS
            # Write each page's batch as it is collected to avoid one oversized write
            TOTALS += len(item_set)
            utils.write_xlsx_apend(file_path, item_set)
            item_set.clear()
        except Exception:
            pass
Example #3
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "%20".join(keywords)
    start_date = start_time[0: 4] + "%2F" + start_time[4: 6] + "%2F" + start_time[6: 8]
    end_date = end_time[0: 4] + "%2F" + end_time[4: 6] + "%2F" + end_time[6: 8]

    item_set = set()
    # The total result count is not exposed, so cap the number of pages
    for page in range(1, 30):
        url = f"http://[email protected]:[email protected]/search?query={keywords_str}&isToggleOn=true&operator=AND&sort=date-desc&duration=1y&startDate={start_date}&endDate={end_date}&source=wsjie%2Cblog%2Cwsjsitesrch%2Cwsjpro%2Cautowire%2Capfeed&page={page}"

        driver = None
        try:
            # Log in through a simulated (headless) browser session
            options = webdriver.ChromeOptions()
            # Disable the visible browser window
            options.add_argument('--headless')
            # Disable image and video loading
            options.add_argument('blink-settings=imagesEnabled=false')
            driver = webdriver.Chrome(utils.DRIVER_PATH, options=options)
            driver.get(url)

            div = driver.find_element_by_id("root")
            soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser")
        finally:
            if driver is not None:
                driver.quit()

        # Collect the publish time and URL of each result
        search_result = soup.find_all("div", class_=re.compile("search-result"))
        if len(search_result) > 0:
            for div in search_result:
                article = entity.Article()
                a = div.find_next("a", href=re.compile("https://www.wsj.com/articles/"))
                article.url = a.get("href")

                p = div.find_next("p", class_=re.compile("timestamp"))
                # April 9, 2021 04:06 pm ET
                article.date = p.string
                item_set.add(article)
        else:
            break

    global TOTALS
    TOTALS += len(item_set)
    # Fetch the body text and title for each collected URL
    for item in item_set:
        try:
            art = ns.Article(item.url, headers=get_header(), language='en')
            art.download()
            art.parse()
            item.title = art.title
            item.text = art.text
            if art.title.strip() == "" or art.text.strip() == "":
                title, publish_date, content = utils.get_title_time_content(item.url, header=get_header())
                item.title = title
                item.text = content
            item.title_cn = utils.translate_with_webdriver(item.title)
            item.text_cn = utils.translate_with_webdriver(item.text)
        except Exception as exc:
            pass
        try:
            utils.write_xlsx_apend(file_path, [item, ])
        except Exception:
            pass
        time.sleep(1)

    item_set.clear()
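Examples 3, 5 and 6 drive Chrome through the Selenium 3 style API (a positional driver path plus find_element_by_id / find_element_by_xpath), which was removed in Selenium 4. Against a current Selenium, the page-fetch step would need to look roughly like the sketch below; url and utils.DRIVER_PATH are taken from the example above, and the locator is only illustrative.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')                          # no visible browser window
options.add_argument('blink-settings=imagesEnabled=false')  # skip images to speed up loads
driver = webdriver.Chrome(service=Service(utils.DRIVER_PATH), options=options)
try:
    driver.get(url)
    root = driver.find_element(By.ID, "root")               # By.XPATH works the same way
    soup = BeautifulSoup(root.get_attribute('innerHTML'), "html.parser")
finally:
    driver.quit()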
Example #4
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "+".join(keywords)
    item_set = set()

    url = f"https://www.bbc.co.uk/search?q={keywords_str}&page=1"
    r = requests.get(url=url, headers=get_header())

    html_content = r.text
    soup = BeautifulSoup(html_content, "html.parser")
    match = re.search(">window.__INITIAL_DATA__=(.+);</script>",
                      str(soup.find_all("script")[3]))
    if match:
        data = json.loads(match[1])
        initial_results = jsonpath.jsonpath(data, "$..initialResults")[0]
        totals = initial_results["count"]

        for page in range(1, totals // 10):
            # Too many results; cap the number of pages fetched
            if page == 10:
                break
            try:
                time.sleep(1)
                url = f"https://www.bbc.co.uk/search?q={keywords_str}&page={page}"
                r = requests.get(url=url, headers=get_header())

                html_content = r.text
                soup = BeautifulSoup(html_content, "html.parser")
                match = re.search(">window.__INITIAL_DATA__=(.+);</script>",
                                  str(soup.find_all("script")[3]))
                if match:
                    data = json.loads(match[1])
                    initial_results = jsonpath.jsonpath(
                        data, "$..initialResults")[0]
                    for item in initial_results["items"]:
                        # 17 April 2017
                        # 8 hours ago
                        origin_date = utils.format_date(
                            item["metadataStripItems"][0]["text"])

                        if origin_date != -1 and int(
                                start_time) <= origin_date <= int(end_time):
                            article = entity.Article()
                            article.title = item["headline"]
                            article.title_cn = utils.translate_with_webdriver(
                                article.title)
                            article.url = item["url"]
                            article.date = str(origin_date)
                            try:
                                time.sleep(1)
                                art = ns.Article(item["url"],
                                                 headers=get_header())
                                art.download()
                                art.parse()
                                article.text = art.text
                                if art.text.strip() == "":
                                    title, publish_date, content = utils.get_title_time_content(
                                        item["url"], header=get_header())
                                    article.text = content
                                article.text_cn = utils.translate_with_webdriver(
                                    article.text)
                            except Exception as exc:
                                continue

                            item_set.add(article)
            except Exception as exc:
                continue
            finally:
                try:
                    global TOTALS
                    TOTALS += len(item_set)
                    utils.write_xlsx_apend(file_path, item_set)
                    item_set.clear()
                except Exception:
                    pass
Example #5
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "+".join(keywords)

    item_set = set()
    url = f"https://olympics.com/tokyo-2020/en/search/?q={keywords_str}"

    driver = None
    try:
        # Fetch the search page through a simulated (headless) browser
        options = webdriver.ChromeOptions()
        # Disable the visible browser window
        options.add_argument('--headless')
        # Disable image and video loading
        options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(utils.DRIVER_PATH, options=options)
        driver.get(url)

        div = driver.find_element_by_xpath(
            "//body/main[@id='tk-main-content']/section[1]/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]"
        )
        soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser")
    finally:
        if driver is not None:
            driver.quit()

    # Collect the publish time and URL of each result
    search_result = soup.find_all("li", class_=re.compile("tk-cardsgroup"))
    if len(search_result) > 0:
        for li in search_result:
            article = entity.Article()
            a = li.find_next(
                "a",
                href=re.compile("https://olympics.com/tokyo-2020/en/news/"))
            article.url = a.get("href")
            h3 = li.find_next("h3", class_="tk-card__title")
            article.title = h3.get("title")
            article.title_cn = utils.translate_with_webdriver(article.title)
            origin_date = li.find_next(
                "time", class_="tk-card__pubdate").get("datetime")
            # Filter by date, then parse the article body
            try:
                article.date = origin_date[0:10].replace("-", "")
                if int(start_time) <= int(article.date) <= int(end_time):
                    art = ns.Article(article.url,
                                     headers=get_header(),
                                     language='en')
                    art.download()
                    art.parse()
                    article.text = art.text
                    if art.text.strip() == "":
                        title, publish_date, content = utils.get_title_time_content(
                            article.url, header=get_header())
                        article.text = content
                    article.text_cn = utils.translate_with_webdriver(
                        article.text)
                else:
                    continue
            except Exception as exc:
                pass
            time.sleep(1)
            item_set.add(article)

    try:
        global TOTALS
        TOTALS += len(item_set)
        utils.write_xlsx_apend(file_path, item_set)
        item_set.clear()
    except Exception as exc:
        pass
Example #6
def start_crawl(file_path, keywords, start_time, end_time):
    keywords_str = "+".join(keywords)

    item_set = set()
    url = f"https://olympics.com/en/search/stories/?q={keywords_str}"

    driver = None
    try:
        # Fetch the search page through a simulated (headless) browser
        options = webdriver.ChromeOptions()
        # Disable the visible browser window
        options.add_argument('--headless')
        # Disable image and video loading
        options.add_argument('blink-settings=imagesEnabled=false')
        driver = webdriver.Chrome(utils.DRIVER_PATH, options=options)
        driver.get(url)

        # driver.find_element_by_id("onetrust-accept-btn-handler").click()
        div = driver.find_element_by_xpath("//main[@id='content']")
        soup = BeautifulSoup(div.get_attribute('innerHTML'), "html.parser")
    finally:
        if driver is not None:
            driver.quit()

    # Collect the publish time and URL of each result
    search_result = soup.find_all("h1", class_=re.compile("article--title"))
    if len(search_result) > 0:
        for h1 in search_result:
            article = entity.Article()
            a = h1.find_next("a")
            href = a.get("href")
            if not href.startswith("https://olympics.com/"):
                href = "https://olympics.com" + href
            article.url = href
            article.title = a.string
            article.title_cn = utils.translate_with_webdriver(article.title)
            # Parse the body text and publish date
            try:
                art = ns.Article(href, headers=get_header(), language='en')
                art.download()
                art.parse()
                date = art.publish_date.strftime("%Y%m%d")
                # Check that the date falls inside the requested range
                if int(start_time) <= int(date) <= int(end_time):
                    article.text = art.text
                    if art.text.strip() == "":
                        title, publish_date, content = utils.get_title_time_content(
                            article.url, header=get_header())
                        article.text = content
                    article.text_cn = utils.translate_with_webdriver(
                        article.text)
                    article.date = date
                else:
                    continue
            except Exception as exc:
                pass
            time.sleep(1)
            item_set.add(article)

    try:
        global TOTALS
        TOTALS += len(item_set)
        utils.write_xlsx_apend(file_path, item_set)
        item_set.clear()
    except Exception as exc:
        pass
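None of the examples show how start_crawl is invoked. Judging by the date slicing ([0:4], [4:6], [6:8]) and the integer comparisons, start_time and end_time are YYYYMMDD strings and keywords is a list of search terms; a hypothetical call, with every value below chosen purely for illustration, would look like:

if __name__ == "__main__":
    # Hypothetical invocation: argument formats inferred from the snippets above.
    start_crawl(
        file_path="articles.xlsx",        # workbook appended to by utils.write_xlsx_apend
        keywords=["tokyo", "olympics"],   # joined with "+" or "%20" into the search query
        start_time="20210401",            # YYYYMMDD, inclusive lower bound
        end_time="20210430",              # YYYYMMDD, inclusive upper bound
    )
    print(f"Collected {TOTALS} articles in total")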