Example #1
    def get_news_detail(self, response):
        item = PhilippinesItem()
        soup = bs(response.text, 'html.parser')

        title = soup.find("h1").text if soup.find("h1") else None

        pub_time_div = soup.find("div", class_="col-sm-6 cell-1")
        date_span = pub_time_div.find("span", class_="date") if pub_time_div else None
        # date string looks like "January 13, 2021, 1:19 pm"
        pub_time = Util.format_time4(date_span.get_text()) if date_span else None

        # fall back to None so item["images"] is always defined
        image_figure = soup.find("figure", class_="image align-right")
        image = image_figure.select_one("img").get("src") if image_figure else None

        body = ''
        p_list = soup.find("div", class_="page-content").select("p")
        for p in p_list[1:]:
            body += (p.text + '\n')

        abstract = ''
        category2 = ''

        item["title"] = title
        item["pub_time"] = pub_time
        item["images"] = image
        item["body"] = body
        item["abstract"] = abstract
        item["category2"] = category2

        yield item
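
Util.format_time4 is a project-specific helper that is not shown in these snippets. Going by the inline comment above, it receives strings like "January 13, 2021, 1:19 pm"; the sketch below is one hypothetical way it could normalize them (the output format is an assumption, not confirmed by the examples):

    # Hypothetical sketch of Util.format_time4 (assumption: it turns
    # "January 13, 2021, 1:19 pm" into "YYYY-MM-DD HH:MM:SS")
    from datetime import datetime

    def format_time4(raw):
        dt = datetime.strptime(raw.strip(), "%B %d, %Y, %I:%M %p")
        return dt.strftime("%Y-%m-%d %H:%M:%S")
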
Example #2
    def get_news_detail(self, response):
        item = PhilippinesItem()
        soup = bs(response.text, 'html.parser')

        title_tag = soup.find("h1", class_="tdb-title-text")
        title = title_tag.text if title_tag else None

        time_tag = soup.find("time", class_="entry-date updated td-module-date")
        pub_time = Util.format_time4(time_tag.text) if time_tag else None

        body = ''
        p_list = soup.find_all("p")
        for p in p_list[:-5]:
            body += (p.text + '\n')

        image = ''
        abstract = ''
        category2 = ''

        item["title"] = title
        item["pub_time"] = pub_time
        item["images"] = image
        item["body"] = body
        item["abstract"] = abstract
        item["category2"] = category2

        yield item
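
Both get_news_detail callbacks fill the same PhilippinesItem fields. The item class itself is defined elsewhere in the project; a minimal sketch, assuming only the six fields used above (the real class may declare more):

    import scrapy

    class PhilippinesItem(scrapy.Item):
        # fields populated by get_news_detail in the examples above
        title = scrapy.Field()
        pub_time = scrapy.Field()
        images = scrapy.Field()
        body = scrapy.Field()
        abstract = scrapy.Field()
        category2 = scrapy.Field()
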
Example #3
    def get_news_url(self, response):
        soup = bs(response.text, 'html.parser')

        div_list = soup.find_all(
            "div", class_="tdb_module_loop td_module_wrap td-animation-stack")
        for div in div_list:
            news_url = div.find(
                "h3", class_="entry-title td-module-title").select_one(
                    "a").get("href")
            yield scrapy.Request(news_url, callback=self.get_news_detail)

        div = soup.find("div", class_="page-nav td-pb-padding-side")
        a_list = div.select("a")
        # the page counter may contain a thousands separator, e.g. "1,234"
        last_page_num = int(a_list[-2].text.replace(",", ""))
        first_page_url = a_list[-1].get("href").rsplit("/", 3)[0]
        page_num = 1
        latest_time = soup.find(
            "time", class_="entry-date updated td-module-date").get_text()
        if self.time is None or Util.format_time3(
                Util.format_time4(latest_time)) >= int(self.time):
            # enqueue every listing page; Scrapy's duplicate filter drops
            # pages that were already scheduled
            while page_num < last_page_num:
                page_num += 1
                next_url = first_page_url + "/page/" + str(page_num) + "/"
                yield scrapy.Request(next_url,
                                     meta=response.meta,
                                     callback=self.get_news_url)
        else:
            self.logger.info('time cutoff reached')
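
Example #3 (and #4 below) stops paginating once Util.format_time3(...) drops below int(self.time), so format_time3 must map a formatted timestamp to an integer. A hedged sketch under the assumption that format_time4 yields "YYYY-MM-DD HH:MM:SS" and self.time is a yyyymmdd-style cutoff:

    # Hypothetical sketch of Util.format_time3 (assumption: reduces
    # "2021-01-13 13:19:00" to the integer 20210113 for the cutoff check)
    def format_time3(formatted):
        return int(formatted.split(" ")[0].replace("-", ""))
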
Example #4
    def get_news_url(self, response):
        soup = bs(response.text, 'html.parser')

        div = soup.find("div", class_="articles")
        div_list = div.find_all("div", class_="article media")
        for d in div_list:
            news_url = "http://www.pna.gov.ph" + d.select_one("a").get("href")
            yield scrapy.Request(news_url, callback=self.get_news_detail)

        if self.time is None or Util.format_time3(
                Util.format_time4(
                    soup.find("span",
                              class_="date").get_text())) >= int(self.time):
            nav = soup.find("nav", class_="pagination-area")
            li_list = nav.select("ul > li")
            # the '>' link in the second-to-last <li> leads to the next page
            next_a = li_list[-2].select_one("a")
            next_url = ("http://www.pna.gov.ph" + next_a.get("href")
                        if next_a.get("href") else None)
            if next_url:
                yield scrapy.Request(next_url,
                                     meta=response.meta,
                                     callback=self.get_news_url)
        else:
            self.logger.info('time cutoff reached')
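
All four methods are callbacks of Scrapy spiders whose class bodies are omitted here. A minimal skeleton of how the pna.gov.ph callbacks could be wired together; the spider name, the start URL path, and the time argument handling are assumptions for illustration:

    import scrapy
    from bs4 import BeautifulSoup as bs

    class PnaSpider(scrapy.Spider):
        name = "pna"                             # hypothetical spider name
        start_urls = ["http://www.pna.gov.ph/"]  # placeholder listing page

        def __init__(self, time=None, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # optional cutoff, e.g. scrapy crawl pna -a time=20210113
            self.time = time

        def parse(self, response):
            # hand the listing page to the URL extractor / paginator
            yield from self.get_news_url(response)

        # get_news_url and get_news_detail as shown in the examples above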