Exemplo n.º 1
0
    def parse_category(self, response):
        """Crawl one category listing page.

        Yields a request for every valid article link found on the page,
        carrying only the ``category`` meta value. Afterwards, as long as
        ``page_idx`` is below ``self.page_per_category_limit`` and the page
        listed at least one link, yields a request for the next listing page
        built from ``category_url_fmt`` and the incremented ``page_idx``.
        """
        state = dict(response.meta)

        # Article links listed on this page
        links = response.css(
            "div.main-content .late-news-lst li .late-news-tit a::attr(href)"
        ).extract()
        link_count = len(links)

        self.logger.info("Parse url {}, Num Article urls : {}".format(
            response.url, link_count))

        for link in links:
            if not utils.is_valid_url(link):
                continue
            yield Request(link,
                          self.parse_article,
                          meta={"category": state["category"]},
                          errback=self.errback)

        # Pagination stops at the page limit or on an empty listing
        has_more = (state["page_idx"] < self.page_per_category_limit
                    and link_count > 0)
        if not has_more:
            return

        state["page_idx"] += 1
        following_url = state["category_url_fmt"].format(state["page_idx"])
        yield Request(following_url,
                      self.parse_category,
                      meta=state,
                      errback=self.errback)
Exemplo n.º 2
0
    def parse_category(self, response):
        """Crawl one category listing page.

        Yields a request for each valid article URL found on the page
        (``self.base_url`` is prepended to every extracted href), forwarding
        the page's meta. Then, while ``page_idx`` is below
        ``self.page_per_category_limit`` and the page listed at least one
        link, yields a request for the next listing page.

        Meta keys read here: ``page_idx``, ``category_url_fmt`` and
        ``category_id``.
        """
        # Work on a copy: in Scrapy ``response.meta`` is the originating
        # request's own dict, so bumping ``page_idx`` in place would modify
        # that already-processed request as a side effect.
        meta = dict(response.meta)

        articles_urls = response.css("li>div> a::attr(href)").extract()

        # Navigate to article
        self.logger.info("Parse url {}, Num Article urls : {}".format(
            response.url, len(articles_urls)))
        for article_url in articles_urls:
            article_url = self.base_url + article_url

            if utils.is_valid_url(article_url):
                yield Request(article_url,
                              self.parse_article,
                              meta=meta,
                              errback=self.errback)

        # Navigate to next page (an empty listing ends the pagination)
        if meta["page_idx"] < self.page_per_category_limit and articles_urls:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(
                meta["category_id"], meta["page_idx"])
            yield Request(next_page,
                          self.parse_category,
                          meta=meta,
                          errback=self.errback)
Exemplo n.º 3
0
    def parse_category_type2(self, response):
        """Crawl one "type 2" category listing page.

        Example page: thanhnien.vn/giao-duc

        Yields a request (handled by ``parse_article_type2``) for each valid
        article URL, built by prepending ``meta["base_url"]`` to the extracted
        href and carrying only the ``category`` meta value. Then, while
        ``page_idx`` is below ``self.page_per_category_limit`` and the page
        listed at least one link, yields a request for the next listing page.

        Meta keys read here: ``base_url``, ``category``, ``page_idx`` and
        ``category_url_fmt``.
        """
        # Work on a copy: in Scrapy ``response.meta`` is the originating
        # request's own dict, so bumping ``page_idx`` in place would modify
        # that already-processed request as a side effect.
        meta = dict(response.meta)

        # Navigate to article
        article_urls = response.css(
            ".cate-content .zone--timeline article>a::attr(href)").extract()

        self.logger.info("Parse url {}, Num Article urls : {}".format(
            response.url, len(article_urls)))
        for article_url in article_urls:
            article_url = meta["base_url"] + article_url
            if utils.is_valid_url(article_url):
                yield Request(article_url,
                              self.parse_article_type2,
                              meta={"category": meta["category"]},
                              errback=self.errback)

        # Navigate to next page (an empty listing ends the pagination)
        if meta["page_idx"] < self.page_per_category_limit and article_urls:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(meta["page_idx"])
            yield Request(next_page,
                          self.parse_category_type2,
                          meta=meta,
                          errback=self.errback)
Exemplo n.º 4
0
    def parse_category(self, response):
        """Crawl one category listing page.

        Collects article links from the "featured" and "sidebar_1" sections,
        drops duplicates, and yields a request for each valid link carrying
        only the ``category`` meta value. Then, while ``page_idx`` is below
        ``self.page_per_category_limit`` and the page listed at least one
        link, yields a request for the next listing page.

        Meta keys read here: ``category``, ``page_idx`` and
        ``category_url_fmt``.
        """
        # Work on a copy: in Scrapy ``response.meta`` is the originating
        # request's own dict, so bumping ``page_idx`` in place would modify
        # that already-processed request as a side effect.
        meta = dict(response.meta)

        # Navigate to article
        collected = []
        for query in (
                "section.featured .title_news a:first-child::attr(href)",
                "section.sidebar_1 .title_news a:first-child::attr(href)"):
            collected.extend(response.css(query).extract())
        # De-duplicate while keeping first-seen order where dicts are ordered
        # (Python 3.7+); a plain set() gives an arbitrary order per run.
        article_urls = list(dict.fromkeys(collected))

        self.logger.info("Parse url {}, Num Article urls : {}".format(
            response.url, len(article_urls)))
        for article_url in article_urls:
            if utils.is_valid_url(article_url):
                yield Request(article_url,
                              self.parse_article,
                              meta={"category": meta["category"]},
                              errback=self.errback)

        # Navigate to next page (an empty listing ends the pagination)
        if meta["page_idx"] < self.page_per_category_limit and article_urls:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(meta["page_idx"])
            yield Request(next_page,
                          self.parse_category,
                          meta=meta,
                          errback=self.errback)
Exemplo n.º 5
0
    def parse_category(self, response):
        """Crawl one category listing page.

        Gathers article hrefs from two listing selectors, prefixes each with
        ``self.base_url`` and yields a request for every resulting valid URL,
        carrying only the ``category`` meta value. Afterwards, as long as
        ``page_idx`` is below ``self.page_per_category_limit`` and the page
        listed at least one link, yields a request for the next listing page.
        """
        state = dict(response.meta)

        # Article links, highlighted list first and regular items second
        hrefs = []
        for query in (".contentpage .listhlv21 a::attr(href)",
                      ".contentpage .listitem .item-bt>a::attr(href)"):
            hrefs.extend(response.css(query).extract())
        href_count = len(hrefs)

        self.logger.info("Parse url {}, Num Article urls : {}".format(
            response.url, href_count))

        for href in hrefs:
            full_url = self.base_url + href
            if not utils.is_valid_url(full_url):
                continue
            yield Request(full_url,
                          self.parse_article,
                          meta={"category": state["category"]},
                          errback=self.errback)

        # Pagination stops at the page limit or on an empty listing
        has_more = (state["page_idx"] < self.page_per_category_limit
                    and href_count > 0)
        if not has_more:
            return

        state["page_idx"] += 1
        following_url = state["category_url_fmt"].format(state["page_idx"])
        yield Request(following_url,
                      self.parse_category,
                      meta=state,
                      errback=self.errback)
Exemplo n.º 6
0
    def parse_category(self, response):
        """Crawl one category listing delivered as a prefixed JSON payload.

        The first text node of the response is read, the leading
        ``len("retvar =")`` characters are dropped and the remainder is
        decoded as JSON. For every decoded entry a request is yielded for its
        ``link`` (when valid), carrying the category, title, intro and
        reformatted publish time as meta. Then, while ``page_idx`` is below
        ``self.page_per_category_limit`` and the payload was not empty, a
        request for the next listing page is yielded.

        Meta keys read here: ``category``, ``page_idx``, ``category_url_fmt``
        and ``c_query``.
        """
        # Work on a copy: in Scrapy ``response.meta`` is the originating
        # request's own dict, so bumping ``page_idx`` in place would modify
        # that already-processed request as a side effect.
        meta = dict(response.meta)

        prefix_str = "retvar ="
        raw = response.css("::text").extract_first()
        if raw is None:
            # No text node at all: slicing None would raise TypeError, so
            # report the empty payload and stop paginating this category.
            self.logger.warning(
                "Parse url {}, empty category payload".format(response.url))
            return
        data = json.loads(raw[len(prefix_str):])

        # Navigate to article
        self.logger.info("Parse url {}, Num Article urls : {}".format(response.url, len(data)))
        for article in data:
            # Named ``published`` so the stdlib ``time`` module name is not shadowed.
            published = '_'.join([article["publishdate"], article["publishtime"]])
            published = self.transform_time_fmt(published, src_fmt="%d/%m/%Y_%H:%M")
            article_info = {
                "category": meta["category"],
                "title": article["title"],
                "intro": article["lead"],
                "time": published
            }
            article_url = article["link"]
            if utils.is_valid_url(article_url):
                yield Request(article_url, self.parse_article, meta=article_info, errback=self.errback)

        # Navigate to next page (an empty payload ends the pagination)
        if meta["page_idx"] < self.page_per_category_limit and len(data) > 0:
            meta["page_idx"] += 1
            next_page = meta["category_url_fmt"].format(meta["c_query"], meta["page_idx"])
            yield Request(next_page, self.parse_category, meta=meta, errback=self.errback)