Exemplo n.º 1
0
class AutoHomeArticlePVSpider(CrawlSpider):
    """Crawl autohome.com.cn article page-view (PV) counts.

    Walks the article ids returned by StructureArticleID one request at a
    time, yielding one ArticlePVScrapyItem per article.
    """

    name = "article_pv"
    # Sequence of tuples; index 0 of each entry is the article id.
    article_list = StructureArticleID().get_article_id()
    article_index = 0  # position of the article currently being fetched
    base_url = "https://cont.app.autohome.com.cn/cont_v8.5.0/cont/articlepv?callback=updateArticlePV&pm=1&ids=%s"
    start_urls = [base_url % (article_list[article_index][0])]

    def parse(self, response):
        # Re-issue the start URL so all extraction happens in
        # parse_article_pv_items with a fresh item carried in meta.
        item = ArticlePVScrapyItem()
        print(response.url)
        yield Request(url=response.url, callback=self.parse_article_pv_items, meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_article_pv_items(self, response):
        """Extract the PV payload from the JSONP response, then advance."""
        item = response.meta['item']
        # The endpoint returns JSONP; pull the embedded JSON object out.
        # Fix: the original called .group() on the match unconditionally,
        # so an unexpected body raised AttributeError and stopped the
        # whole crawl chain. Skip the bad page but keep paginating.
        match = re.search(r"{[^}]+}][^}]+}", response.body.decode())
        if match is not None:
            content = json.loads(match.group(), strict=False)
            result = content["result"][0]
            item["article_id"] = result["id"]
            item["pv_count"] = result["pvcount"]
            item["update_time"] = get_current_date()
            yield item
        # Move on to the next article id, if any remain.
        self.article_index += 1
        if len(self.article_list) > self.article_index:
            url = self.base_url % (self.article_list[self.article_index][0])
            yield Request(url=url, callback=self.parse_article_pv_items, meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
Exemplo n.º 2
0
class AutoHomeArticleContentSpider(CrawlSpider):
    """Fetch the body text of autohome.com.cn articles one id at a time."""

    name = "article_content"
    # Sequence of (article_id, version) pairs to visit in order.
    article_id_list = StructureArticleID().get_article_id()
    article_id_index = 0
    base_url = "https://cont.app.autohome.com.cn/cont_v8.5.0/content/news/newscontent-pm2-n%s-t0-rct1-ish0-ver%s.json"
    start_urls = [
        base_url % (article_id_list[article_id_index][0],
                    article_id_list[article_id_index][1])
    ]

    def parse(self, response):
        # Hand off to the item parser with a fresh item carried in meta.
        fresh_item = ArticleContentScrapyItem()
        yield Request(url=response.url,
                      callback=self.parse_article_content_items,
                      meta={"item": copy.deepcopy(fresh_item)},
                      dont_filter=True)

    def parse_article_content_items(self, response):
        """Build one content item, then schedule the next article id."""
        item = response.meta['item']
        current = self.article_id_list[self.article_id_index]
        item["id"] = current[0]
        item["content"] = extract_article(response.url)
        yield item
        # Advance the cursor and request the next article, if any remain.
        self.article_id_index += 1
        if self.article_id_index < len(self.article_id_list):
            nxt = self.article_id_list[self.article_id_index]
            yield Request(url=self.base_url % (nxt[0], nxt[1]),
                          callback=self.parse_article_content_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
Exemplo n.º 3
0
class ArticleSeriesSpider(Spider):
    """Extract the car-series ids referenced by each article.

    NOTE(review): the spider name 'article_serices' looks like a typo of
    'article_series'; it is kept unchanged because schedulers may invoke
    the spider by this exact name.
    """

    name = 'article_serices'
    base_url = "https://cont.app.autohome.com.cn/cont_v9.2.5/content/news/newscontent-pm2-n%s-t0-rct0-ver%s.json"
    article_list = StructureArticleID().get_article_id()
    article_index = 0
    start_urls = [base_url % article_list[article_index]]

    def parse(self, response):
        item = ArticleSeriesItems()
        try:
            body_text = response.body.decode()
        except UnicodeDecodeError:
            # Keep the original best-effort behavior: an undecodable body
            # just yields no items for this article.
            body_text = ""
        # The page embeds its payload as setValue("<url-encoded json>").
        # Fix: capture the payload with a group. The original used
        # lstrip('setValue("') / rstrip('")'), which strip *character
        # sets* and could eat leading/trailing data characters; the
        # regex was also a non-raw string with an invalid '\(' escape.
        match = re.search(r'setValue\("(.+)"\)', body_text)
        if match is not None:
            unquote_set_value = parse.unquote(match.group(1))  # decode %-escapes
            series_list = json.loads(unquote_set_value)["serieslist"]
            for series in series_list:
                item["id"] = self.article_list[self.article_index][0]  # article id
                item["series_id"] = series['seriesid']  # related car-series id
                yield item

        self.article_index += 1

        if self.article_index < len(self.article_list):  # next page
            url = self.base_url % self.article_list[self.article_index]
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
Exemplo n.º 4
0
class AutoHomeArticleCommentSpider(CrawlSpider):
    """Crawl the comments of each article, 20 comments per page."""

    name = "article_comment"
    article_list = StructureArticleID().get_article_id()
    article_index = 0
    page_index = 1  # 1-based page counter within the current article
    last_time = 0   # id of the last comment seen; serves as the paging cursor
    base_url = "https://newsnc.app.autohome.com.cn/reply_v7.9.0/news/comments-pm2-n%s-o0-s20-lastid%s-t0.json"
    start_urls = [base_url % (article_list[article_index][0], last_time)]

    def parse(self, response):
        # Re-issue the start URL so extraction happens in
        # parse_article_comment_items with a fresh item carried in meta.
        item = ArticleCommentScrapyItem()
        print(response.url)
        yield Request(url=response.url,
                      callback=self.parse_article_comment_items,
                      meta={"item": copy.deepcopy(item)},
                      dont_filter=True)

    def parse_article_comment_items(self, response):
        """Yield one item per comment, then page on or start the next article."""
        item = response.meta['item']
        content = json.loads(response.body.decode(), strict=False)
        comment_list = content["result"]["list"]
        for comment in comment_list:
            item["article_id"] = self.article_list[self.article_index][0]
            item["comment_id"] = comment["id"]
            item["floor"] = comment["floor"]
            item["user_id"] = comment["nameid"]
            item["publish_time"] = comment["time"]
            item["content"] = comment["content"]
            item["update_time"] = get_current_date()
            yield item
        # Fix: only advance the cursor when the page actually contained
        # comments — on an empty page item["comment_id"] was never set
        # and the original raised KeyError here.
        if comment_list:
            self.last_time = item["comment_id"]
        self.page_index += 1
        # Fix: an empty page must not request the same URL again (the
        # cursor did not move), or the spider would loop on it forever;
        # fall through to the next article instead.
        if comment_list and math.ceil(content["result"]["totalcount"] / 20) >= self.page_index:
            url = self.base_url % (self.article_list[self.article_index][0],
                                   self.last_time)
            yield Request(url=url,
                          callback=self.parse_article_comment_items,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
        else:
            # All pages of this article consumed; reset the paging state
            # and start on the next article, if any remain.
            self.article_index += 1
            if self.article_index < len(self.article_list):
                self.page_index = 1
                self.last_time = 0
                url = self.base_url % (
                    self.article_list[self.article_index][0], self.last_time)
                print(url)
                yield Request(url=url,
                              callback=self.parse_article_comment_items,
                              meta={"item": copy.deepcopy(item)},
                              dont_filter=True)