# Example 1
class AutoHomeKoubeiArticleSpider(scrapy.Spider):
    """Crawl the per-scene feeling blocks (feelingname/feeling/score) of
    Autohome koubei (word-of-mouth) reviews, one review id per request."""

    name = 'auto_home_koubei_article'
    # Review ids to visit, fetched once at class-definition time; requests
    # are issued sequentially by advancing koubei_index.
    koubei_list = StructureStartUrl().get_koubei_id()
    koubei_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    start_urls = [base_url % koubei_list[koubei_index]]

    def parse(self, response):
        """Yield one KoubeiArticleItem per "*Scene" node of the review JSON,
        then schedule the next review id, if any."""
        content = json.loads(response.body.decode())
        node_value = content["result"]
        for node in node_value:
            # Scene nodes are keyed like "driveScene", "spaceScene", ...
            if re.search(r"[a-zA-Z]+Scene", node):
                # Build a fresh item for every yield: the original reused a
                # single mutable item, so later iterations could clobber
                # items still queued in an asynchronous pipeline.
                item = KoubeiArticleItem()
                item["koubei_id"] = content["result"]["eid"]
                item["feeling_name"] = node_value[node]["feelingname"]
                item["feeling"] = node_value[node]["feeling"]
                item["score"] = node_value[node]["score"]
                item["time"] = get_current_date()
                yield item

        # Pagination: advance to the next review id.
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            url = self.base_url % self.koubei_list[self.koubei_index]
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
# Example 2
class KoubeiHomeSpider(Spider):
    """Crawl purchase-location info (province, city, dealer) for each
    koubei review id."""

    name = "koubei_home"
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    series_id_list = StructureStartUrl().get_home_koubei_id()
    # series_id_list = [(94353,)]
    series_index = 0
    start_urls = [base_url % series_id_list[series_index][0]]

    # Parse one review's purchase info, then request the next id.
    def parse(self, response):
        """Yield a KoubeiHomeItem for the current review id.

        The original version swallowed every exception with a bare
        ``print`` and a lone ``yield``, and — because pagination lived
        inside the ``try`` — never advanced ``series_index`` on error,
        silently halting the whole crawl at the first bad response.
        Pagination now always runs, and failures are logged.
        """
        try:
            content = json.loads(response.body.decode())
            result = content["result"]
            item = KoubeiHomeItem()
            item["koubei_id"] = self.series_id_list[self.series_index][0]
            item["pid"] = result["boughtprovince"]
            item["cid"] = result["boughtcity"]
            item["dealer_id"] = result["dealer"]
            yield item
        except Exception:
            # Best-effort per id: record which id failed and keep crawling.
            self.logger.exception(
                "failed to parse koubei %s",
                self.series_id_list[self.series_index][0])

        # Pagination: always move on to the next id, even after an error.
        self.series_index += 1
        if self.series_index < len(self.series_id_list):
            url = self.base_url % self.series_id_list[self.series_index][0]
            yield Request(url=url, callback=self.parse, dont_filter=True)
# Example 3
class AutoHomeKoubeiTagSpider(scrapy.Spider):
    """Crawl structured review tags (tag id, Combination, SummaryKey) for
    every car series."""

    name = 'auto_home_koubei_tag'
    # Series ids to visit; requests are issued sequentially via series_index.
    series_list = StructureStartUrl().get_series_id()
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/seriesalibiinfos-pm2-ss%s-st0-p1-s20-isstruct1-o0.json"
    series_index = 0
    start_urls = [base_url % series_list[series_index][0]]

    def parse(self, response):
        """Yield one KoubeiTagItem per non-zero SummaryKey entry, then
        schedule the next series id, if any."""
        # strict=False tolerates control characters embedded in the payload.
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        # Iterating an empty structuredlist is a no-op, so no length guard
        # is needed.
        for structure in result["structuredlist"]:
            for summary in structure["Summary"]:
                if summary["SummaryKey"] != 0:
                    # Fresh item per yield: the original reused a single
                    # mutable item across yields, which can clobber items
                    # still queued in an asynchronous pipeline.
                    item = KoubeiTagItem()
                    item["series_id"] = self.series_list[self.series_index][0]
                    item["tag_id"] = structure["id"]
                    item["combination"] = summary["Combination"]
                    item["summary_key"] = summary["SummaryKey"]
                    yield item

        # Pagination: advance to the next series id.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
# Example 4
class AutoHomeKoubeiTagNumSpider(scrapy.Spider):
    """Crawl tag volumes (SummaryKey -> Volume) from the first structured
    block of each series' koubei listing."""

    name = 'auto_home_koubei_tag_num'
    series_list = StructureStartUrl().get_series_id()  # koubei data for series on sale
    series_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/seriesalibiinfos-pm2-ss%s-st0-p1-s20-isstruct1-o0.json"
    start_urls = [base_url % series_list[series_index][0]]

    def parse(self, response):
        """Yield one KoubeiTagNumItem per non-zero SummaryKey of the first
        structured block, then schedule the next series id, if any."""
        content = json.loads(response.body.decode())
        result = content["result"]
        # Only the first structured block carries the per-tag volumes.
        if result["structuredlist"]:
            structure = result["structuredlist"][0]
            for summary in structure["Summary"]:
                if summary["SummaryKey"] != 0:
                    # Fresh item per yield: reusing one mutable item lets
                    # later iterations clobber items still sitting in an
                    # asynchronous pipeline.
                    item = KoubeiTagNumItem()
                    item["summary_key"] = summary["SummaryKey"]
                    item["volume"] = summary["Volume"]
                    item["time"] = get_current_date()
                    yield item

        # Pagination: advance to the next series id.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 dont_filter=True)
# Example 5
class AutoHomeKoubeiReadSpider(scrapy.spiders.Spider):
    """Crawl the read/helpful/comment counters of every koubei review."""

    name = "koubei_read"
    # Review ids to walk through, one request per id.
    koubei_list = StructureStartUrl().get_koubei_id()
    koubei_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    start_urls = [base_url % (koubei_list[koubei_index])]

    def parse(self, response):
        """Bootstrap: re-request the start URL so the real parser receives
        an item instance through the request meta."""
        seed = KoubeiReadScrapyItem()
        request = Request(url=response.url,
                          callback=self.parse_koubei_article_item,
                          meta={"item": copy.deepcopy(seed)},
                          dont_filter=True)
        yield request

    def parse_koubei_article_item(self, response):
        """Fill in one review's counters, then chain to the next id."""
        article = response.meta['item']
        payload = json.loads(response.body.decode())
        data = payload["result"]
        article["koubei_id"] = data["eid"]
        article["visit_count"] = data["visitcount"]
        article["helpful_count"] = data["helpfulcount"]
        article["comment_count"] = data["commentcount"]
        article["time"] = get_current_date()
        yield article

        # Pagination: stop once every review id has been visited.
        self.koubei_index += 1
        if self.koubei_index >= len(self.koubei_list):
            return
        next_url = self.base_url % (self.koubei_list[self.koubei_index])
        yield Request(url=next_url,
                      callback=self.parse_koubei_article_item,
                      meta={"item": copy.deepcopy(article)},
                      dont_filter=True)
# Example 6
class AutoHomeKoubeiSpider(scrapy.spiders.Spider):
    """Crawl the paginated koubei (review) listing of every car series."""

    name = "koubei"
    base_url = "https://koubei.app.autohome.com.cn/autov9.1.0/alibi/seriesalibiinfos-pm2-ss%s-st0-p%s-s20-isstruct0-o0.json"
    series_id_list = StructureStartUrl().get_series_id()
    # series_id_list = [(145,)]
    page_index = 1   # 1-based page cursor within the current series
    url_index = 0    # cursor into series_id_list

    start_urls = [base_url % (series_id_list[url_index][0], page_index)]

    # Bootstrap the paginated listing crawl.
    def parse(self, response):
        """Hand the start URL to the listing parser."""
        yield Request(url=response.url,
                      callback=self.parse_koubei_item,
                      dont_filter=True)

    # Parse one page of the review listing.
    def parse_koubei_item(self, response):
        """Yield one KoubeiScrapyItem per review on this page, then request
        either the next page of this series or page 1 of the next series."""
        content = json.loads(response.body.decode())
        result = content["result"]
        reviews = result["list"]
        # Mirror the original contract: page_num is 0 when the listing is
        # empty, otherwise the API's pagecount.
        page_count = result["pagecount"] if reviews else 0
        for koubei in reviews:
            # Fresh item per yield: the original reused one mutable item,
            # which can clobber items still queued in an async pipeline.
            item = KoubeiScrapyItem()
            item["koubei_id"] = koubei["Koubeiid"]
            item["spec_id"] = koubei["specid"]
            item["user_id"] = koubei["userid"]
            item["buy_price"] = koubei["buyprice"].rstrip("万")
            item["post_time"] = koubei["posttime"]
            item["page_num"] = page_count
            yield item

        # Pagination: next page of this series, else first page of the
        # next series.
        self.page_index += 1
        if self.page_index <= page_count:
            yield self._listing_request()
        else:
            self.url_index += 1
            if self.url_index < len(self.series_id_list):
                self.page_index = 1
                yield self._listing_request()

    def _listing_request(self):
        """Build the listing request for the current (series, page) cursor."""
        url = self.base_url % (self.series_id_list[self.url_index][0],
                               self.page_index)
        return Request(url=url,
                       callback=self.parse_koubei_item,
                       dont_filter=True)
# Example 7
class AutoHomeKoubeiCommentSpider(scrapy.spiders.Spider):
    """Crawl the comments attached to every koubei review, 20 per page."""

    name = "koubei_comment"
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/news/koubeicomments-pm2-n%s-s20-lastid%s.json"
    comment_list = StructureStartUrl().get_koubei_id()
    comment_index = 0
    last_id = 0
    # NOTE(review): retained for backward compatibility; the original
    # appended every comment id here for the lifetime of the crawl even
    # though only the last id of the current page is ever read — an
    # unbounded memory leak.  A local now tracks the last id instead.
    comment_id_list = []
    start_urls = [base_url % (comment_list[comment_index], last_id)]

    def parse(self, response):
        """Bootstrap: hand the start URL to the comment parser."""
        yield Request(url=response.url,
                      callback=self.parse_koubei_comment_item,
                      dont_filter=True)

    def parse_koubei_comment_item(self, response):
        """Yield one KoubeiCommentScrapyItem per comment on this page, then
        request the next page of comments or the next review id."""
        content = json.loads(response.body.decode())
        result = content["result"]
        comments = result["list"]
        last_comment_id = None
        for comment in comments:
            # Fresh item per yield: reusing one mutable item lets later
            # iterations clobber items still queued in an async pipeline.
            item = KoubeiCommentScrapyItem()
            item["koubei_id"] = self.comment_list[self.comment_index]
            item["id"] = comment["id"]
            item["user_id"] = comment["nameid"]
            item["content"] = comment["content"]
            item["carname"] = comment["carname"]
            item["create_time"] = comment["time"]
            item["time"] = get_current_date()
            last_comment_id = str(comment["id"])
            yield item

        # A full page (20 comments) whose pageid matches the last comment
        # id means another page follows for this review.
        if len(comments) == 20 and result["pageid"] == last_comment_id:
            url = self.base_url % (self.comment_list[self.comment_index],
                                   last_comment_id)
            yield Request(url=url,
                          callback=self.parse_koubei_comment_item,
                          dont_filter=True)
        else:
            # Move on to the first comment page of the next review id.
            self.comment_index += 1
            if self.comment_index < len(self.comment_list):
                self.last_id = 0
                url = self.base_url % (self.comment_list[self.comment_index],
                                       self.last_id)
                yield Request(url=url,
                              callback=self.parse_koubei_comment_item,
                              dont_filter=True)
# Example 8
class AutoHomeKoubeiRankSpider(scrapy.spiders.Spider):
    """Crawl the koubei series ranking for every vehicle level (category)."""

    name = "koubei_rank"
    base_url = "https://koubei.app.autohome.com.cn/autov8.8.5/alibi/alibiseriesrank-pm2-categoryid%s-struct0-order0-price0.json"
    level_list = StructureStartUrl().get_level_id()
    url_index = 0
    # BUG FIX: the original seeded start_urls with the *index* itself
    # (base_url % url_index == categoryid0), so level_list[0] was never
    # crawled; every other spider in this file interpolates the list value.
    start_urls = [base_url % level_list[url_index]]

    def parse(self, response):
        """Yield one KoubeiRankScrapyItem per ranked series, then request
        the next level id, if any."""
        result = json.loads(response.body.decode())["result"]
        for series in result["serieslist"]:
            # Fresh item per yield: reusing one mutable item can clobber
            # items still queued in an asynchronous pipeline.
            item = KoubeiRankScrapyItem()
            item["level_id"] = result["categoryid"]
            item["series_id"] = series["seriesid"]
            item["koubei_rank"] = series["rank"]
            item["koubei_score"] = series["score"]
            item["koubei_evaluation_count"] = series["evaluationcount"]
            item["koubei_update_time"] = get_current_date()
            yield item
            # NOTE: the original called time.sleep(1) here, which blocks
            # Scrapy's single-threaded event loop and stalls every other
            # in-flight request.  Throttle with the DOWNLOAD_DELAY setting
            # instead.
        self.url_index += 1
        if self.url_index < len(self.level_list):
            url = self.base_url % self.level_list[self.url_index]
            yield Request(url=url, callback=self.parse)