class AutoHomeKoubeiArticleSpider(scrapy.Spider):
    """Visits every koubei (word-of-mouth) review id and emits one item per
    structured rating section (keys like ``spaceScene``) found in the
    review-detail JSON.
    """
    name = 'auto_home_koubei_article'
    # Review ids to crawl; fetched once at class-definition time.
    koubei_list = StructureStartUrl().get_koubei_id()
    koubei_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    start_urls = [base_url % koubei_list[koubei_index]]

    # Rating sections are keyed "<letters>Scene" in the result node.
    # Compiled once instead of calling re.search() with a pattern string
    # on every key of every response.
    SCENE_KEY_RE = re.compile(r"[a-zA-Z]+Scene")

    def parse(self, response):
        """Emit one item per rating section, then schedule the next review."""
        content = json.loads(response.body.decode())
        node_value = content["result"]
        for node in node_value:
            if self.SCENE_KEY_RE.search(node):
                # Fix: build a fresh item per scene.  The original mutated and
                # re-yielded a single shared instance, so downstream pipeline /
                # export steps that defer processing could observe only the
                # values of the last scene.
                item = KoubeiArticleItem()
                item["koubei_id"] = content["result"]["eid"]
                item["feeling_name"] = node_value[node]["feelingname"]
                item["feeling"] = node_value[node]["feeling"]
                item["score"] = node_value[node]["score"]
                item["time"] = get_current_date()
                yield item
        # Pagination: advance to the next review id.
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            url = self.base_url % (self.koubei_list[self.koubei_index])
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
class KoubeiHomeSpider(Spider):
    """Extracts purchase province/city and dealer id from each review's
    detail JSON, walking the precomputed list of review ids one by one.
    """
    name = "koubei_home"
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    series_id_list = StructureStartUrl().get_home_koubei_id()
    # series_id_list = [(94353,)]
    series_index = 0
    start_urls = [base_url % series_id_list[series_index][0]]

    # Parse one review-detail page, then schedule the next id.
    def parse(self, response):
        item = KoubeiHomeItem()
        koubei_id = self.series_id_list[self.series_index][0]
        try:
            result = json.loads(response.body.decode())["result"]
            item["koubei_id"] = koubei_id
            item["pid"] = result["boughtprovince"]
            item["cid"] = result["boughtcity"]
            item["dealer_id"] = result["dealer"]
            yield item
        except Exception:
            # Fix: the original did print(...) plus a bare ``yield`` (None),
            # and — because pagination lived inside the try — any parse
            # failure silently ended the whole crawl.  Log with traceback
            # and keep going.
            self.logger.exception("failed to parse koubei %s", koubei_id)
        # Pagination always advances, even after a bad response.
        self.series_index += 1
        if self.series_index < len(self.series_id_list):
            url = self.base_url % self.series_id_list[self.series_index][0]
            yield Request(url=url, callback=self.parse,
                          meta={'item': copy.deepcopy(item)}, dont_filter=True)
class AutoHomeKoubeiTagSpider(scrapy.Spider):
    """For each car series, scrapes the structured koubei tag summaries
    (tag id, combination text, summary key) from the series alibi JSON.
    """
    name = 'auto_home_koubei_tag'
    series_list = StructureStartUrl().get_series_id()
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/seriesalibiinfos-pm2-ss%s-st0-p1-s20-isstruct1-o0.json"
    series_index = 0
    start_urls = [base_url % series_list[series_index][0]]

    def parse(self, response):
        """Emit one item per non-zero summary, then schedule the next series."""
        # strict=False tolerates control characters embedded in the payload.
        content = json.loads(response.body.decode(), strict=False)
        result = content["result"]
        if len(result["structuredlist"]) > 0:
            for structure in result["structuredlist"]:
                for summary in structure["Summary"]:
                    if summary["SummaryKey"] != 0:
                        # Fix: fresh item per yield.  The original mutated one
                        # shared instance across both loops, so deferred
                        # pipeline steps could all see the last summary.
                        item = KoubeiTagItem()
                        item["series_id"] = self.series_list[self.series_index][0]
                        item["tag_id"] = structure["id"]
                        item["combination"] = summary["Combination"]
                        item["summary_key"] = summary["SummaryKey"]
                        yield item
        # Pagination: next series id.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
class AutoHomeKoubeiTagNumSpider(scrapy.Spider):
    """Scrapes per-tag koubei volume counts, reading only the first entry of
    ``structuredlist`` for each series (matching the original scope).
    """
    name = 'auto_home_koubei_tag_num'
    # Koubei data for series currently on sale.
    series_list = StructureStartUrl().get_series_id()
    series_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/seriesalibiinfos-pm2-ss%s-st0-p1-s20-isstruct1-o0.json"
    start_urls = [base_url % series_list[series_index][0]]

    def parse(self, response):
        """Emit one volume item per non-zero summary, then advance series."""
        content = json.loads(response.body.decode())
        result = content["result"]
        if len(result["structuredlist"]) > 0:
            # Only the first structured group is read.
            structure = result["structuredlist"][0]
            for summary in structure["Summary"]:
                if summary["SummaryKey"] != 0:
                    # Fix: fresh item per yield instead of one shared,
                    # repeatedly mutated instance.
                    item = KoubeiTagNumItem()
                    item["summary_key"] = summary["SummaryKey"]
                    item["volume"] = summary["Volume"]
                    item["time"] = get_current_date()
                    yield item
        # Pagination: next series id.
        self.series_index += 1
        if self.series_index < len(self.series_list):
            url = self.base_url % self.series_list[self.series_index][0]
            yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)
class AutoHomeKoubeiReadSpider(scrapy.spiders.Spider):
    """Collects view / helpful / comment counters for every koubei review id."""
    name = "koubei_read"
    koubei_list = StructureStartUrl().get_koubei_id()
    koubei_index = 0
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
    start_urls = [base_url % (koubei_list[koubei_index])]

    def parse(self, response):
        """Re-issue the first request so all parsing happens in one callback
        with an item carried through ``meta``."""
        seed = KoubeiReadScrapyItem()
        yield Request(url=response.url,
                      callback=self.parse_koubei_article_item,
                      meta={"item": copy.deepcopy(seed)},
                      dont_filter=True)

    def parse_koubei_article_item(self, response):
        """Fill the carried item with the review's counters, then schedule
        the next review id."""
        item = response.meta['item']
        result = json.loads(response.body.decode())["result"]
        item["koubei_id"] = result["eid"]
        item["visit_count"] = result["visitcount"]
        item["helpful_count"] = result["helpfulcount"]
        item["comment_count"] = result["commentcount"]
        item["time"] = get_current_date()
        yield item
        self.koubei_index += 1
        if self.koubei_index < len(self.koubei_list):
            next_url = self.base_url % self.koubei_list[self.koubei_index]
            yield Request(url=next_url,
                          callback=self.parse_koubei_article_item,
                          meta={"item": copy.deepcopy(item)},
                          dont_filter=True)
class AutoHomeKoubeiSpider(scrapy.spiders.Spider):
    """Walks the paginated koubei (review) listing for every series id,
    emitting one item per review.

    Pagination state is kept in class attributes: ``url_index`` selects the
    current series from ``series_id_list`` and ``page_index`` the page within
    that series.  The page count flows through ``item["page_num"]``.
    """
    name = "koubei"
    base_url = "https://koubei.app.autohome.com.cn/autov9.1.0/alibi/seriesalibiinfos-pm2-ss%s-st0-p%s-s20-isstruct0-o0.json"
    series_id_list = StructureStartUrl().get_series_id()
    # series_id_list = [(145,)]
    page_index = 1
    url_index = 0
    start_urls = [base_url % (series_id_list[url_index][0], page_index)]

    # Parse the koubei listing page: re-issue the request so that all work
    # happens in parse_koubei_item with an item carried via meta.
    def parse(self, response):
        item = KoubeiScrapyItem()
        yield Request(url=response.url, callback=self.parse_koubei_item,
                      meta={'item': copy.deepcopy(item)}, dont_filter=True)

    # Parse the review entries on one listing page, then either fetch the next
    # page of the same series or move on to the next series.
    def parse_koubei_item(self, response):
        item = response.meta['item']
        content = json.loads(response.body.decode())
        result = content["result"]
        # content_base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/alibi/NewEvaluationInfo.ashx?eid=%s&useCache=1"
        if len(result["list"]) > 0:
            for koubei in result["list"]:
                item["koubei_id"] = koubei["Koubeiid"]
                item["spec_id"] = koubei["specid"]
                item["user_id"] = koubei["userid"]
                # Price arrives as e.g. "12.88万"; strip the trailing unit.
                item["buy_price"] = koubei["buyprice"].rstrip("万")
                item["post_time"] = koubei["posttime"]
                item["page_num"] = content["result"]["pagecount"]
                yield item
                # detail_url = content_base_url % koubei["Koubeiid"]
                # yield Request(url=detail_url, callback=self.parse_koubei_details_item,
                #               meta={'item': copy.deepcopy(item)}, dont_filter=True)
        else:
            # Empty page: force the "series exhausted" branch below.
            item["page_num"] = 0
        self.page_index += 1
        # item["page_num"] still holds this series' page count (set in the
        # loop above, or 0 when the page was empty).
        if self.page_index <= item["page_num"]:
            # More pages for the current series.
            url = self.base_url % (self.series_id_list[self.url_index][0], self.page_index)
            yield Request(url=url, callback=self.parse_koubei_item,
                          meta={'item': copy.deepcopy(item)}, dont_filter=True)
        else:
            # Series exhausted: restart at page 1 of the next series, if any.
            self.url_index += 1
            if self.url_index < len(self.series_id_list):
                self.page_index = 1
                url = self.base_url % (self.series_id_list[self.url_index][0], self.page_index)
                yield Request(url=url, callback=self.parse_koubei_item,
                              meta={'item': copy.deepcopy(item)}, dont_filter=True)
class AutoHomeKoubeiCommentSpider(scrapy.spiders.Spider):
    """Scrapes the comment thread under each koubei review, following the
    lastid-based comment pagination (20 comments per page).
    """
    name = "koubei_comment"
    base_url = "https://koubei.app.autohome.com.cn/autov8.6.5/news/koubeicomments-pm2-n%s-s20-lastid%s.json"
    comment_list = StructureStartUrl().get_koubei_id()
    comment_index = 0
    last_id = 0
    # Ids of every comment seen so far; [-1] is the cursor for the next page.
    comment_id_list = []

    start_urls = [base_url % (comment_list[comment_index], last_id)]

    def parse(self, response):
        """Re-issue the first request so all parsing happens in one callback."""
        seed = KoubeiCommentScrapyItem()
        yield Request(url=response.url, callback=self.parse_koubei_comment_item,
                      meta={"item": copy.deepcopy(seed)}, dont_filter=True)

    def parse_koubei_comment_item(self, response):
        """Emit one item per comment, then follow comment pagination or move
        on to the next review id."""
        carry = response.meta['item']  # only propagated onward via meta
        result = json.loads(response.body.decode())["result"]
        koubei_id = self.comment_list[self.comment_index]
        carry["koubei_id"] = koubei_id
        if len(result["list"]) > 0:
            for comment in result["list"]:
                # Fix: one fresh item per comment (the original re-yielded a
                # single mutated instance), and the loop variable no longer
                # shadows the class attribute ``comment_list``.
                item = KoubeiCommentScrapyItem()
                item["koubei_id"] = koubei_id
                item["id"] = comment["id"]
                item["user_id"] = comment["nameid"]
                item["content"] = comment["content"]
                item["carname"] = comment["carname"]
                item["create_time"] = comment["time"]
                item["time"] = get_current_date()
                self.comment_id_list.append(str(comment["id"]))
                yield item
        # A full page (20 entries) whose pageid matches the newest seen id
        # means more comment pages exist for this review.
        if len(result["list"]) == 20 and (result["pageid"] == self.comment_id_list[-1]):
            url = self.base_url % (koubei_id, self.comment_id_list[-1])
            yield Request(url=url, callback=self.parse_koubei_comment_item,
                          meta={"item": copy.deepcopy(carry)}, dont_filter=True)
        else:
            # Current review exhausted: reset the cursor and start the next one.
            self.comment_index += 1
            if self.comment_index < len(self.comment_list):
                self.last_id = 0
                url = self.base_url % (self.comment_list[self.comment_index], self.last_id)
                yield Request(url=url, callback=self.parse_koubei_comment_item,
                              meta={"item": copy.deepcopy(carry)}, dont_filter=True)
class AutoHomeKoubeiRankSpider(scrapy.spiders.Spider):
    """Scrapes the koubei series ranking for every vehicle level/category id."""
    name = "koubei_rank"
    base_url = "https://koubei.app.autohome.com.cn/autov8.8.5/alibi/alibiseriesrank-pm2-categoryid%s-struct0-order0-price0.json"
    level_list = StructureStartUrl().get_level_id()
    url_index = 0
    # Fix: seed with the first real level id.  The original formatted the
    # *index* itself (producing "categoryid0"), and since pagination below
    # resumes at level_list[1], level_list[0] was never requested.
    start_urls = [base_url % level_list[url_index]]

    def parse(self, response):
        """Emit one rank item per series, then schedule the next level id."""
        result = json.loads(response.body.decode())["result"]
        for series in result["serieslist"]:
            # Fix: fresh item per series; the original mutated and re-yielded
            # one shared instance.
            item = KoubeiRankScrapyItem()
            item["level_id"] = result["categoryid"]
            item["series_id"] = series["seriesid"]
            item["koubei_rank"] = series["rank"]
            item["koubei_score"] = series["score"]
            item["koubei_evaluation_count"] = series["evaluationcount"]
            item["koubei_update_time"] = get_current_date()
            yield item
        # NOTE(review): time.sleep blocks the Twisted reactor; the
        # DOWNLOAD_DELAY setting is the proper throttle.  Kept to preserve
        # the existing crawl pacing.
        time.sleep(1)
        self.url_index += 1
        if self.url_index < len(self.level_list):
            url = self.base_url % self.level_list[self.url_index]
            yield Request(url=url, callback=self.parse)