class inman(scrapy.Spider):
    """Spider for the 'inman' JD.com shop.

    Crawls product detail pages, then queries JD's comment-summary JSONP
    endpoint for rating statistics, yielding one JdCommoditydetailItem
    per product.
    """

    name = 'inman'
    allowed_domains = ['jd.com']
    start_urls = [str(geturl(0, name))]
    count = 1
    """generation"""

    def __init__(self):
        # Disable image loading to speed up crawling.
        opt = webdriver.ChromeOptions()
        prefs = {'profile.default_content_setting_values': {'images': 2}}
        opt.add_experimental_option('prefs', prefs)
        # 'chrome_options' is deprecated (removed in Selenium 4);
        # 'options' is accepted by Selenium 3.8+ and 4.x alike.
        self.driver = webdriver.Chrome(options=opt)
        self.driver.implicitly_wait(10)
        super(inman, self).__init__()
        # Hook the spider_closed signal so the browser shuts down cleanly.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)

    def CloseSpider(self, spider):
        """Quit the Selenium browser when the spider finishes."""
        print("spider closed")
        self.driver.quit()

    def parse(self, response):
        """Schedule one detail-page request per product URL from geturl()."""
        url_list = geturl(1, self.name)
        for i in url_list:
            yield Request(url=i, meta={'url': i}, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract product fields from the page, fetch comment statistics
        from JD's JSONP summary endpoint, and yield the combined item."""
        com_url = response.meta['url']
        title = response.css('div.sku-name::text').extract_first()
        n_price = response.css(
            'span.p-price span:nth-child(2)::text').extract_first()
        o_price = response.css('#page_dpprice::text').extract_first()
        product_params = str(
            response.css('div.p-parameter ul:nth-child(2) li::text').extract())
        # The comment statistics live behind a separate JSONP endpoint keyed
        # by the numeric product id embedded in the page URL.
        # Raw string avoids the invalid-escape-sequence warning on '\d'.
        match_re = re.match(r'.*?/(\d*).html', com_url)
        # Renamed from 'id' so the builtin is not shadowed.
        product_id = ''
        if match_re:
            product_id = match_re.group(1)
        else:
            print('正则提取商品id出错')
        # Millisecond timestamp used as a cache-busting query parameter.
        current_milli_time = int(round(time.time() * 1000))
        json_url = ('https://club.jd.com/comment/productCommentSummaries.action'
                    '?referenceIds={0}&callback=json&_={1}').format(
                        product_id, current_milli_time)
        headers = Random_Headers(product_id)
        cookies = Random_Cookies()
        # Retry on non-2xx responses, but give up after a bounded number of
        # attempts instead of looping forever as the original did.
        r = None
        for _ in range(10):
            r = requests.get(url=json_url, headers=headers, cookies=cookies)
            if 200 <= r.status_code < 300:
                break
            time.sleep(300)
        else:
            print('评论接口请求失败,放弃该商品: {0}'.format(json_url))
            return
        # Strip the JSONP wrapper "json(...)" to get the raw JSON payload.
        jscontent = re.search(r'^[^(]*?\((.*)\)[^)]*$', r.text).group(1)
        jsdict = json.loads(jscontent)
        # Hoist the repeated ['CommentsCount'][0] lookup.
        stats = jsdict['CommentsCount'][0]
        comment_nums = stats['CommentCountStr']
        good = stats['GoodCountStr']
        good_rate = stats['GoodRate']
        general = stats['GeneralCountStr']
        general_rate = stats['GeneralRate']
        poor = stats['PoorCountStr']
        # BUG FIX: the original read 'PoorCount' here, storing a count in a
        # rate field; good_rate/general_rate read the *Rate keys, and the JD
        # summary payload exposes 'PoorRate' alongside them.
        poor_rate = stats['PoorRate']
        average_score = stats['AverageScore']
        DefaultGoodCount = stats['DefaultGoodCountStr']
        item = JdCommoditydetailItem()
        item['title'] = title
        item['n_price'] = n_price
        item['o_price'] = o_price
        item['comment_nums'] = comment_nums
        item['product_params'] = product_params
        item['shop_name'] = self.name
        item['good_rate'] = good_rate
        item['general_rate'] = general_rate
        item['com_url'] = com_url
        item['good'] = good
        item['general'] = general
        item['poor'] = poor
        item['poor_rate'] = poor_rate
        item['average_score'] = average_score
        item['DefaultGoodCount'] = DefaultGoodCount
        yield item
# NOTE(review): this duplicates inman.parse verbatim. In this collapsed chunk
# it is unclear whether it sits inside the class (redefining and shadowing the
# earlier method) or at module level (where 'self' would be meaningless) —
# most likely a copy/paste leftover. Confirm against the full file and remove
# one copy.
def parse(self, response):
    # Build one detail-page request per URL returned by geturl(); the
    # original URL is carried in meta for parse_detail.
    url_list = geturl(1, self.name)
    for i in url_list:
        yield Request(url=i, meta={'url': i}, callback=self.parse_detail)
class moco(scrapy.Spider):
    """Spider for the 'moco' JD.com shop.

    Crawls product detail pages and extracts price, parameters, and
    review statistics directly from the page CSS, yielding one
    JdCommoditydetailItem per product.
    """

    name = 'moco'
    allowed_domains = ['jd.com']
    # NOTE(review): inman calls geturl(0, name) but this spider omits the
    # name argument — confirm against geturl's definition whether that is
    # intentional.
    start_urls = [str(geturl(0))]
    """generation"""

    def __init__(self):
        # Disable image loading to speed up crawling.
        opt = webdriver.ChromeOptions()
        prefs = {'profile.default_content_setting_values': {'images': 2}}
        opt.add_experimental_option('prefs', prefs)
        # 'chrome_options' is deprecated (removed in Selenium 4);
        # 'options' is accepted by Selenium 3.8+ and 4.x alike.
        self.driver = webdriver.Chrome(options=opt)
        self.driver.implicitly_wait(10)
        super(moco, self).__init__()
        # Hook the spider_closed signal so the browser shuts down cleanly.
        dispatcher.connect(self.CloseSpider, signals.spider_closed)

    def CloseSpider(self, spider):
        """Quit the Selenium browser when the spider finishes."""
        print("spider closed")
        self.driver.quit()

    def parse(self, response):
        """Schedule one detail-page request per product URL from geturl()."""
        url_list = geturl(1)
        for i in url_list:
            yield Request(url=i, meta={'url': i}, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract product and review fields from the page and yield them."""
        title = response.css('div.sku-name::text').extract_first()
        n_price = response.css(
            'span.p-price span:nth-child(2)::text').extract_first()
        o_price = response.css('#page_dpprice::text').extract_first()
        comment_nums = response.css(
            'div#comment-count a::text').extract_first()
        product_params = str(
            response.css('div.p-parameter ul:nth-child(2) li::text').extract())
        fav_rate = response.css('div.percent-con::text').extract_first()
        tag = str(response.css('div.percent-info span::text').extract())
        com_url = response.meta['url']
        # Review tab counters are addressed by the clstag suffix of each tab.
        good = response.css('li[clstag$="haoping"] em::text').extract_first()
        mid = response.css('li[clstag$="zhongping"] em::text').extract_first()
        bad = response.css('li[clstag$="chaping"] em::text').extract_first()
        po_tu = response.css(
            'li[clstag$="shaidantab"] em::text').extract_first()
        item = JdCommoditydetailItem()
        item['title'] = title
        item['n_price'] = n_price
        item['o_price'] = o_price
        item['comment_nums'] = comment_nums
        item['product_params'] = product_params
        item['shop_name'] = self.name
        item['fav_rate'] = fav_rate
        item['tag'] = tag
        item['com_url'] = com_url
        item['good'] = good
        item['mid'] = mid
        item['bad'] = bad
        item['po_tu'] = po_tu
        yield item