Example #1
import json
import re
import time

import requests
import scrapy
from pydispatch import dispatcher
from scrapy import Request, signals
from selenium import webdriver

# geturl, Random_Headers, Random_Cookies and JdCommoditydetailItem
# are project helpers defined elsewhere in this crawler.

class inman(scrapy.Spider):

    name = 'inman'
    allowed_domains = ['jd.com']
    start_urls = [str(geturl(0, name))]
    count = 1
    """generation"""
    def __init__(self):
        # Disable image loading to speed up crawling
        opt = webdriver.ChromeOptions()
        prefs = {'profile.default_content_setting_values': {'images': 2}}
        opt.add_experimental_option('prefs', prefs)

        self.driver = webdriver.Chrome(options=opt)
        self.driver.implicitly_wait(10)
        super(inman, self).__init__()
        # Connect the spider_closed signal to CloseSpider so the
        # Chrome browser can be shut down when the crawl finishes
        dispatcher.connect(self.CloseSpider, signals.spider_closed)

    def CloseSpider(self, spider):
        print("spider closed")
        # Close the browser when the spider finishes
        self.driver.quit()

    def parse(self, response):
        url_list = geturl(1, self.name)
        for i in url_list:
            yield Request(url=i, meta={'url': i}, callback=self.parse_detail)

    def parse_detail(self, response):

        com_url = response.meta['url']
        title = response.css('div.sku-name::text').extract_first()
        n_price = response.css(
            'span.p-price span:nth-child(2)::text').extract_first()
        o_price = response.css('#page_dpprice::text').extract_first()
        product_params = str(
            response.css('div.p-parameter ul:nth-child(2) li::text').extract())

        # Request the comment statistics from a separate JSON endpoint
        match_re = re.match(r'.*?/(\d*)\.html', com_url)
        product_id = ''
        if match_re:
            product_id = match_re.group(1)
        else:
            print('Failed to extract the product id from the URL')
        # Current timestamp in milliseconds, used as a cache-busting parameter
        current_milli_time = int(round(time.time() * 1000))
        json_url = ('https://club.jd.com/comment/productCommentSummaries.action'
                    '?referenceIds={0}&callback=json&_={1}').format(
                        product_id, current_milli_time)
        headers = Random_Headers(product_id)
        cookies = Random_Cookies()

        # Retry until the endpoint returns a 2xx response,
        # backing off for five minutes between attempts
        while True:
            r = requests.get(url=json_url, headers=headers, cookies=cookies)
            if 200 <= r.status_code < 300:
                break
            time.sleep(300)

        # Strip the "json(...)" JSONP wrapper to get the raw JSON payload
        jscontent = re.search(r'^[^(]*?\((.*)\)[^)]*$', r.text).group(1)
        jsdict = json.loads(jscontent)

        comment_nums = jsdict['CommentsCount'][0]['CommentCountStr']
        good = jsdict['CommentsCount'][0]['GoodCountStr']
        good_rate = jsdict['CommentsCount'][0]['GoodRate']
        general = jsdict['CommentsCount'][0]['GeneralCountStr']
        general_rate = jsdict['CommentsCount'][0]['GeneralRate']
        poor = jsdict['CommentsCount'][0]['PoorCountStr']
        poor_rate = jsdict['CommentsCount'][0]['PoorRate']
        average_score = jsdict['CommentsCount'][0]['AverageScore']
        DefaultGoodCount = jsdict['CommentsCount'][0]['DefaultGoodCountStr']

        item = JdCommoditydetailItem()
        item['title'] = title
        item['n_price'] = n_price
        item['o_price'] = o_price
        item['comment_nums'] = comment_nums
        item['product_params'] = product_params
        item['shop_name'] = self.name
        item['good_rate'] = good_rate
        item['general_rate'] = general_rate
        item['com_url'] = com_url
        item['good'] = good
        item['general'] = general
        item['poor'] = poor
        item['poor_rate'] = poor_rate
        item['average_score'] = average_score
        item['DefaultGoodCount'] = DefaultGoodCount

        yield item
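
The comment-summary call above is a JSONP endpoint: the server wraps its JSON payload in a callback, so the body has to be unwrapped with a regex before json.loads(). A minimal standalone sketch of just that technique, with a hypothetical fetch_comment_summary helper (the anti-bot headers and cookies from the spider are omitted):

import json
import re
import time

import requests

def fetch_comment_summary(product_id):
    # Millisecond timestamp as a cache-busting query parameter
    timestamp = int(round(time.time() * 1000))
    url = ('https://club.jd.com/comment/productCommentSummaries.action'
           '?referenceIds={0}&callback=json&_={1}').format(product_id, timestamp)
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    # Strip the "json(...)" wrapper and parse the remaining JSON
    payload = re.search(r'^[^(]*?\((.*)\)[^)]*$', r.text).group(1)
    return json.loads(payload)['CommentsCount'][0]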
Example #2
import scrapy
from pydispatch import dispatcher
from scrapy import Request, signals
from selenium import webdriver

# geturl and JdCommoditydetailItem are project helpers defined elsewhere.

class moco(scrapy.Spider):

    name = 'moco'
    allowed_domains = ['jd.com']
    start_urls = [str(geturl(0))]
    """generation"""
    def __init__(self):
        # Disable image loading to speed up crawling
        opt = webdriver.ChromeOptions()
        prefs = {'profile.default_content_setting_values': {'images': 2}}
        opt.add_experimental_option('prefs', prefs)

        self.driver = webdriver.Chrome(options=opt)
        self.driver.implicitly_wait(10)
        super(moco, self).__init__()
        # Connect the spider_closed signal to CloseSpider so the
        # Chrome browser can be shut down when the crawl finishes
        dispatcher.connect(self.CloseSpider, signals.spider_closed)

    def CloseSpider(self, spider):
        print("spider closed")
        # Close the browser when the spider finishes
        self.driver.quit()

    def parse(self, response):
        url_list = geturl(1)
        for i in url_list:
            yield Request(url=i, meta={'url': i}, callback=self.parse_detail)

    def parse_detail(self, response):
        title = response.css('div.sku-name::text').extract_first()
        n_price = response.css(
            'span.p-price span:nth-child(2)::text').extract_first()
        o_price = response.css('#page_dpprice::text').extract_first()
        comment_nums = response.css(
            'div#comment-count a::text').extract_first()
        product_params = str(
            response.css('div.p-parameter ul:nth-child(2) li::text').extract())
        fav_rate = response.css('div.percent-con::text').extract_first()
        tag = str(response.css('div.percent-info span::text').extract())
        com_url = response.meta['url']
        good = response.css('li[clstag$="haoping"] em::text').extract_first()
        mid = response.css('li[clstag$="zhongping"] em::text').extract_first()
        bad = response.css('li[clstag$="chaping"] em::text').extract_first()
        po_tu = response.css(
            'li[clstag$="shaidantab"] em::text').extract_first()

        item = JdCommoditydetailItem()
        item['title'] = title
        item['n_price'] = n_price
        item['o_price'] = o_price
        item['comment_nums'] = comment_nums
        item['product_params'] = product_params
        item['shop_name'] = self.name
        item['fav_rate'] = fav_rate
        item['tag'] = tag
        item['com_url'] = com_url
        item['good'] = good
        item['mid'] = mid
        item['bad'] = bad
        item['po_tu'] = po_tu
        yield item
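
Both spiders populate a JdCommoditydetailItem whose definition is not part of these examples. A minimal sketch, assuming it is a plain scrapy.Item with one Field per key assigned above:

import scrapy

class JdCommoditydetailItem(scrapy.Item):
    # Fields set by the 'inman' spider
    title = scrapy.Field()
    n_price = scrapy.Field()
    o_price = scrapy.Field()
    comment_nums = scrapy.Field()
    product_params = scrapy.Field()
    shop_name = scrapy.Field()
    com_url = scrapy.Field()
    good = scrapy.Field()
    good_rate = scrapy.Field()
    general = scrapy.Field()
    general_rate = scrapy.Field()
    poor = scrapy.Field()
    poor_rate = scrapy.Field()
    average_score = scrapy.Field()
    DefaultGoodCount = scrapy.Field()
    # Extra fields set by the 'moco' spider
    fav_rate = scrapy.Field()
    tag = scrapy.Field()
    mid = scrapy.Field()
    bad = scrapy.Field()
    po_tu = scrapy.Field()

Either spider is then run from the project root with scrapy crawl inman or scrapy crawl moco.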