Example #1
    def parse(self, response):
        # print response.body
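        # Dianping serves a "页面不存在" ("page does not exist") anti-crawler page
        # instead of the review list; in that case retry the same URL with
        # freshly generated cookies and a random User-Agent.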
        if '页面不存在' in response.body:
            Cookie = {
                '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % int(time.time()),
                'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                's_ViewType': '10',
                'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
            }
            header['User-Agent'] = random.choice(ua_list)
            yield Request(response.url,
                          callback=self.parse,
                          headers=header,
                          cookies=Cookie,
                          dont_filter=True)
        else:
            sel = Selector(response)
            detail_list = sel.xpath('//div[@class="reviews-items"]/ul/li')
            if detail_list:
                for detail in detail_list:
                    item = DianpingcommentItem()
                    comment_id = detail.xpath(
                        './div//span[@class="actions"]/a/@data-id').extract()
                    if comment_id:
                        comment_id = comment_id[0]
                    item['comment_id'] = comment_id
                    shop_id = ''.join(
                        re.findall('com/shop/(.*?)/review', response.url))
                    item['shop_id'] = shop_id
                    href = ''.join(
                        detail.xpath('./a/@href').extract()).strip().replace(
                            '\n', '')
                    name = ''.join(
                        detail.xpath(
                            './div[@class="main-review"]/div/a[@class="name"]/text()'
                        ).extract()).strip().replace('\n', '')
                    # print href
                    item['user_name'] = name
                    user_id = href.replace('/member/',
                                           '').strip().replace('\n', '')
                    item['user_id'] = user_id
                    total_score = ''.join(
                        detail.xpath(
                            './div[@class="main-review"]/div[@class="review-rank"]/span[1]/@class'
                        ).extract()).strip().replace('\n', '')
                    if not total_score:
                        total_score = ''.join(
                            detail.xpath(
                                './div[@class="content"]/p[@class="shop-info"]/span[1]/@class'
                            ).extract()).strip().replace('\n', '')
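                    # The star rating is encoded in the CSS class, e.g.
                    # 'sml-rank-stars sml-str40 star' -> '40'; strip the prefix
                    # and suffix and divide by 10 to get the 0-5 score.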
                    total_score = total_score.replace('sml-rank-stars sml-str',
                                                      '').replace(' star', '')
                    if total_score:
                        total_score = int(total_score) / 10
                    item['total_score'] = total_score
                    scores = detail.xpath(
                        './div[@class="main-review"]//span[@class="score"]/span/text()'
                    ).extract()
                    if scores:
                        if len(scores) == 3:
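                            # Each sub-score text ends in a single digit: the
                            # trailing character is the numeric score, the rest
                            # is the dimension name.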

                            score1 = scores[0].replace('\n',
                                                       '').replace(' ', '')
                            score2 = scores[1].replace('\n',
                                                       '').replace(' ', '')
                            score3 = scores[2].replace('\n',
                                                       '').replace(' ', '')
                            score1_name = score1[:-1]
                            score1 = score1[-1:]
                            item['score1_name'] = score1_name
                            item['score1'] = score1

                            score2_name = score2[:-1]
                            score2 = score2[-1:]
                            item['score2_name'] = score2_name
                            item['score2'] = score2

                            score3_name = score3[:-1]
                            score3 = score3[-1:]
                            item['score3_name'] = score3_name
                            item['score3'] = score3
                    else:
                        item['score1_name'] = ''
                        item['score2_name'] = ''
                        item['score3_name'] = ''
                        item['score1'] = 0
                        item['score2'] = 0
                        item['score3'] = 0
                    comment_txt = ''.join(
                        detail.xpath(
                            './div[@class="main-review"]/div[@class="review-words"]/text()|./div[@class="main-review"]/div[@class="review-words Hide"]/text()'
                        ).extract()).strip().replace('\n', '')
                    item['comment_text'] = comment_txt
                    comment_dt = ''.join(
                        detail.xpath(
                            './div[@class="main-review"]//span[@class="time"]/text()'
                        ).extract()).strip().replace('\n', '')
                    if comment_dt:
                        comment_dt = comment_dt.replace(u'更新于', '')
                        comment_dt = comment_dt.replace('\n', '').replace(
                            '\r', '').replace('\t', '').strip()
                        comment_dt = comment_dt.split(u'\xa0')
                        if comment_dt:
                            comment_dt = comment_dt[0]
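                        # Normalize the date: 'MM-dd' (5 chars) gets the year
                        # prefixed (this scrape assumes 2017), 'yy-MM-dd'
                        # (8 chars) gets the century prefixed, and any trailing
                        # time part is dropped.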

                        if len(comment_dt) == 5:
                            comment_dt = '2017-' + comment_dt
                        elif len(comment_dt) == 8:
                            comment_dt = '20' + comment_dt
                        if ' ' in comment_dt:
                            comment_dt = comment_dt.split(' ')[0]
                    else:
                        comment_dt = ''.join(
                            detail.xpath(
                                './div[@class="content"]/div[@class="misc-info"]/span/a[@class="time"]/text()'
                            ).extract()).strip().replace('\n', '')

                    item['comment_dt'] = comment_dt
                    contribution = ''.join(
                        detail.xpath(
                            './div[@class="pic"]/p[@class="contribution"]/span/@title'
                        ).extract()).strip().replace('\n', '')
                    contribution = contribution.replace('贡献值', '').strip()
                    item['user_contrib_val'] = contribution
                    # try:
                    #     db_insert.insert('t_hh_dianping_shop_comments', **item)
                    # except:
                    #     pass
                    yield item
                # next_page = sel.xpath('//a[@class="NextPage"]/@href')
                # if next_page:
                #     next_page = ''.join(next_page.extract())
                #     next_page = urljoin(response.url, next_page)
                #     print next_page
                #     Cookie = {'_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % (int(time.time())),
                #               'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                #               's_ViewType': '10',
                #               'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                #               }
                #     header['User-Agent'] = random.choice(ua_list)
                #     yield Request(next_page, errback=self.parse_failure,
                #                   callback=self.parse, headers=header, cookies=Cookie, dont_filter=True, )
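                # Pagination: only page 1 (review_all/p1) reads the last page
                # number from the 'PageLink' anchors and enqueues
                # review_all/p2 .. review_all/pN directly.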
                page_now = response.url.split('review_all/p')
                if int(page_now[1]) == 1:
                    next_page = sel.xpath(
                        '//a[@class="PageLink"][last()]/@data-pg').extract()
                    if next_page:
                        print next_page
                        next_page = ''.join(next_page)
                        for i in xrange(2, int(next_page) + 1):
                            next_page = '%sreview_all/p%s' % (page_now[0], i)
                            # print next_page
                            Cookie = {
                                '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % int(time.time()),
                                'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                                's_ViewType': '10',
                                'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                            }
                            header['User-Agent'] = random.choice(ua_list)
                            print next_page
                            yield Request(
                                next_page,
                                callback=self.parse,
                                headers=header,
                                cookies=Cookie,
                                dont_filter=True,
                            )
            else:
                print response.body
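
The excerpt above relies on a few names that are not included in it: a module-level header dict, a ua_list of User-Agent strings, and a private __md5sum helper used to fabricate a JSESSIONID. A minimal sketch of what they might look like follows; only the names come from the call sites, while the implementations, the class name, the ua_list entry, and the make_cookies helper are illustrative assumptions.

import hashlib
import time

# Illustrative stand-ins for the module-level names used in the excerpt; the
# real module defines its own User-Agent pool and base request headers.
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    ' (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
]
header = {}  # 'User-Agent' is filled in per request inside parse()


class DianpingCommentSpiderSketch(object):
    # Assumed implementation of the private __md5sum helper: the excerpt only
    # shows the call site, which feeds it the current timestamp and uses the
    # digest as a fake JSESSIONID cookie value.
    def __md5sum(self, text):
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    # Hypothetical helper that builds the cookie dict repeated throughout the
    # excerpt (the cookie keys and values are copied from the snippet).
    def make_cookies(self):
        return {
            '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % int(time.time()),
            'JSESSIONID': self.__md5sum('%s' % time.time()),
            's_ViewType': '10',
            'PHOENIX_ID': '0a0102f1-15c114151d0-436b141',
        }
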
Example #2
    def parse(self, response):
        # print response.body
        if '页面不存在' in response.body:
            Cookie = {
                '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % int(time.time()),
                'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                's_ViewType': '10',
                'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
            }
            yield Request(response.url,
                          errback=self.parse_failure,
                          callback=self.parse,
                          headers=header,
                          cookies=Cookie,
                          dont_filter=True)
        else:
            sel = Selector(response)
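            # This variant parses the 'comment-list' page layout
            # (Example #1 above targets the 'reviews-items' layout).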
            detail_list = sel.xpath('//div[@class="comment-list"]/ul/li')
            if detail_list:
                for detail in detail_list:
                    item = DianpingcommentItem()
                    comment_id = ''.join(detail.xpath('@data-id').extract())
                    item['comment_id'] = comment_id
                    shop_id = ''.join(
                        re.findall('com/shop/(.*?)/review', response.url))
                    item['shop_id'] = shop_id
                    href = ''.join(
                        detail.xpath(
                            './div[@class="pic"]/p[@class="name"]/a/@href').
                        extract()).strip().replace('\n', '')
                    name = ''.join(
                        detail.xpath(
                            './div[@class="pic"]/p[@class="name"]/a/text()').
                        extract()).strip().replace('\n', '')
                    # print href
                    item['user_name'] = name
                    user_id = href.replace('/member/',
                                           '').strip().replace('\n', '')
                    item['user_id'] = user_id
                    total_score = ''.join(
                        detail.xpath(
                            './div[@class="content"]/div[@class="user-info"]/span[1]/@class'
                        ).extract()).strip().replace('\n', '')
                    if not total_score:
                        total_score = ''.join(
                            detail.xpath(
                                './div[@class="content"]/p[@class="shop-info"]/span[1]/@class'
                            ).extract()).strip().replace('\n', '')
                    total_score = total_score.replace('item-rank-rst irr-star',
                                                      '')
                    if total_score:
                        total_score = int(total_score) / 10
                    item['total_score'] = total_score
                    scores = detail.xpath(
                        './div[@class="content"]/div[@class="user-info"]/div/span/text()'
                    ).extract()
                    if scores:
                        if len(scores) == 3:
                            score1 = scores[0]
                            score2 = scores[1]
                            score3 = scores[2]
                            score1_name = score1[:-1]
                            score1 = score1[-1:]
                            item['score1_name'] = score1_name
                            item['score1'] = score1

                            score2_name = score2[:-1]
                            score2 = score2[-1:]
                            item['score2_name'] = score2_name
                            item['score2'] = score2

                            score3_name = score3[:-1]
                            score3 = score3[-1:]
                            item['score3_name'] = score3_name
                            item['score3'] = score3
                    comment_txt = ''.join(
                        detail.xpath(
                            './div[@class="content"]/div[@class="comment-txt"]/div/text()'
                        ).extract()).strip().replace('\n', '')
                    item['comment_text'] = comment_txt
                    comment_dt = ''.join(
                        detail.xpath(
                            './div[@class="content"]/div[@class="misc-info"]/span/text()'
                        ).extract()).strip().replace('\n', '')
                    if comment_dt:
                        comment_dt = comment_dt.split(u'\xa0')
                        if comment_dt:
                            comment_dt = comment_dt[0]

                        if len(comment_dt) == 5:
                            comment_dt = '2017-' + comment_dt
                        elif len(comment_dt) == 8:
                            comment_dt = '20' + comment_dt
                    item['comment_dt'] = comment_dt
                    contribution = ''.join(
                        detail.xpath(
                            './div[@class="pic"]/p[@class="contribution"]/span/@title'
                        ).extract()).strip().replace('\n', '')
                    contribution = contribution.replace('贡献值', '').strip()
                    item['user_contrib_val'] = contribution
                    # try:
                    #     db_insert.insert('t_hh_dianping_shop_comments', **item)
                    # except:
                    #     pass
                    yield item
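                # Pagination: follow the 'NextPage' link with a relative href
                # instead of enumerating page numbers as in Example #1.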
                next_page = sel.xpath('//a[@class="NextPage"]/@href')
                if next_page:
                    next_page = ''.join(next_page.extract())
                    next_page = urljoin(response.url, next_page)
                    print next_page
                    Cookie = {
                        '_hc.v': '9c20f3b2-274d-8559-c306-1785c4b96ebc.%s;' % int(time.time()),
                        'JSESSIONID': '%s' % self.__md5sum("%s" % time.time()),
                        's_ViewType': '10',
                        'PHOENIX_ID': '0a0102f1-15c114151d0-436b141'
                    }
                    yield Request(
                        next_page,
                        errback=self.parse_failure,
                        callback=self.parse,
                        headers=header,
                        cookies=Cookie,
                        dont_filter=True,
                    )
            else:
                print response.body
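
Both requests in Example #2 (and the commented-out pagination block in Example #1) register errback=self.parse_failure, which is not part of the excerpt. A minimal sketch of such an errback, assuming it only needs to make failed requests visible in the crawl log, could look like the following; the spider class, its name attribute, and the implementation are assumptions, only the method name comes from the snippet.

import scrapy


class DianpingCommentSpiderSketch(scrapy.Spider):
    name = 'dianping_comment_sketch'  # hypothetical spider name

    def parse_failure(self, failure):
        # Assumed errback body: log the Twisted Failure so failed requests
        # (timeouts, DNS errors, HTTP errors) are not silently dropped.
        self.logger.error('Request failed: %s', repr(failure))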