def parse(self, response):
    self.driver.get(response.url)
    # time.sleep(5)
    # content = self.driver.page_source
    # print("爬取的内容如下:" + content)
    # selector = Selector(text=content)
    selector = Selector(response)
    # bigTitle = selector.xpath('//div[@class="hd"]/h2/text()').extract()
    # self.getBigTitle(selector)
    # self.getSmallTitle(selector)
    myContent = selector.xpath(
        '//div[@class="WordSection1"]/p[@class="MsoNormal"]/span//text()').extract()
    i = 0
    isTitle = False
    space = "\r\n\n\t"
    space1 = "\r\n"
    content1 = ""
    # self.singleText(content1, i, isTitle, myContent, space)
    for line in myContent:
        if isTitle:
            content1 += "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + space1
            content1 += "当前的问题是:" + line + space1
            content1 += "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + space
            isTitle = False
            continue
        if Utils.matchTitle(line):
            i += 1
            if i > 10:
                break
            content1 += "______________________" + space1
            content1 += line + "---------------" + space1
            content1 += "______________________" + space1
            isTitle = True
            continue
        if not isTitle:  # the original `~isTitle` is a bitwise NOT and is always truthy
            l = line
            # for l in line:
            if Utils.matchTitle(l):
                # content1 += line
                content1 += space
                continue
            content1 += l
            endChar = l[-1]
            if Utils.isEndChar(endChar):
                content1 += space
    print(content1)
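The parse() above leans on a Utils helper that is not shown here. As an assumption only (not the original implementation), matchTitle and isEndChar could look something like this, treating numbered headings as titles and common sentence-ending punctuation as end characters:

import re

class Utils:
    # Hypothetical pattern: lines starting with "1." / "1、" / "一、" style numbering count as titles.
    TITLE_RE = re.compile(r'^\s*([0-9]+|[一二三四五六七八九十]+)[..、]')

    @staticmethod
    def matchTitle(line):
        return bool(Utils.TITLE_RE.match(line))

    @staticmethod
    def isEndChar(ch):
        # Hypothetical set of sentence-ending characters.
        return ch in '。!?;.!?;'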
def parse(self, response):
    shop_id = response.url.split('/')[-1].strip()
    shop_url = response.url
    data = Selector(response).xpath('//div[@class="page-main"]')
    main = Selector(response).xpath('//div[@class="market-main"]')
    detail = Selector(response).xpath('//div[@class="market-detail"]')
    detail_other = Selector(response).xpath(
        '//div[@class="market-detail-other Hide"]')
    navigator_div = Selector(response).xpath('//div[@class="breadcrumb"]')

    location = []
    for loc in navigator_div.xpath('b/a/span'):
        location.append(loc.xpath('text()').extract()[0].strip())
    shop_navigation_path = '>'.join(location)
    print(shop_navigation_path)

    shop_name = Selector(response).xpath(
        '//div[@class="shop-name"]/h1/text()').extract()[0].strip()
    print(shop_name)
    shop_district = Selector(response).xpath(
        '//span[@class="region"]/text()').extract()[0].strip()
    print(shop_district)
    shop_address = Selector(response).xpath(
        '//span[@itemprop="street-address"]/text()').extract()[0].strip()
    print(shop_address)

    shop_phone_1 = None
    shop_phone_2 = None
    shop_rank = Selector(response).xpath(
        '//div[@class="comment-rst"]/span/@title').extract()[0].strip()
    shop_taste_score = None
    shop_env_score = None
    shop_service_score = None
    shop_price = Selector(response).xpath(
        '//div[@class="comment-rst"]/dl/dd/text()').extract()[0].strip()
    shop_review = Selector(response).xpath(
        '//div[@class="comment-rst"]/a/span/text()').extract()[0].strip()
    print(shop_phone_1)
    print(shop_rank)
    print(shop_taste_score, shop_env_score, shop_service_score,
          shop_price, shop_review)
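Every field above is read with .extract()[0], which raises IndexError as soon as one node is missing from the page. A small helper along these lines (not part of the original spider) keeps the same extraction but tolerates absent fields:

from scrapy import Selector


def first_or_none(selector, xpath):
    """Return the first stripped match for `xpath`, or None when nothing matches."""
    values = selector.xpath(xpath).extract()
    return values[0].strip() if values else None

# e.g. shop_name = first_or_none(Selector(response), '//div[@class="shop-name"]/h1/text()')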
def detial_parse(self, response): autohomeforumItem = response.meta['autohomeforumItem'] selector = Selector(response) mainBody = selector.xpath("//div[@id='cont_main']") main_topic = mainBody.xpath("./div[@id='maxwrap-maintopic']") detailContent = autohomeforumItem['contents'] topic_text = main_topic.xpath(".//div[contains(@class,'conttxt')]") topic_text = topic_text.xpath("string(.)").extract_first() # 检查是否有ttf cmp = re.compile(",url\('(//.*.ttf)'\) format\('woff'\)") rst = cmp.findall(response.body.decode('utf-8')) if rst: self.loopGet(self.savefont, rst[0]) currentPage = int(re.findall('-(\d+)\.html', response.url)[0]) if currentPage == 1: imageRecognizer = ImageRecognizer(orignText=topic_text, orignFont='temp.ttf') try: topic_text = ' '.join(imageRecognizer.outterCall().replace( '\n', '').split()) detailContent.append( [' '.join(topic_text.replace('\n', '').split()), '楼主']) except Exception as e: print(e) main_replyList = mainBody.xpath("./div[@id='maxwrap-reply']/div") for replyItem in main_replyList: try: tempList = dict() floor = replyItem.xpath(".//button/text()").extract_first() authorId = replyItem.xpath(".//a[@xname='uname']/@href" ).extract_first().split('/')[-2] authorName = replyItem.xpath( ".//a[@xname='uname']/text()").extract_first().strip() replyWho = replyItem.xpath( ".//div[@class='relyhfcon']//a[2]/text()") publistTime = replyItem.xpath( ".//span[@xname='date']/text()").extract_first() tempList['publishTime'] = publistTime if replyWho: # 如果是回复某楼层的,则tempList第一个是内容,第二个是楼层 thisContent = replyItem.xpath( ".//div[@class = 'yy_reply_cont']") thisContent = thisContent.xpath( "string(.)").extract_first() if rst: try: imageRecognizer = ImageRecognizer( orignText=thisContent, orignFont='temp.ttf') thisContent = ' '.join( imageRecognizer.outterCall().replace( '\n', '').split()) tempList['thisContent'] = ' '.join( thisContent.replace('\n', '').split()) tempList['replyWho'] = ' '.join( replyWho.extract_first().replace('\n', '').split()) except Exception as e: print(e) else: thisContent = replyItem.xpath( ".//div[contains(@class,'x-reply')]") thisContent = thisContent.xpath( "string(.)").extract_first() if rst: try: imageRecognizer = ImageRecognizer( orignText=thisContent, orignFont='temp.ttf') thisContent = ' '.join( imageRecognizer.outterCall().replace( '\n', '').split()) tempList['thisContent'] = ' '.join( thisContent.replace('\n', '').split()) tempList['replyWho'] = '楼主' except Exception as e: print(e) tempList['floor'] = floor tempList['authorId'] = authorId tempList['authorName'] = authorName detailContent.append(tempList) except Exception as e: print(e) autohomeforumItem['scrapyTime'] = time.strftime( "%Y-%m-%d %H:%M:%S", time.localtime()) autohomeforumItem['contents'] = detailContent nextPageUrl = selector.xpath("//a[text()='下一页']/@href").extract_first() if nextPageUrl is not None: yield SplashRequest(url=self.baseUrl + nextPageUrl, callback=self.detial_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta={'autohomeforumItem': autohomeforumItem}) else: yield autohomeforumItem
def company_info(self, response): # company_data = {'unit_type': response.meta['unit_type'], 'city': '', # 'start_date': '', 'number': '', 'authority': '', 'type_of_registration': '', # 'business_area': '', 'security_number': '', 'capital': '', 'unit_property': '', # 'social_registration': '', 'registered_address': '', 'registered__postal_code': '', # 'business_address': '', 'business_postal_number': '', 'legal_person': '', # 'website': '', # } company_name = Selector(response=response).xpath( '//td[@colspan="3"]')[0].xpath('./a/@title').extract_first() # company_data['company_name'] = company_name # # test = self.r.sadd('title_name1', company_name) # unit_property = Selector(response=response).xpath( # '//td[@style="width: 350px;padding-top: 9px;"]/text()').extract_first() # if unit_property.split(): # unit_property = unit_property.split()[0] # company_data['unit_property'] = unit_property # # capital = Selector(response=response).xpath('//td[@colspan="3"]')[2].xpath('text()').extract_first() # if capital is not None: # if capital != '/': # company_data['capital'] = capital + '万元' # # city = Selector(response=response).xpath('//td[@colspan="3"]')[1].xpath('text()').extract_first() # if city.split(): # city = city.split()[0] # company_data['city'] = city # # start_company_data = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 3].xpath('text()').extract_first() # if start_company_data.split(): # start_company_data = start_company_data.split()[0] # company_data['start_date'] = start_company_data # # number = Selector(response=response).xpath('//td[@colspan="3"]')[3].xpath( # 'text()').extract_first() # if number.split(): # number = number.split()[0] # company_data['number'] = number # # authority = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[5].xpath( # 'text()').extract_first() # if authority is not None: # authority = authority.split()[0] # if authority != '/': # company_data['authority'] = authority # # type_of_registration = Selector(response=response).xpath('//td[@colspan="5"]')[0].xpath( # 'text()').extract_first() # if type_of_registration.split(): # type_of_registration = type_of_registration.split()[0] # company_data['type_of_registration'] = type_of_registration # # business_area = Selector(response=response).xpath('//td[@colspan="5"]')[1].xpath( # 'text()').extract_first() # if business_area is not None: # business_area = business_area.split()[0] # if business_area != '/': # company_data['business_area'] = business_area # # security_number = Selector(response=response).xpath('//td[@colspan="3"]')[4].xpath( # 'text()').extract_first() # if security_number is not None: # security_number = security_number.split()[0] # if security_number != '/': # company_data['security_number'] = security_number # # social_registration = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 9].xpath( # 'text()').extract_first() # if social_registration is not None: # social_registration = social_registration.split()[0] # if social_registration != '/': # company_data['social_registration'] = social_registration # # registered_address = Selector(response=response).xpath('//td[@colspan="3"]')[5].xpath( # 'text()').extract_first() # if registered_address is not None: # registered_address = registered_address.split()[0] # if registered_address != '/': # company_data['registered_address'] = registered_address # # registered__postal_code = Selector(response=response).xpath('//td[@style="width: 
230px;padding-top: 9px;"]')[ # 11].xpath( # 'text()').extract_first() # if registered__postal_code is not None: # registered__postal_code = registered__postal_code.split()[0] # if registered__postal_code != '/': # company_data['registered__postal_code'] = registered__postal_code # # business_address = Selector(response=response).xpath('//td[@colspan="3"]')[5].xpath( # 'text()').extract_first() # if business_address is not None: # business_address = business_address.split()[0] # if business_address != '/': # company_data['business_address'] = business_address # # business_postal_number = Selector(response=response).xpath('//td[@style="width: 230px;padding-top: 9px;"]')[ # 13].xpath( # 'text()').extract_first() # if business_postal_number is not None: # business_postal_number = business_postal_number.split()[0] # if business_postal_number != '/': # company_data['business_postal_number'] = business_postal_number # # legal_person = Selector(response=response).xpath('//td[@colspan="2"]/text()').extract_first() # if legal_person is not None: # legal_person = legal_person.split()[0] # if legal_person != '/': # company_data['legal_person'] = legal_person # # if len(Selector(response=response).xpath('//td[@colspan="5"]')) == 3: # website = Selector(response=response).xpath('//td[@colspan="5"]')[2].xpath( # 'text()').extract_first() # if website.split(): # print(website.split(), 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA') # website = website.split()[0] # if website.startswith('www') or website.startswith('http'): # company_data['website'] = website # print('公司信息', company_data) # yield scrapy.FormRequest(url='tongna', formcompany_data=company_data, callback=self.company_zz) ## 资质件信息 # ability_info_all = Selector(response=response).xpath('//table[@id="table_zz"]') # # print(ability_info_all, company_name) # if ability_info_all: # ability_info_all = ability_info_all[0].xpath('./tbody/tr') # for a in ability_info_all: # info_condition = a.xpath('./td') # # print(len(info_condition), company_name) # ability_data = {'company_name': company_name, 'issuing_authority': '', 'ability_type': '', # 'licence': '', 'grade': '', 'ability_number': '', 'start_date': ''} # ability_type = info_condition[0].xpath('text()').extract_first() # try: # ability_data['ability_type'] = ability_type.split()[0] # # except IndexError: # ability_data['ability_type'] = '' # # try: # licence = info_condition[1].xpath('text()').extract_first() # if licence is not None: # ability_data['licence'] = licence # except IndexError: # pass # # try: # grade = info_condition[2].xpath('text()').extract_first() # if grade is not None: # ability_data['grade'] = grade # except IndexError: # pass # # try: # ability_number = info_condition[3].xpath('text()').extract_first() # if ability_number is not None: # ability_data['ability_number'] = ability_number # except IndexError: # continue # # try: # start_date = info_condition[4].xpath('text()').extract_first() # if start_date is not None: # ability_data['start_date'] = start_date # except IndexError: # pass # # try: # end_date = info_condition[5].xpath('text()').extract_first() # if end_date is not None: # ability_data['end_date'] = end_date # except IndexError: # pass # # try: # issuing_authority = info_condition[6].xpath('text()').extract_first() # if issuing_authority is not None: # ability_data['issuing_authority'] = issuing_authority # except IndexError: # pass # # print('企业资质', ability_data, 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA') # # # yield scrapy.FormRequest(url='tongna', 
formdata=ability_info_all, callback=self.ability_zz) ## 安全证件信息 # ability_info_all2 = Selector(response=response).xpath('//table[@id="table_zz"]') # if len(ability_info_all2) == 2: # safe_ability = ability_info_all2[1].xpath('./tbody/tr') # print('为啥不执行这个安全证件信息????%s' % safe_ability) # for s in safe_ability: # safe_certificates_data = {'company_name': company_name, 'safe_number': '', 'address_certificates': '', # 'start_date_certificates': '', 'type_certificates': ''} # all_safe_td = s.xpath('./td') # safe_number = all_safe_td[0].xpath('text()').extract_first() # if safe_number is not None: # safe_number = safe_number.split()[0] # if safe_number == '无': # continue # safe_certificates_data['safe_number'] = safe_number # else: # continue # # address_certificates = all_safe_td[1].xpath('text()').extract_first() # if address_certificates is not None: # safe_certificates_data['address_certificates'] = address_certificates # # start_date_certificates = all_safe_td[2].xpath('text()').extract_first() # if start_date_certificates is not None: # safe_certificates_data['start_date_certificates'] = start_date_certificates # # type_certificates = all_safe_td[3].xpath('text()').extract_first() # if type_certificates is not None: # safe_certificates_data['type_certificates'] = type_certificates # print('企业安全证件信息', safe_certificates_data, 'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB') # # yield scrapy.FormRequest(url='tongna', formdata=safe_certificates_data, callback=self.ability_zz) # ## 系统相关细信息 # authentication_all = Selector(response=response).xpath('//table[@id="table_sys"]') # if authentication_all: # authentica_tr = authentication_all.xpath('./tbody/tr') # # for a in authentica_tr: # system_data = {'company_name': company_name, 'system_end': '', 'system_name': '', # 'system_start': ''} # d = a.xpath('./td') # system_name = d[0].xpath('text()').extract_first() # if system_name is not None: # # print('系统相关信息---%s----%s' % system_name, type(system_name)) # system_data['system_name'] = system_name # else: # continue # # system_start = d[1].xpath('text()').extract_first() # if system_start is not None: # # print('系统相关信息---%s----%s' % system_start, type(system_start)) # system_data['system_start'] = system_start # # system_end = d[2].xpath('text()').extract_first() # if system_end is not None: # # print('系统相关信息---%s----%s' % system_end, type(system_end)) # system_data['system_end'] = system_end # print('企业系统认证', system_data, 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC') # # # yield scrapy.FormRequest(url='tongna', formdata=system_data, callback=self.ability_zz) # 项目详情 project_performance = Selector( response=response).xpath('//div[@id="tab4"]/table/tbody/tr') print(len(project_performance), project_performance.xpath('./td/text()').extract_first(), company_name) if len(project_performance) != 1: for p in project_performance: project_data = { 'project_name': '', 'project_address': '', 'project_status': '', 'project_capital': '', 'project_start_date': '', 'project_company': '', 'project_complete': '' } easy_info = p.xpath('./td[@align="center"]') if len(easy_info) == 0: pass else: print(len(easy_info), 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAA') project_name = easy_info[2].xpath('text()').extract_first() if project_name is not None: project_data['project_name'] = project_name else: continue project_address = easy_info[3].xpath( 'text()').extract_first() if project_address is not None: project_data['project_address'] = project_address project_status = easy_info[4].xpath( 'text()').extract_first() if project_status is not None: 
project_data['project_status'] = project_status project_capital = easy_info[5].xpath( 'text()').extract_first() if project_capital is not None: project_data['project_capital'] = project_capital project_start_date = easy_info[6].xpath( 'text()').extract_first() if project_start_date is not None: project_data['project_start_date'] = project_start_date project_company = easy_info[7].xpath( 'text()').extract_first() if project_company is not None: project_data['project_company'] = project_company project_complete = easy_info[8].xpath( './font/text()').extract_first() if project_complete is not None: project_data['project_complete'] = project_complete content = p.xpath( './td[@colspan="9"]/table[@id="table_report"]/tr') print(project_data)
def parseNews(self, response): charset = tools.detectPageCharset(response.body) if charset is not None: try: response.body.decode(charset) except UnicodeDecodeError: response.body.decode(self.backupCharset) else: logging.log( logging.WARNING, "Can not detect the charset encoding of " + response.url) sel = Selector(response) brandname = response.meta['brandname'] page_type = sel.xpath( '//div[@class="contents"]/div[@id="brand-right"]/div[@class="m-zx-lbox"]' ) page_type2 = sel.xpath( '//div[@class="g-content clearfix m-first"]/div[@class="first clearfix"]/div[@class="g-main fleft"]' ) page_type3 = sel.xpath('//div[@class="g-content clearfix articlePic"]') page_type4 = sel.xpath('//div[@class="centess"]//div[@id="pardynr"]') page_type5 = sel.xpath( '//div[@id="content94"]//div[@class="con2_left_row1"]') if len(page_type) > 0: # page_type 官方发布的新闻页面 title = page_type.xpath('./h1/text()').extract() date = page_type.xpath( './dl[@class="date"]/dt/span[1]/text()').extract() content = page_type.xpath( './div[@class="main"]//p//text()').extract() elif len(page_type2) > 0: # page_type2 编辑写的页面 all_pages = page_type2.xpath( './dl[@class="pages_fullRead"]/dd/a/@href').extract() if len(all_pages) > 0: # 如果包含全页阅读 需重新生成请求 url = "http://www.yoka.com" + all_pages[0] r = Request(url, callback=self.parseNews) r.meta['brandname'] = brandname yield r return else: print 'page type 2 llllllllllllllllllllllll' title = page_type2.xpath( './h1[@class="infoTitle"]/text()').extract() date = page_type2.xpath( './div[@class="infoTime"]/div[@class="time"]/i/text()' ).extract() content = page_type2.xpath( './div[@class="double_quotes"]/div/text()').extract() content.extend( page_type2.xpath( './div[@class="textCon"]//p//text()').extract()) elif len(page_type3) > 0: # page_type3 图片幻灯片的新闻页面 title = page_type3.xpath('./h1[@id="picTitle"]/text()').extract() content = page_type3.xpath( './dl[@class="text"]//dd/text()').extract() # 从URL中提取日期 http://www.yoka.com/fashion/popinfo/2016/0725/pic48495001119565.shtml?source=brand date_pattern = re.compile('(\d{4})/(\d{4})') sresult = re.search(date_pattern, response.url) if sresult is None: date_str = "" else: date_str = sresult.group() index = date_str.index('/') sub = date_str[index:index + 3] date = date_str.replace(sub, sub + '-').replace('/', '-') elif len(page_type4) > 0: # page_type4 老版网站的页面 http://www.yoka.com/fashion/roadshow/2008/082290701.shtml title = page_type4.xpath( './dl[@class="viewtis"]/dt/h1/text()').extract() # 提取日期字符串 如 2008-08-22 11:14 来源: date_str = page_type4.xpath( './dl[@class="viewtis"]/dd/text()').extract() pattern = re.compile('(\d{4}-\d{2}-\d{2})') date = re.search(pattern, date_str[0]).group() content = page_type4.xpath( './div[@id="viewbody"]//p//text()').extract() # 寻找是否有下一页链接 http://www.yoka.com/fashion/popinfo/2009/0922253399.shtml next_page = page_type4.xpath( './div[@id="viewbody"]//span[@class="pagebox_next"]/a/@href' ).extract() if len(next_page) > 0: # 如果有下一页链接, 则需要生成新的请求,交给parseNewsNextPage处理 url = "http://www.yoka.com" + next_page[0] r = Request(url, callback=self.parseNewsNextPage) article = { 'brandname': brandname, 'title': title, 'date': date, 'content': content } r.meta['article'] = article yield r return elif len(page_type5) > 0: # page_type5 老版网页页面: http://www.yoka.com/luxury/watch/2008/060268802.shtml title = page_type5.xpath('./h2/text()').extract() # 提取日期字符串 如:2008-06-02 17:12 来源: date_str = page_type5.xpath('./div[@class="src"]/text()').extract() pattern = re.compile('(\d{4}-\d{2}-\d{2})') date = re.search(pattern, 
date_str[0]).group() content = page_type5.xpath( './div[@class="con"]//p//text()').extract() # 寻找是否有下一页链接 http://www.yoka.com/luxury/watch/2008/060268802.shtml next_page = page_type5.xpath( './div[@class="con"]/p[@align="right"]/a[position()>1 and @style]' ).extract() if len(next_page): url = "http://www.yoka.com" + next_page[0] r = Request(url, callback=self.parseNewsNextPage) article = { 'brandname': brandname, 'title': title, 'date': date, 'content': content } r.meta['article'] = article yield r return else: return item = NewsItem() item['title'] = "".join(title) item['date'] = "".join(date) item['brandname'] = brandname item['content'] = "".join(content) yield item
def parse1(self, response):
    sele = Selector(response)
    title = sele.xpath('//title/text()').extract_first()
    if title:
        # article body text
        Content = ''
        Content_urls = sele.xpath('//ul[@class="ov"]//a/@href').extract()
        Content_urls_list = []
        for Content_url in Content_urls:
            if 'http://www.cankaoxiaoxi.com' in Content_url:
                Content_urls_list.append(Content_url)
        for url in Content_urls_list:
            response1 = requests.get(url)
            soup = Selector(text=response1.text)
            bodys = soup.xpath(
                '//div[@id="ctrlfscont"]//p/text()').extract()
            for body in bodys:
                Content = Content + str(body)
            time.sleep(4)
        if len(Content) < 10:
            bodys = sele.xpath(
                '//div[@id="ctrlfscont"]//p/text()').extract()
            for body in bodys:
                Content = Content + str(body)
        try:
            AgreeCount = sele.xpath(
                '//p[@class="emoji-num"]/text()').extract()[3]
        except:
            AgreeCount = ''
        try:
            DisagreeCount = sele.xpath(
                '//p[@class="emoji-num"]/text()').extract()[0]
        except:
            DisagreeCount = ''
        item = Yuqing_CankaoxiaoxiItem({
            'AuthorID': '',
            'AuthorName': sele.xpath(
                '//span[@id="editor_baidu"]/text()').extract_first(),
            'ArticleTitle': title,
            'SourceArticleURL': response.url,
            'URL': response.url,
            'PublishTime': sele.xpath(
                '//span[@id="pubtime_baidu"]/text()').extract_first(),
            'Crawler': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'ReadCount': '',
            'CommentCount': '',
            'TransmitCount': '',
            'Content': Content,
            'comments': '',
            'AgreeCount': AgreeCount,
            'DisagreeCount': DisagreeCount,
            'AskCount': '',
            'ParticipateCount': '',
            'CollectionCount': '',
            'Classification': sele.xpath('//div[@class="crumb"]/a/text()').extract()[1],
            'Labels': sele.xpath(
                '//meta[@name="keywords"]/@content').extract_first(),
            'Type': '',
            'RewardCount': ''
        })
        yield item
def parse(self, response):
    sel = Selector(response)
    item = ChanyoujiUser()
    user_data = response.meta['data']
    item['user_id'] = user_data['user_id']

    user_name = sel.xpath(
        '//div[contains(@class, "header-inner")]/h1/text()').extract()
    if user_name:
        item['user_name'] = user_name[0]
    else:
        item['user_name'] = None

    ret = sel.xpath(
        '//div[contains(@class, "header-inner")]/div[1]/text()').extract()
    if ret:
        num_youji = ret[0]
        num = re.compile(r'\d{1,}')
        m1 = num.search(num_youji)
        if m1:
            item['num_notes'] = int(m1.group())

    ret = sel.xpath(
        '//div[contains(@class,"header-inner")]/a/img[contains(@class,"avatar") and @src]/@src').extract()
    if ret:
        item['avatar'] = ret[0]

    ret = sel.xpath('//div[contains(@class, "sns-site")]/p/text()').extract()
    if ret:
        ret = ret[0]
        if u'喜欢她的游记' in ret:
            item['gender'] = 'f'
        elif u'喜欢他的游记' in ret:
            item['gender'] = 'm'

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"weibo")]/a/@href').extract()
    if ret:
        weibo_url = ret[0]
        item['weibo_url'] = weibo_url
        match = re.search(r'weibo\.com/u/(\d+)/?$', weibo_url)
        if match:
            item['weibo_uid'] = int(match.groups()[0])
        else:
            match = re.search(r'weibo\.com/([^/]+)/?$', weibo_url)
            if match:
                item['weibo_uid'] = match.groups()[0]

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"douban")]/a/@href').extract()
    if ret:
        douban_url = ret[0]
        item['douban_url'] = douban_url
        match = re.search(r'douban\.com/people/(\d+)/?$', douban_url)
        if match:
            item['douban_uid'] = int(match.groups()[0])

    ret = sel.xpath(
        '//div[contains(@class, "sns-site")]/ul[@class="sns-ico"]/li[contains(@class,"renren")]/a/@href').extract()
    if ret:
        renren_url = ret[0]
        item['renren_url'] = renren_url
        match = re.search(r'renren\.com/(\d+)/profile/?$', renren_url)
        if match:
            item['renren_uid'] = int(match.groups()[0])

    marker = {}
    # look for the Gmaps.map.markers object embedded in the page
    match = re.search(r'Gmaps\.map\.markers\s*=\s*(?=\[)(.+?)(?<=\])', response.body)
    if match:
        try:
            marker_data = json.loads(match.groups()[0])
            for tmp in marker_data:
                lat = float(tmp['lat'])
                lng = float(tmp['lng'])
                mid = tmp['id']
                title = tmp['title'].strip()
                desc = tmp['description']
                match = re.search(r'href\s*="([^"]+)"', desc)
                href = 'http://chanyouji.com' + match.groups()[0] if match else None
                if href:
                    marker[mid] = {'lat': lat, 'lng': lng, 'title': title,
                                   'url': href, 'data_id': mid}
        except (ValueError, KeyError):
            pass

    traveled_list = []
    for data_id in sel.xpath(
            '//ul[@id="attraction_markers_list"]//a[contains(@class,"node") and @data-id]/@data-id').extract():
        data_id = int(data_id)
        if data_id not in marker:
            continue
        traveled_list.append(marker[data_id])
    item['traveled'] = traveled_list

    if not item['traveled']:
        yield item
    else:
        yield Request(url=item['traveled'][0]['url'], callback=self.parse_note,
                      meta={'item': item})
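The lookahead/lookbehind regex that pulls the Gmaps.map.markers array out of the page body is easier to follow in isolation. The sample string below is invented for illustration; only the two regular expressions and the json.loads step come from the spider (note that on Python 3 response.body is bytes, so the real call would need response.text or a decoded body):

import json
import re

sample = ('var page = 1; Gmaps.map.markers = [{"lat": 31.22, "lng": 121.48, "id": 7, '
          '"title": "The Bund", "description": "<a href=\\"/attractions/7\\">The Bund</a>"}]; Gmaps.map.show();')

match = re.search(r'Gmaps\.map\.markers\s*=\s*(?=\[)(.+?)(?<=\])', sample)
if match:
    markers = json.loads(match.groups()[0])
    print(markers[0]['title'])                                                      # The Bund
    print(re.search(r'href\s*="([^"]+)"', markers[0]['description']).groups()[0])  # /attractions/7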
def _parse_description(self, item):
    """Parse or generate meeting description."""
    desc_text = " ".join(
        Selector(text=html.unescape(item["description"])).css(
            "*::text").extract())
    return re.sub(r"\s+", " ", desc_text).strip()
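To see what _parse_description produces, here is the same pipeline run on an invented, HTML-escaped description: html.unescape restores the markup, the *::text selector strips the tags, and the final re.sub collapses runs of whitespace.

import html
import re

from scrapy import Selector

sample = "&lt;p&gt;Budget committee&lt;/p&gt;  \n  (rescheduled)"
text = " ".join(Selector(text=html.unescape(sample)).css("*::text").extract())
print(re.sub(r"\s+", " ", text).strip())   # -> Budget committee (rescheduled)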
def get_user_info(response):
    # sel = Selector(response)
    if 'http://weibo.com/sorry?pagenotfound&' == response.url:
        return None
    user_item = UserInfoItem()
    user_item['url'] = response.url
    user_id = check_value(get_page_conf_info(response, 'oid'))
    user_item['user_id'] = user_id
    page_id = check_value(get_page_conf_info(response, 'page_id'))
    user_item['page_id'] = page_id

    info_div = get_dom_html(response, 'Pl_Official_PersonalInfo__60')
    if info_div:
        info_list = Selector(
            text=info_div).xpath('//li[@class="li_1 clearfix"]')
        for info in info_list:
            info_title = info.xpath(
                './span[contains(@class, "pt_title S_txt")]').xpath(
                    'string(.)').extract_first()
            info_detail = info.xpath('./span[contains(@class, "pt_detail")]'
                                     ).xpath('string(.)').extract()
            info_detail = [info_.strip() for info_ in info_detail if info_]
            if '昵称' in info_title:
                user_item['nick_name'] = check_value(''.join(info_detail))
            elif '真实姓名' in info_title:
                user_item['real_name'] = check_value(''.join(info_detail))
            elif '所在地' in info_title:
                user_item['location'] = check_value(''.join(info_detail))
            elif '性别' in info_title:
                user_item['sex'] = check_value(''.join(info_detail))
            elif '性取向' in info_title:
                user_item['sexual_orientation'] = check_value(
                    ''.join(info_detail))
            elif '感情状况' in info_title:
                user_item['Relationship_status'] = check_value(
                    ''.join(info_detail))
            elif '生日' in info_title:
                user_item['birthday'] = check_value(''.join(info_detail))
            elif '博客' in info_title:
                user_item['blog_address'] = check_value(''.join(info_detail))
            elif '个性域名' in info_title:
                user_item['personal_url'] = check_value(''.join(info_detail))
            elif '简介' in info_title:
                user_item['description'] = check_value(''.join(info_detail))
            elif '注册时间' in info_title:
                user_item['register_date'] = check_value(''.join(info_detail))
            elif '公司' in info_title:
                user_item['company'] = check_value('\n'.join(info_detail))
            elif '大学' in info_title:
                user_item['education'] = check_value('\n'.join(info_detail))
            elif '标签' in info_title:
                user_item['tag'] = check_value('\n'.join(info_detail))
            elif '邮箱' in info_title:
                user_item['mail'] = check_value(''.join(info_detail))
            elif 'QQ' in info_title:
                user_item['qq'] = check_value(''.join(info_detail))
            elif '血型' in info_title:
                user_item['blood_type'] = check_value(''.join(info_detail))
            else:
                print('info div more value!! ' + info_title)
    else:
        raise ValueError('no info div')

    # following / followers / weibo post counts
    num_div = get_dom_html(response, 'Pl_Core_T8CustomTriColumn__56')
    if num_div:
        num_list = Selector(text=num_div).xpath(
            '//td[contains(@class, "S_line")]/a[contains(@class, "t_link S_txt")]')
        for num_ in num_list:
            num_data = num_.xpath('./*[contains(@class, "W_f")]').xpath(
                'string(.)').extract_first()
            num_name = num_.xpath('./span[contains(@class, "S_txt")]').xpath(
                'string(.)').extract_first()
            if not num_data or not num_data.strip().isdigit():
                num_data = -1
            if '关注' in num_name:
                user_item['friends_num'] = num_data
            elif '粉丝' in num_name:
                user_item['fans_num'] = num_data
            elif '微博' in num_name:
                user_item['blog_num'] = num_data
            else:
                print('num div more value!! ' + num_name)
    else:
        raise ValueError('no num div')

    head_div = get_dom_html(response, 'Pl_Official_Headerv6')
    if head_div:
        head_info = Selector(text=head_div).xpath(
            '//a[@class="icon_bed"]/em/@class').extract_first()
        if not head_info:
            user_item['is_v'] = 'nil'
        else:
            user_item['is_v'] = check_value(head_info)
        # avatar image URL
        head_img_url = Selector(text=head_div).xpath(
            '//div[@node-type="photo"]'
            '//img[@class="photo"]/@src').extract_first()
        if head_img_url:
            user_item['head_img_url'] = head_img_url
        else:
            user_item['head_img_url'] = ''
    else:
        raise ValueError('head div error!!')

    level_div = get_dom_html(response, 'Pl_Official_RightGrowNew')
    if level_div:
        level_info = Selector(text=level_div) \
            .xpath('//div[contains(@class,"level_box S_txt")]').xpath('string(.)').extract_first()
        user_item['rank'] = check_value(level_info)
    else:
        raise ValueError('level div error!!')

    user_item['parse_time'] = time.time()
    return user_item
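get_user_info depends on three helpers that are not shown: get_page_conf_info, get_dom_html and check_value. Judging only from how they are called (reading oid/page_id out of the page's $CONFIG script block and returning the HTML of a single FM.view pagelet), plausible minimal versions might look like the sketch below; these are assumptions, not the original code:

import json
import re


def get_page_conf_info(response, key):
    # Assumption: the page defines values such as $CONFIG['oid']='1234567890';
    match = re.search(r"\$CONFIG\['%s'\]='([^']*)'" % key, response.text)
    return match.group(1) if match else None


def get_dom_html(response, domid):
    # Assumption: each pagelet is embedded as an FM.view({"domid": "...", "html": "..."}) call.
    for block in re.findall(r'FM\.view\((\{.*?\})\)', response.text, re.S):
        try:
            data = json.loads(block)
        except ValueError:
            continue
        if domid in data.get('domid', ''):
            return data.get('html', '')
    return None


def check_value(value):
    # Assumption: normalise missing values to an empty string.
    return value.strip() if value else ''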
def extract_item_desc():
    # `response` is taken from the enclosing scope; the tooltip markup is stored
    # inside the title attribute, so it is parsed with a second Selector.
    inner = response.css('a.J-tooltip::attr(title)').extract_first()
    sel = Selector(text=inner)
    item_desc = sel.css('.tooltip-tip::text').get()
    return item_desc
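extract_item_desc parses HTML that is stored inside an attribute value by wrapping it in a second Selector. A self-contained sketch of the same trick, using a made-up fragment (the HTML parser unescapes the entities when the title attribute is read back):

from scrapy import Selector

fragment = ('<a class="J-tooltip" title="&lt;span class=&quot;tooltip-tip&quot;&gt;'
            'Free shipping&lt;/span&gt;">?</a>')
outer = Selector(text=fragment)
inner = Selector(text=outer.css('a.J-tooltip::attr(title)').extract_first())
print(inner.css('.tooltip-tip::text').get())   # -> Free shipping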
def parse_detail(position_queue): positions_info = [] while True: if position_queue.empty(): break url = position_queue.get() proxy["http"] = get_proxy() time.sleep(random.randint(5, 10)) response = requests.get(url, headers=headers, proxies=proxy) time.sleep(random.randint(5, 10)) selector = Selector(text=response.text) apartment = selector.xpath('//div[@class="company"]/text()').extract()[0] # 招聘部门 title = selector.xpath('//div[@class="job-name"]/@title').extract()[0] # 标题 publish_time = selector.xpath('//p[@class="publish_time"]/text()').extract()[0] # 发布时间 publish_time = publish_time.split("发布于拉勾网")[0].strip() job_desc = selector.xpath('//dd[@class="job_bt"]/div').extract()[0].replace( "<div>", '').replace("</div>", '').replace( "<p>", '').replace("</p>", '').strip() # 职位描述 job_advantage = selector.xpath('//dd[@class="job-advantage"]/p/text()').extract()[0] # 职业诱惑 job_addr_list= selector.xpath('//div[@class="work_addr"]').extract()[0] job_addr_list = remove_tags(job_addr_list).split("\n") job_addr_list = [job_addr.strip() for job_addr in job_addr_list if job_addr.strip != "查看地图"] job_addr = "".join(job_addr_list) salary = selector.xpath('//dd[@class="job_request"]/p/span/text()').extract()[0] # 薪资 if '-' in salary: salary_min = salary.split('-')[0] salary_max = salary.split('-')[1] elif '以上' in salary: salary_min = salary.split('以上')[0] salary_max = salary_min city = selector.xpath('//dd[@class="job_request"]/p/span[2]/text()').extract()[0].replace("/", '') # 经验要求 work_experience = selector.xpath('//dd[@class="job_request"]/p/span[3]/text()').extract()[0] if '-' in work_experience: work_experience_min = work_experience.split('-')[0].replace("经验", '') if int(work_experience_min) > 1: work_experience_min = work_experience_min + 'years' else: work_experience_min = work_experience_min + 'year' work_experience_max = work_experience.split('-')[1].replace("年", '').replace("/", '').strip() + 'years' elif '不限' in work_experience: work_experience_min = work_experience.replace("经验不限", "no require") work_experience_max = work_experience_min elif '以下' in work_experience: work_experience_max = work_experience.split("年")[0].replace("经验", "").replace("年", "") if int(work_experience_max) > 1: work_experience_max = work_experience_max + 'years' work_experience_min = work_experience_max else: work_experience_max = work_experience_max + 'year' work_experience_min = work_experience_max elif '应届' in work_experience: work_experience_min = work_experience.replace("经验应届毕业生 /", "graduates") work_experience_max = work_experience_min # 学历 education = selector.xpath('//dd[@class="job_request"]/p/span[4]/text()').extract()[0] if "本科" in education: education = education.split("本")[0].replace("", "undergraduate") elif "大专" in education: education = education.split("大")[0].replace("", "junior_college_student") elif "不限" in education: education = education.replace("学历不限 /", "no require") # 职业类型 job_type = selector.xpath('//dd[@class="job_request"]/p/span[5]/text()').extract()[0] if "全职" in job_type: job_type = job_type.replace("全职", "full time") else: job_type = job_type.replace("实习", "fieldwork") positions_info.append((apartment, title, salary_min, salary_max, city, work_experience_min, work_experience_max, education, job_type, publish_time, job_advantage, job_desc, job_addr)) for position_info in positions_info: try: insert_sql = """ insert into lagou_job_requests (apartment, title, salary_min, salary_max, city, work_experience_min, work_experience_max, education, job_type, publish_time, job_advantage, job_desc, 
job_addr) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """ cursor.execute(insert_sql, (position_info[0], position_info[1], position_info[2], position_info[3], position_info[4], position_info[5], position_info[6], position_info[7], position_info[8], position_info[9], position_info[10], position_info[11], position_info[12])) conn.commit() except: conn.rollback()
# Importing Selector
from scrapy import Selector
# Importing requests to load the HTML data
import requests

# URL of the Jumia home page to scrape
url = "https://www.jumia.com.ng"
# Holding the HTML source as text (Selector(text=...) expects a str, not bytes)
html = requests.get(url).text
# Creating a Selector object
sel = Selector(text=html)

# Printing the total number of elements in the page
print(len(sel.xpath("//*")))
# Selecting all id attributes in the home page
print(sel.xpath("//@id"))
# Searching for a particular class attribute
print(sel.xpath("//p[@class = '-mas -elli2']"))
# Selecting the text of all products in the "-mas -elli2" class
print(sel.xpath('//p[@class = "-mas -elli2"]//text()').extract())
# Searching for the PS4 console image
print(sel.xpath('//div/img[@alt="PS4 Consoles"]').extract())
# Printing out all the courses offered in the home page
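The product-name query can also be written with Scrapy's CSS selector support; the attribute selector below is equivalent to the @class XPath used above:

# CSS equivalent of the product-name XPath query
print(sel.css('p[class="-mas -elli2"]::text').extract())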
from scrapy import Selector
from urllib.request import urlopen

html = urlopen("https://www.pythonparatodos.com.br/formulario.html")
# urlopen returns bytes; Selector(text=...) expects a str, so decode first (UTF-8 assumed)
sel = Selector(text=html.read().decode('utf-8'))
lista = sel.xpath('//input')
terceiro_input = lista[2]  # the third <input>; indexing is zero-based (the original used lista[3])
print(terceiro_input.extract())
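As a follow-up, each selector in lista can be inspected attribute by attribute; on recent Scrapy versions, Selector.attrib exposes the element's attributes as a dict:

# Print the name attribute of every <input> on the form page
for campo in lista:
    print(campo.attrib.get('name'))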
def product_parse(self, response): if len(response.text) < 40000: yield scrapy.Request(url=response.request.url, callback=self.product_parse, dont_filter=True, meta=response.meta) return None item = response.meta['item'] # 商品链接 product_url = response.request.url # 商品ID ProductID = product_url.split('/')[-1].split('.')[0] # 商品链接urlID urlID = product_url.split('/')[-2] # 商品链接urlID urlID = product_url.split('/')[-2] # 店铺名称 try: shop_name = re.findall('shopName":"(.*?)"', response.text)[0] except: try: shop_name = re.findall('"curShopName":.*?>(.*?)</a>"', response.text)[0] except: try: shop_name = response.xpath( ".//div[@class='si-intro-list']/dl[1]/dd/a/text()" ).extract()[0] except: shop_name = None #去掉shopname中的空白字符 shop_name = re.sub(r'\r', '', shop_name) shop_name = re.sub(r'\t', '', shop_name) shop_name = re.sub(r'\n', '', shop_name) shop_name = re.sub(r' ', '', shop_name) # 商品名称 try: p_Name = response.xpath( ".//div[@class='imgzoom-main']/a[@id='bigImg']/img/@alt" ).extract()[0] except: try: p_Name = re.findall('"itemDisplayName":"(.*?)"', response.text)[0] except: p_Name = None #类别 try: X_type = Selector(response).re('"分类":"(.*?)"')[0] except: try: X_type = Selector(response).re( '分类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_type = re.findall('"分类":"(.*?)"', response.text)[0] except: X_type = None # 品牌 try: brand = Selector(response).re('"brandName":"(.*?)"')[0] except: try: brand = Selector(response).re('<li><b>品牌</b>:(.*?)</li>')[0] except: try: brand = re.findall('"brandName":"(.*?)"', response.text)[0] except: brand = None # 去掉品牌括号内容 if brand: if re.findall(r'(.*?)', brand): re_com = re.compile('(.*?)') brand = brand[:0] + re.sub(re_com, '', brand) if brand: if re.findall(r'\(.*?\)', brand): re_cn = re.compile('\(.*?\)') brand = brand[:0] + re.sub(re_cn, '', brand) # 颜色 color = None # 类型,商品型号 try: X_name = Selector(response).re( '型号</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] if X_name == None: X_name = re.findall( '型号</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #安装方式 try: install = Selector(response).re('安装方式:(.*?)</li>')[0] except: try: install = Selector(response).re( '安装方式</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: install = re.findall('安装方式:(.*?)</li>', response.text)[0] except: try: install = re.findall( '安装方式</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: install = None #是否可以直饮 try: drink = Selector(response).re('是否直饮:(.*?)</li>')[0] except: try: drink = Selector(response).re( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: drink = re.findall('是否直饮:(.*?)</li>', response.text)[0] except: try: drink = re.findall( '是否直饮</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: drink = None #滤芯种类 try: kinds = Selector(response).re('滤芯种类:(.*?)</li>')[0] except: try: kinds = Selector(response).re( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: kinds = re.findall('滤芯种类:(.*?)</li>', response.text)[0] except: try: kinds = re.findall( '滤芯种类</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: kinds = None #滤芯使用寿命 try: life = Selector(response).re('滤芯寿命:(.*?)</li>')[0] except: try: 
life = Selector(response).re( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: life = re.findall('滤芯寿命:(.*?)</li>', response.text)[0] except: try: life = re.findall( '滤芯寿命</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: life = None #过滤精度 try: precision = Selector(response).re('过滤精度:(.*?)</li>')[0] except: try: precision = Selector(response).re( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>')[0] except: try: precision = re.findall('过滤精度:(.*?)</li>', response.text)[0] except: try: precision = re.findall( '过滤精度</span> </div> </td> <td class="val">(.*?)</td>', response.text)[0] except: precision = None # 核心参数 type = '"' soup = BeautifulSoup(response.text, 'lxml') try: ul = soup.find('ul', attrs={'class': 'cnt clearfix'}) li = ul.find_all('li') for i in range(len(li)): type = type[:] + li[i].text if i < len(li) - 1: type = type[:] + ' ' if i == len(li) - 1: type = type[:] + '"' except: try: # 部分核心参数格式更改 div = soup.find('div', class_='prod-detail-container') ul = div.find('ul', attrs={'class': 'clearfix'}) li = ul.find_all('li') for each in li: li_li = each.find_all('li') for i in range(len(li_li)): type = type[:] + li_li[i].text if i < len(li_li) - 1: type = type[:] + ' ' if i == len(li_li) - 1: type = type[:] + '"' except: type = None if type: if len(type) < 2: type = None if type == None: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] except: try: parameter_id = re.findall('"mainPartNumber":"(.*?)"', response.text)[0] except: parameter_id = None type = None if parameter_id: try: parameter_id = Selector(response).re( '"mainPartNumber":"(.*?)"')[0] parameter_url = 'https://product.suning.com/pds-web/ajax/itemParameter_%s_R0105002_10051.html' % parameter_id para_response = requests.get(parameter_url).text time.sleep(0.3) eles = re.findall('"snparameterdesc":"(.*?)"', para_response) souls = re.findall('"snparameterVal":"(.*?)"', para_response) try: type = '"' for i in range(len(eles)): type = type[:] + eles[i] + ':' + souls[i] if i < len(eles) - 1: type = type[:] + ' ' if i == len(eles) - 1: type = type[:] + '"' if len(type) < 2: type = None except: type = None if brand == None: try: brand = re.findall( '"snparameterdesc":"品牌","snparameterVal":"(.*?)"', para_response)[0] except: brand = None try: X_name = re.findall( '"snparameterdesc":"型号","snparameterVal":"(.*?)"', para_response)[0] except: X_name = None if X_name: if brand: if brand in X_name: X_name = X_name[:0] + re.sub(brand, '', X_name) X_name = X_name[:0] + re.sub(r'(.*?)', '', X_name) X_name = X_name[:0] + re.sub(r'\(.*?\)', '', X_name) #类别 if X_type == None: try: X_type = re.findall( '"snparameterdesc":"分类","snparameterVal":"(.*?)"', para_response)[0] except: X_type = None #安装方式 if install == None: try: install = re.findall( '"snparameterdesc":"安装方式","snparameterVal":"(.*?)"', para_response)[0] except: install = None #是否直饮 if drink == None: try: drink = re.findall( '"snparameterdesc":"是否直饮","snparameterVal":"(.*?)"', para_response)[0] except: drink = None #滤芯种类 if kinds == None: try: kinds = re.findall( '"snparameterdesc":"滤芯种类","snparameterVal":"(.*?)"', para_response)[0] except: kinds = None #滤芯使用寿命 if life == None: try: life = re.findall( '"snparameterdesc":"滤芯寿命","snparameterVal":"(.*?)"', para_response)[0] except: life = None #过滤精度 if precision == None: try: precision = re.findall( '"snparameterdesc":"过滤精度","snparameterVal":"(.*?)"', para_response)[0] except: precision = None except: pass # 获取相关请求url keyword_url = 
'https://review.suning.com/ajax/getreview_labels/general-000000000' + ProductID + '-' + urlID + '-----commodityrLabels.htm' comment_url = 'https://review.suning.com/ajax/review_satisfy/general-000000000' + ProductID + '-' + urlID + '-----satisfy.htm' price_url = 'https://pas.suning.com/nspcsale_0_000000000' + ProductID + '_000000000' + ProductID + '_' + urlID + '_10_010_0100101_20268_1000000_9017_10106_Z001.html' # 获取印象关键字 try: keyword_response = requests.get(keyword_url).text keyword_text = json.loads( re.findall(r'\((.*?)\)', keyword_response)[0]) keyword_list = keyword_text.get('commodityLabelCountList') key_str = '"' keyword = [] for i in range(len(keyword_list)): key_str = key_str[:] + keyword_list[i].get('labelName') if i < len(keyword_list) - 1: key_str = key_str[:] + ' ' if i == len(keyword_list) - 1: key_str = key_str[:] + '"' keyword.append(key_str) except: keyword = None # 获取评价信息 try: comment_response = requests.get(comment_url).text comment_text = json.loads( re.findall(r'\((.*?)\)', comment_response)[0]) comment_list = comment_text.get('reviewCounts')[0] # 差评 PoorCount = comment_list.get('oneStarCount') twoStarCount = comment_list.get('twoStarCount') threeStarCount = comment_list.get('threeStarCount') fourStarCount = comment_list.get('fourStarCount') fiveStarCount = comment_list.get('fiveStarCount') # 评论数量 CommentCount = comment_list.get('totalCount') # 好评 GoodCount = fourStarCount + fiveStarCount # 中评 GeneralCount = twoStarCount + threeStarCount # 好评度 # 得到百分比取整函数 if CommentCount != 0: goodpercent = round(GoodCount / CommentCount * 100) generalpercent = round(GeneralCount / CommentCount * 100) poorpercent = round(PoorCount / CommentCount * 100) commentlist = [GoodCount, GeneralCount, PoorCount] percent_list = [goodpercent, generalpercent, poorpercent] # 对不满百分之一的判定 for i in range(len(percent_list)): if percent_list[i] == 0 and commentlist[ i] != 0 and CommentCount != 0: percent_list[i] = 1 nomaxpercent = 0 # 定义为累计不是最大百分比数值 # 好评度计算url='http://res.suning.cn/project/review/js/reviewAll.js?v=20170823001' if CommentCount != 0: maxpercent = max(goodpercent, generalpercent, poorpercent) for each in percent_list: if maxpercent != each: nomaxpercent += each GoodRateShow = 100 - nomaxpercent else: GoodRateShow = 100 else: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 except: PoorCount = 0 CommentCount = 0 GoodCount = 0 GeneralCount = 0 GoodRateShow = 100 # 有关价格 try: price_response = requests.get(price_url).text except requests.RequestException as e: # print(e) time.sleep(2) s = requests.session() s.keep_alive = False s.mount('https://', HTTPAdapter(max_retries=5)) price_response = s.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: time.sleep(3) price_response = requests.get(price_url).text if len(price_response) > 900: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall('"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None 
PreferentialPrice = None else: # 作出失败判断并将url归入重试 price_response = self.retry_price(price_url) if len(price_response) > 500: try: price = re.findall('"refPrice":"(.*?)"', price_response)[0] PreferentialPrice = re.findall( '"promotionPrice":"(.*?)"', price_response)[0] if len(price) < 1: price = re.findall('"netPrice":"(.*?)"', price_response)[0] if price: if float(price) < float(PreferentialPrice): tt = price price = PreferentialPrice PreferentialPrice = tt except: price = None PreferentialPrice = None else: PreferentialPrice = None price = None if kinds: if re.findall(r'\d', kinds) and len(kinds) < 3: level = kinds kinds = None else: level = None else: level = None # 防止出现多个字段出现为空 if p_Name == None and brand == None and type == None: yield None else: source = '苏宁' item['shop_name'] = shop_name item['p_Name'] = p_Name item['X_name'] = X_name item['type'] = type item['price'] = price item['PreferentialPrice'] = PreferentialPrice item['brand'] = brand item['keyword'] = keyword item['PoorCount'] = PoorCount item['CommentCount'] = CommentCount item['GoodCount'] = GoodCount item['GeneralCount'] = GeneralCount item['GoodRateShow'] = GoodRateShow item['install'] = install item['drink'] = drink item['source'] = source item['level'] = level item['kinds'] = kinds item['life'] = life item['precision'] = precision item['color'] = color item['product_url'] = product_url item['ProductID'] = ProductID item['X_type'] = X_type yield item
def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace( '\xa0', '').replace('\u3000', '') WJBT_45 = '' SJ_46 = '' LY_47 = '' ZWBT_48 = '' DKBH_49 = '' ZDBH_50 = '' PMJG_51 = '' GGZRFS_52 = '' GPSJ_53 = '' ZRR_54 = '' ZRF_55 = '' SRR_56 = '' SRF_57 = '' SRDW_58 = '' WZ_59 = '' DKWZ_60 = '' CRMJ_61 = '' YT_62 = '' CJJ_63 = '' BDCQDJH_64 = '' CRHTBH_65 = '' CRHT_66 = '' BGXYBH_67 = '' TDYT_68 = '' SYNX_69 = '' MJ_70 = '' TDMJ_71 = '' ZRJG_72 = '' CRNX_73 = '' TDSYNX_74 = '' BZ_75 = '' GSQ_76 = '' LXDW_77 = '' DWDZ_78 = '' YZBM_79 = '' LXDH_80 = '' LXR_81 = '' DZYJ_82 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_45 = response.meta.get('title') # 时间 SJ_46 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()' ).extract_first() # 来源 LY_47 = data.xpath( '//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()' ).extract_first() # 正文标题 ZWBT_48 = data.xpath( '//div[@class="ztzx_frame_content"]/div[1]/text()' ).extract_first() # 公示期 GSQ_76 = reFunction( f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)[。\s]', items) # 联系单位 LXDW_77 = reFunction( '联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 单位地址 DWDZ_78 = reFunction( '单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 邮政编码 YZBM_79 = reFunction( '邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话 LXDH_80 = reFunction( '联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系人 LXR_81 = reFunction( '联\s*系\s*人:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 电子邮件 DZYJ_82 = reFunction( '电子邮件:([()\w\.:: —\(\)@〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_45 + SJ_46) soup = BeautifulSoup( response.body.decode('utf-8').replace('thead', 'tbody')) table = soup.find('table') htmlTable = htmlTableTransformer() if '国有划拨土地使用权结果公示' in items: table.find_all('tr')[1].extract() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 地块编号 DKBH_49 = tdData.get('地块编号')[_] if tdData.get( '地块编号') else '' # 公开转让方式 GGZRFS_52 = tdData.get('公开转让方式')[_] if tdData.get( '公开转让方式') else '' # 挂牌时间 GPSJ_53 = tdData.get('挂牌')[_] if tdData.get('挂牌') else '' # 受让人 SRR_56 = tdData.get('受让人')[_] if tdData.get('受让人') else '' # 位置 WZ_59 = tdData.get('位置')[_] if tdData.get('位置') else '' # 出让面积(平方米) CRMJ_61 = tdData.get('出让面积')[_] if tdData.get( '出让面积') else '' # 用途 YT_62 = tdData.get('用途')[_] if tdData.get('用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价')[_] if tdData.get('成交价') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: 
{e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) elif '不动产权登记证号' in items: # 转让方 ZRF_55 = reFunction( '转让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 受让方 SRF_57 = reFunction( '受让方:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 位置 WZ_59 = reFunction( '宗地位置:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 不动产权登记证号 BDCQDJH_64 = reFunction( '不动产权登记证号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让合同编号 CRHTBH_65 = reFunction( '出让合同编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 变更协议编号 BGXYBH_67 = reFunction( '出让合同变更协议编号:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 使用年限 SYNX_69 = reFunction( '使用年限:\s*([()【】\w\.::—\(\)〔〕\s㎡≤≥《》\-\/\%,;,、\.﹪]*)面\s*积', items) # 面积 MJ_70 = reFunction( '面\s*积:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 转让价格(单价总价) ZRJG_72 = reFunction( '转让价格:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、。\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '挂牌出让地块的基本情况和规划指标要求' in items: # 宗地编号 ZDBH_50 = reFunction( '宗地编号:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 挂牌时间 GPSJ_53 = reFunction( '挂牌时间为:\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s', items).replace('。', '') # 转让人 ZRR_54 = reFunction( '转让人为:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%\.﹪]*),', items) # 位置 WZ_59 = reFunction( '宗地坐落:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_68 = reFunction( '土地用途:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 面积 MJ_70 = reFunction( '宗地面积:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 出让年限 CRNX_73 = reFunction( '出让年限:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 备注 BZ_75 = reFunction( '备注:*\s*([()【】\w\.::—\(\)〔〕㎡≤≥《》\-\/\%,;。,、\.﹪]*)\s*二', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace( '\n', '').replace('\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) elif '地块基本情况' in items: try: if '备注' not in items: tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_50 = tdData.get('宗地编号')[_] if tdData.get( '宗地编号') else '' # 受让单位 SRDW_58 = tdData.get('受让单位')[_] if tdData.get( '受让单位') else '' # 受让人 SRR_56 = tdData.get('竞得人')[_] if tdData.get( '竞得人') else '' # 地块位置 DKWZ_60 = tdData.get('地块位置')[_] if tdData.get( '地块位置') else '' # 土地用途 TDYT_68 = tdData.get('土地用途')[_] if tdData.get( '土地用途') else '' # 成交价(万元) CJJ_63 = tdData.get('成交价(万元)')[_] if tdData.get( '成交价(万元)') else '' # 土地面积(公顷) TDMJ_71 = tdData.get('土地面积(亩)')[_] if tdData.get( '土地面积(亩)') else '' # 出让年限 CRNX_73 = tdData.get('出让年限')[_] if tdData.get( '出让年限') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) else: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall('一([\s\S]*)二、', items) [0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '').replace( '\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' 
% response.url) except Exception as e: if '竞得人' not in items: for item in [ '宗地编号' + _ for _ in re.findall( '一([\s\S]*)二、', items)[0].split('宗地编号')[1:] ]: # 宗地编号 ZDBH_50 = reFunction('编号\s*([\w\-]*)\s', item) # 受让单位 SRDW_58 = reFunction( '受让单位\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 地块位置 DKWZ_60 = reFunction( '地块位置\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 成交价(万元) CJJ_63 = reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) if reFunction( '成交价\(万元\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item ) else reFunction( '成交价(万元)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地用途 TDYT_68 = reFunction( '土地用途\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 土地面积(公顷) TDMJ_71 = reFunction( '土地\s*面积\s*\(公顷\)\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 出让年限 CRNX_73 = reFunction( '出让年限\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', item) # 备注 BZ_75 = reFunction( '备注:\s*([()\w\.::—\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)', item) if '二' in BZ_75: BZ_75 = '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist( md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if True: # 重复效验通过, 存储数据 csvFile = [ WJBT_45, SJ_46, LY_47, ZWBT_48, DKBH_49, ZDBH_50, PMJG_51, GGZRFS_52, GPSJ_53, ZRR_54, ZRF_55, SRR_56, SRF_57, SRDW_58, WZ_59, DKWZ_60, CRMJ_61, YT_62, CJJ_63, BDCQDJH_64, CRHTBH_65, CRHT_66, BGXYBH_67, TDYT_68, SYNX_69, MJ_70, TDMJ_71, ZRJG_72, CRNX_73, TDSYNX_74, BZ_75, GSQ_76, LXDW_77, DWDZ_78, YZBM_79, LXDH_80, LXR_81, DZYJ_82, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace( ',', ' ' ).replace('\n', '').replace( '\t', '' ).replace('\r', '').replace( r'\xa0', '').replace( '\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider( self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log( f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
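# Every storage branch above repeats the same steps: drop placeholder fields, strip
# commas/newlines/tabs/NBSPs from each value, join with commas and append one line to
# self.pathDetail. A minimal refactoring sketch of that shared step is shown below;
# clean_field and write_csv_row are hypothetical helper names, not part of the
# original spider, and the field order is assumed to follow the csvFile list.
import csv


def clean_field(value):
    # Mirror the spider's cleanup: empty or all-'|' placeholders become '', commas
    # become spaces, control characters and non-breaking spaces are removed.
    if not value or value == '|' * len(value):
        return ''
    for ch in (',', '\n', '\t', '\r', '\xa0'):
        value = value.replace(ch, ' ' if ch == ',' else '')
    return value


def write_csv_row(path, fields):
    # Append one cleaned row; csv.writer handles quoting that the manual join skips.
    with open(path, 'a+', newline='') as fp:
        csv.writer(fp).writerow([clean_field(f) for f in fields])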
def get_blog_list(response, total_page_response=None, ajax_html=None): """ 获取当页的微博文章 :param ajax_html: :param total_page_response: :param response: :return: """ if response: blog_html = get_dom_html(response, 'Pl_Official_MyProfileFeed_') try: sel = Selector(text=blog_html) except Exception as e: raise e blog_div = sel.xpath( '//div[contains(@class, "WB_feed WB_feed_v")]/div[@mid]') blog_user_id = get_page_conf_info(response, 'oid') else: blog_div = Selector(text=ajax_html).xpath('//body/div[@mid]') blog_user_id = get_page_conf_info(total_page_response, 'oid') for blog_ in blog_div: blog_item = BlogItem() blog_item['user_id'] = blog_user_id blog_item['praise_time'] = str(time.time()) mid = blog_.xpath('./@mid').extract_first() blog_item['mid'] = check_value(mid) # 如果是转发的,获得转发的信息 is_forward = blog_.xpath('./@isforward').extract_first() if is_forward and is_forward == '1': blog_item['is_forward'] = 'True' o_mid = blog_.xpath('./@omid').extract_first() blog_item['o_mid'] = o_mid m_info = blog_.xpath('./@minfo').extract_first() tb_info = blog_.xpath('./@tbinfo').extract_first() o_user_id = '' if m_info: m_info_d = m_info.strip().split('&') for x in m_info_d: info_key_value = x.strip().split('=') if info_key_value[0] == 'ru': o_user_id = info_key_value[-1] blog_item['o_user_id'] = o_user_id elif info_key_value[0] == 'rm': if not o_mid: blog_item['o_mid'] = info_key_value[-1] if tb_info: tb_info_d = tb_info.strip().split('&') for x in tb_info_d: tb_key_value = x.strip().split('=') if tb_key_value[0] == 'ouid': if not blog_user_id: blog_item['user_id'] = tb_key_value[-1] elif tb_key_value[0] == 'rouid': if not o_user_id: blog_item['o_user_id'] = tb_key_value[-1] # 获取转发的文章 forward_item = BlogItem() sub_div = blog_.xpath( './div/div[@class="WB_detail"]/div[@class="WB_feed_expand"]' '/div[@node-type="feed_list_forwardContent"]') is_empty = sub_div.xpath('./div[@class="WB_empty"]') # 转发是否已经被删除 if not is_empty: forward_item['is_forward'] = 'False' sub_info_div = sub_div.xpath( './div[@class="WB_info"]/a[contains(@class, "W_fb")]') sub_user_info = sub_info_div.xpath( './@usercard').extract_first() forward_item['user_id'] = '-1' if sub_user_info: x = sub_user_info.strip().split('&') for y in x: z = y.strip().split('=') if z[0] == 'id': forward_item['user_id'] = z[-1] sub_mid_info = sub_info_div.xpath( './@suda-uatrack').extract_first() forward_item['mid'] = '-1' if sub_mid_info: forward_item['mid'] = sub_mid_info.strip().split(':')[-1] # 转发的微博内容 sub_blog_info = sub_div.xpath( './div[@class="WB_text"]').extract_first() forward_item['praise_time'] = str(time.time()) sub_unflod_url, sub_info_dict = get_blog_content_info( sub_blog_info) forward_item['blog_info'] = sub_info_dict['text_list'] forward_item['at_url_list'] = sub_info_dict['at_url_list'] forward_item['at_list'] = sub_info_dict['at_text_list'] forward_item['topic_list'] = sub_info_dict['topic_list'] forward_item['topic_url_list'] = sub_info_dict[ 'topic_url_list'] forward_item['article_url'] = sub_info_dict['article_url_list'][0] \ if sub_info_dict['article_url_list'] else '' forward_item['picture_url'] = sub_info_dict['img_url_list'] # 获得转发的图片 sub_pic_div = sub_div.xpath( './div[@node-type="feed_list_media_prev"]' '//div[@class="media_box"]/ul//img/@src').extract() forward_item['picture_url'] += sub_pic_div forward_item['picture_url'] = turn_to_big_pic( forward_item['picture_url']) # 时间日期,来自 sub_foot_div = sub_div.xpath( './div[contains(@class, "WB_func")]') sub_from_div = sub_foot_div.xpath( './div[contains(@class, "WB_from")]/a') 
forward_item['date_time'] = check_value( sub_from_div[0].xpath('./@title').extract_first()) forward_item['data_from'] = check_value( sub_from_div[1].xpath('string(.)').extract_first( )) if len(sub_from_div) > 1 else '' forward_item['exact_time'] = check_value( sub_from_div[0].xpath('./@date').extract_first()) # 评论、转发、赞 forward_item['forward_num'] = -1 forward_item['prise_num'] = -1 forward_item['comment_num'] = -1 sub_mid = sub_foot_div.xpath( './div[@class="WB_handle W_fr"]/@mid').extract_first() if 'mid' not in forward_item.fields and not forward_item[ 'mid'].isdigit(): forward_item['mid'] = sub_mid sub_num_div = sub_foot_div.xpath( './div[@class="WB_handle W_fr"]//ul/li') for sub_div in sub_num_div: sub_type = sub_div.xpath( './span/a/span//em[@class]/@class').extract_first() sub_num = sub_div.xpath( './span/a/span//em[not(@class)]/text()').extract_first( ) if sub_type and 'ficon_forward' in sub_type: if '转发' in sub_num: forward_item['forward_num'] = '0' elif sub_num.strip().isdigit(): forward_item['forward_num'] = sub_num.strip() else: print('Parse sub forward_num error!! ' + sub_num) elif sub_type and 'ficon_repeat' in sub_type: if '评论' in sub_num: forward_item['comment_num'] = '0' elif sub_num.strip().isdigit(): forward_item['comment_num'] = sub_num.strip() else: print('Parse sub comment_num error!! ' + sub_num) elif sub_type and 'ficon_praised' in sub_type: if '赞' in sub_num: forward_item['prise_num'] = '0' elif sub_num.strip().isdigit(): forward_item['prise_num'] = sub_num.strip() else: print('Parse sub prise_num error!! ' + sub_num) yield sub_unflod_url, forward_item else: blog_item['is_forward'] = 'Forward delete' else: blog_item['is_forward'] = 'False' # head_img_url = blog_.xpath('./div[@node-type="feed_content"]/' # 'div[contains(@class, "WB_face")]//img/@src').extract_first() # blog_item['head_img_url'] = check_value(head_img_url) blog_info = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]/div[contains(@class, "WB_text")]' ).extract_first() # 获得日期时间和来源 date_div = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]/div[contains(@class, "WB_from")]/a') blog_item['date_time'] = check_value( date_div[0].xpath('./@title').extract_first()) blog_item['data_from'] = check_value(date_div[1].xpath('string(.)').extract_first()) \ if len(date_div) > 1 else '' blog_item['exact_time'] = check_value( date_div[0].xpath('./@date').extract_first()) unflod_url, info_dict = get_blog_content_info(blog_info) blog_item['blog_info'] = info_dict['text_list'] blog_item['at_url_list'] = info_dict['at_url_list'] blog_item['at_list'] = info_dict['at_text_list'] blog_item['topic_list'] = info_dict['topic_list'] blog_item['topic_url_list'] = info_dict['topic_url_list'] blog_item['article_url'] = info_dict['article_url_list'][ 0] if info_dict['article_url_list'] else '' blog_item['picture_url'] = info_dict['img_url_list'] media_div = blog_.xpath( './div[@node-type="feed_content"]/' 'div[@class="WB_detail"]//div[@class="media_box"]//img/@src' ).extract() if 'picture_url' in blog_item.fields: blog_item['picture_url'] += media_div else: blog_item['picture_url'] = media_div if blog_item['picture_url']: if response: blog_item['picture_url'] = urljoin_list( response, blog_item['picture_url']) else: blog_item['picture_url'] = urljoin_list( total_page_response, blog_item['picture_url']) blog_item['picture_url'] = turn_to_big_pic(blog_item['picture_url']) # 获取点赞、评论、转发数 blog_item['forward_num'] = -1 blog_item['prise_num'] = -1 blog_item['comment_num'] = -1 foot_div = 
blog_.xpath('./div//ul[contains(@class, "WB_row_line")]/li') for sub_div in foot_div: sub_type = sub_div.xpath( './a/span[@class="pos"]//em[@class]/@class').extract_first() sub_num = sub_div.xpath( './a/span[@class="pos"]//em[not(@class)]/text()' ).extract_first() if sub_type and 'ficon_forward' in sub_type: if '转发' in sub_num: blog_item['forward_num'] = '0' elif sub_num.strip().isdigit(): blog_item['forward_num'] = sub_num.strip() else: print('Paese forward_num error!! ' + sub_num) elif sub_type and 'ficon_repeat' in sub_type: if '评论' in sub_num: blog_item['comment_num'] = '0' elif sub_num.strip().isdigit(): blog_item['comment_num'] = sub_num.strip() else: print('Paese comment_num error!! ' + sub_num) elif sub_type and 'ficon_praised' in sub_type: if '赞' in sub_num: blog_item['prise_num'] = '0' elif sub_num.strip().isdigit(): blog_item['prise_num'] = sub_num.strip() else: print('Paese prise_num error!! ' + sub_num) yield unflod_url, blog_item
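# get_blog_list yields (unfold_url, item) pairs; unfold_url is set when a post is
# truncated and the full text must still be fetched from the "展开全文" ajax endpoint.
# A minimal consumption sketch for a spider callback is shown below; follow_blogs is
# a hypothetical helper name, not part of the original code.
import scrapy


def follow_blogs(response):
    for unfold_url, item in get_blog_list(response):
        if unfold_url:
            # Truncated post: request the full text first; a real callback would merge
            # the ajax result into the item carried in meta before yielding it.
            yield scrapy.Request(unfold_url, meta={'item': item})
        else:
            yield item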
            '', 'AgreeCount': AgreeCount, 'DisagreeCount': DisagreeCount,
            'AskCount': '', 'ParticipateCount': '', 'CollectionCount': '',
            'Classification': sele.xpath('//div[@class="crumb"]/a/text()').extract()[1],
            'Labels': sele.xpath('//meta[@name="keywords"]/@content').extract_first(),
            'Type': '', 'RewardCount': ''
        })
        yield item


if __name__ == '__main__':
    response = requests.get('http://www.cankaoxiaoxi.com/mil/20180814/2310386_2.shtml')
    soup = Selector(text=response.text)
    body = soup.xpath('//div[@id="ctrlfscont"]//p/text()').extract()
    print(body)
def get_blog_content_info(blog_content_html, is_unflod=False): sel = Selector(text=blog_content_html) # 微博文字信息 if is_unflod: blog_text_div = sel.xpath('//body/child::node()').extract() else: blog_text_div = sel.xpath( '//div[contains(@class, "WB_text")]/child::node()').extract() text_list = [] at_url_list = [] at_text_list = [] topic_list = [] topic_url_list = [] article_url_list = [] img_url_list = [] unfold_url = None for child_div in blog_text_div: content_sel = Selector(text=child_div) a_sel = content_sel.xpath('//a') img_sel = content_sel.xpath('//img') if a_sel: a_type = a_sel.xpath( './i/@class | ./span/i/@class').extract_first() # 转发的时候带着图片 if a_type and 'ficon_cd_img' in a_type: action_data = a_sel.xpath('./@action-data').extract_first() uid = '' mid = '' pid = '' short_url = '' if action_data: for x in action_data.split('&'): x_key = x.strip().split('=')[0] x_value = x.strip().split('=')[1] if x_key == 'uid': uid = x_value elif x_key == 'mid': mid = x_value elif x_key == 'pid': pid = x_value elif x_key == 'short_url': short_url = x_value if short_url: img_url_list.append(short_url) elif uid and mid and pid: img_url = 'http://photo.weibo.com/' \ + uid \ + '/wbphotos/large/mid/' \ + mid \ + '/pid/' \ + pid img_url_list.append(img_url) else: print('No img url' + str(a_sel.extract())) text_list.append( check_value(a_sel.xpath('string(.)').extract_first())) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=atname': at_text = check_value(a_sel.xpath('string(.)').extract_first()) at_text_list.append(at_text) text_list.append(at_text) at_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=topic': topic_text = check_value( a_sel.xpath('string(.)').extract_first()) text_list.append(topic_text) topic_list.append(topic_text) topic_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@action-type') and a_sel.xpath( './@action-type').extract_first() == 'fl_unfold': # 获取展开全文的URL,这个URL只用于获取内容的ajax请求 fl_action_data = a_sel.xpath('./@action-data').extract_first() unfold_url = 'http://weibo.com/p/aj/mblog/getlongtext?ajwvr=6&' + fl_action_data elif content_sel.xpath('//a/img'): img_type = content_sel.xpath('//a/img/@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value( img_sel.xpath('string(.)').extract_first())) elif a_type: a_href = check_value(a_sel.xpath('./@href').extract_first()) if 'ficon_cd_longwb' in a_type: article_url_list.append(a_href) part_text = gen_a_text(a_type, a_href) text_list.append( check_value(a_sel.xpath('string(.)').extract_first()) + part_text) else: print('blogs has more type!! 
' + str(a_sel.extract()) + ' \n' + blog_content_html) elif img_sel: img_type = img_sel.xpath('./@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value(img_sel.xpath('string(.)').extract_first())) else: text_list.append( check_value(content_sel.xpath('string(.)').extract_first())) return unfold_url, { 'text_list': ''.join(text_list), 'at_url_list': at_url_list, 'at_text_list': at_text_list, 'topic_list': topic_list, 'topic_url_list': topic_url_list, 'article_url_list': article_url_list, 'img_url_list': img_url_list }
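# A small usage sketch for get_blog_content_info: pass it one WB_text fragment and it
# returns the "展开全文" ajax URL (None if absent) plus the parsed text/at/topic/image
# lists. The sample HTML is illustrative only, and the helpers used inside
# (check_value, gen_emjo_text, gen_a_text) must be importable as in the original module.
if __name__ == '__main__':
    sample = ('<div class="WB_text">今天天气不错 '
              '<a extra-data="type=topic" href="/k/weather">#天气#</a> '
              '<a extra-data="type=atname" href="/u/123">@somebody</a></div>')
    unfold_url, info = get_blog_content_info(sample)
    print(unfold_url)                      # None: the sample has no "展开全文" link
    print(info['text_list'])               # concatenated plain text
    print(info['at_text_list'], info['topic_list'])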
def selector(self):
    # Cache the parsed Selector on the instance so repeated calls do not re-parse
    # the numbered HTML.
    if hasattr(self, '_selector'):
        return self._selector
    self._selector = Selector(text=self.numbered_html)
    return self._selector
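# The hand-rolled attribute caching above is what functools.cached_property (Python
# 3.8+) provides. A minimal equivalent sketch, assuming numbered_html is set on the
# instance; NumberedPage is a hypothetical class name used only for illustration.
from functools import cached_property

from scrapy.selector import Selector


class NumberedPage:
    def __init__(self, numbered_html):
        self.numbered_html = numbered_html

    @cached_property
    def selector(self):
        # Parsed once on first access, then stored on the instance like _selector above.
        return Selector(text=self.numbered_html)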
def _get_comment_info(response, blog_id, html_=None, parent_comment_id='0'): comment_div = None if parent_comment_id == '0': json_data = response.text try: json_obj = json.loads(json_data) html_ = json_obj['data']['html'] sel = Selector(text=html_) comment_div = sel.xpath( '//div[@class="list_box"]/div[@class="list_ul"]/div[@node-type="root_comment"]' ) except: print('Parse comment json error!! ') elif html_: sel = Selector(text=html_) comment_div = sel.xpath('//div[@comment_id]') else: raise ValueError('None param html_') if comment_div: for comment_info in comment_div: # 获得根comment的信息 root_comment = CommentItem() root_comment['parent_comment_id'] = str(parent_comment_id) root_comment['blog_id'] = blog_id root_comment['parse_time'] = str(time.time()) comment_id = comment_info.xpath('./@comment_id').extract_first() root_comment['comment_id'] = check_value(comment_id) user_info_div = comment_info.xpath( './div[@class="list_con"]/div[@class="WB_text"]/a[1]') nick_name = user_info_div.xpath('string(.)').extract_first() root_comment['comment_user_nick'] = check_value(nick_name) user_id = user_info_div.xpath('./@usercard').extract_first() id_str = check_value(user_id).split('=') root_comment['comment_user_id'] = id_str[1] if len( id_str) > 1 else '' user_url = user_info_div.xpath('./@href').extract_first() root_comment['comment_user_page'] = response.urljoin(user_url) date_time_div = comment_info.xpath( './div[@class="list_con"]/div[contains(@class, "WB_func")]') date_time = date_time_div.xpath( './div[contains(@class, "WB_from")]').xpath( 'string(.)').extract_first() root_comment['comment_date_time'] = check_value(date_time) praise_num = date_time_div.xpath( './div[contains(@class, "WB_handle")]' '/ul//span[@node-type="like_status"]/em[not(@class)]/text()' ).extract_first() if isinstance(praise_num, int) or praise_num.isdigit(): root_comment['praise_num'] = str(praise_num) elif '赞' in praise_num: root_comment['praise_num'] = '0' else: root_comment['praise_num'] = '-1' comment_content_div = comment_info.xpath( './div[@class="list_con"]/div[@class="WB_text"]' ).extract_first() info_dic = get_comment_content(comment_content_div) root_comment['content'] = info_dic['text_list'] root_comment['at_url_list'] = info_dic['at_url_list'] root_comment['at_name_list'] = info_dic['at_text_list'] root_comment['topic_url_list'] = info_dic['topic_url_list'] root_comment['topic_text_list'] = info_dic['topic_url_list'] root_comment['img_url_list'] = info_dic['img_url_list'] # 获得更多回复的链接 more_replay = None if parent_comment_id == '0': more_replay = comment_info.xpath('./div[@node-type="replywrap"]' '//a[@action-type="click_more_child_comment_big"]/@action-data') \ .extract_first() # 是否有子评论 child_div = comment_info.xpath( './div[@class="list_con"]/div[contains(@class, "list_box_in")]' '/div[@node-type="child_comment"]').extract_first() root_comment['child_comment_ids'] = [] if child_div and parent_comment_id == '0': # and not more_replay: for child_comment in get_child_comment(response, blog_id, child_div, comment_id): root_comment['child_comment_ids'].append( child_comment['comment_id']) yield None, child_comment yield more_replay, root_comment
def parse_pages(self, response): """ 对搜索页中的每条微博信息进行抽取, 如果微博内容中有显示完全有“展开全文”按钮则继续返回一个微博全文的请求, 否则返回item :param response: :return: """ page_json = json.loads(response.body.decode('utf-8')) card_group = [] try: card_group = page_json['data']['cards'][-1]['card_group'] except IndexError as e: if response.meta.get('retry', 0) == 10: logger.error( '账号:[%s],第 %s 页解析微博列表json出错,已重试10次,放弃重试!错误原因:%s,返回信息:%s', response.meta['account'], response.meta['index'], e, page_json) return None else: logger.warning( '账号:[%s],第 %s 页解析微博列表json出错,将重试第 %s 次,错误原因:%s,返回信息:%s', response.meta['account'], response.meta['index'], response.meta.get('retry', 0) + 1, e, page_json) yield scrapy.Request(url=response.url, callback=self.parse_pages, dont_filter=True, meta={ 'index': response.meta['index'], 'retry': response.meta.get('retry', 0) + 1 }) return None for i in card_group: item = WeiboItem() item['weibo_mid'] = int(i['mblog']['mid']) item['user_nick_name'] = i['mblog']['user']['screen_name'] item['user_home_url'] = i['mblog']['user']['profile_url'].split( '?')[0] text = i['mblog']['text'] text_s = Selector(text=text, type='html') item['content'] = text_s.xpath( 'normalize-space(string(.))').extract_first('') item['time'] = self.format_date(i['mblog']['created_at']) item['forwarded_count'] = i['mblog']['reposts_count'] item['comment_count'] = i['mblog']['comments_count'] item['like_count'] = i['mblog']['attitudes_count'] item['weibo_url'] = i['scheme'].split('?')[0] if len(text_s.xpath( '//a[text()="全文"]')) != 0: # 有展开全文按钮,构造全文请求,获取全文 yield scrapy.Request(url=WeiboSpider.FULL_CONTENT_URL % item['weibo_mid'], callback=self.parse_full_content, dont_filter=True, meta={'item': item}) else: yield item # 构造返回该微博评论request comment_url = WeiboSpider.COMMENT_URL % (item['weibo_mid'], 1) yield scrapy.Request( url=comment_url, callback=self.parse_comment, dont_filter=True, meta={ 'mid': item['weibo_mid'], # 微博mid 'index': 1, 'count': 0 }) logger.info('成功获取第%s页的微博信息', response.meta['index'])
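# The "has full text" check in parse_pages reduces to: parse the mblog text fragment
# and look for an <a> whose text is "全文". A standalone sketch of just that step
# (needs_full_content is a hypothetical name):
from scrapy.selector import Selector


def needs_full_content(text_html):
    sel = Selector(text=text_html, type='html')
    return len(sel.xpath('//a[text()="全文"]')) != 0


# needs_full_content('截断的内容...<a>全文</a>')  -> True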
def get_comment_content(comment_div): sel = Selector(text=comment_div) # 微博文字信息 blog_text_div = sel.xpath('//body/div/child::node()').extract() text_list = [] at_url_list = [] at_text_list = [] topic_list = [] topic_url_list = [] img_url_list = [] for child_div in blog_text_div: content_sel = Selector(text=child_div) a_sel = content_sel.xpath('//a') img_sel = content_sel.xpath('//img') if a_sel: a_type = a_sel.xpath( './i/@class | ./span/i/@class').extract_first() # 转发的时候带着图片 if a_type and 'ficon_cd_img' in a_type: action_data = a_sel.xpath('./@action-data').extract_first() uid = '' mid = '' pid = '' short_url = '' if action_data: for x in action_data.split('&'): x_key = x.strip().split('=')[0] x_value = x.strip().split('=')[1] if x_key == 'uid': uid = x_value elif x_key == 'mid': mid = x_value elif x_key == 'pid': pid = x_value elif x_key == 'short_url': short_url = x_value if short_url: img_url_list.append(short_url) elif uid and mid and pid: img_url = 'http://photo.weibo.com/' \ + uid \ + '/wbphotos/large/mid/' \ + mid \ + '/pid/' \ + pid img_url_list.append(img_url) else: print('No img url' + str(a_sel.extract())) text_list.append( check_value(a_sel.xpath('string(.)').extract_first())) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=atname': at_text = check_value(a_sel.xpath('string(.)').extract_first()) at_text_list.append(at_text) text_list.append(at_text) at_url_list.append(a_sel.xpath('./@href').extract_first()) elif a_sel.xpath('./@extra-data') and a_sel.xpath( './@extra-data').extract_first() == 'type=topic': topic_text = check_value( a_sel.xpath('string(.)').extract_first()) text_list.append(topic_text) topic_list.append(topic_text) topic_url_list.append(a_sel.xpath('./@href').extract_first()) elif content_sel.xpath('//a/img'): img_type = content_sel.xpath('//a/img/@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value( img_sel.xpath('string(.)').extract_first())) # else: # print('blogs has more type!! ' + str(a_sel.extract()) + ' \n' + comment_div) elif img_sel: img_type = img_sel.xpath('./@type').extract_first() # 如果是表情 if img_type and img_type == 'face': title = img_sel.xpath('./@title').extract_first() src = img_sel.xpath('./@src').extract_first() text = gen_emjo_text(title, src) text_list.append(text) else: text_list.append( check_value(img_sel.xpath('string(.)').extract_first())) else: text_list.append( check_value(content_sel.xpath('string(.)').extract_first())) return { 'text_list': ''.join(text_list), 'at_url_list': at_url_list, 'at_text_list': at_text_list, 'topic_list': topic_list, 'topic_url_list': topic_url_list, 'img_url_list': img_url_list }
def parse_detail(self, response): # TODO 主动关闭爬虫问题 try: data = Selector(text=response.body.decode('utf-8')) items = str(data.xpath('string(.)').extract()[0]).replace('\xa0', '').replace('\u3000', '') WJBT_27 = '' SJ_28 = '' LY_29 = '' WJBT_30 = '' ZDBH_31 = '' BH_32 = '' DKWZ_33 = '' TDWZ_34 = '' TDMJM_35 = '' TDMJPFM_36 = '' TDYT_37 = '' CJJ_38 = '' JDR_39 = '' GSQ_40 = '' LXDW_41 = '' DWDZ_42 = '' YZBM_43 = '' LXDH_44 = '' # TODO 共有字段 reFunction(f'时间:\s*([{self.reStr}]*)\s', LY) # 文件标题 WJBT_27 = response.meta.get('title') # 时间 SJ_28 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[1]/text()').extract_first() # 来源 LY_29 = data.xpath('//div[@class="ztzx_frame_subtitle_l"]/span[2]/text()').extract_first() # 文件编号 WJBT_30 = data.xpath('//div[@class="ztzx_frame_content"]/div[1]/text()').extract_first() # 公示期 GSQ_40 = reFunction(f'公示期:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)。', items) # 联系单位 LXDW_41 = reFunction('联系单位:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 单位地址 DWDZ_42 = reFunction('单位地址:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 邮政编码 YZBM_43 = reFunction('邮政编码:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 联系电话 LXDH_44 = reFunction('联系电话:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 爬取时间 crawlingTime = time.strftime("%Y-%m-%d", time.localtime()) # 爬取地址url url = response.url # 唯一标识 md5Mark = encrypt_md5(url + WJBT_27 + SJ_28) soup = BeautifulSoup(response.body.decode('utf-8').replace('thead', 'tbody')) table = soup.find('table') htmlTable = htmlTableTransformer() if table: if '竣工时间' in items: try: tdData = htmlTable.tableTrTdUNregulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_31 = tdData.get('地块编号')[_] if tdData.get('地块编号') else '' # 地块位置 DKWZ_33 = tdData.get('位置')[_] if tdData.get('位置') else '' # 土地位置 TDWZ_34 = tdData.get('位置')[_] if tdData.get('位置') else '' # 土地面积(亩) TDMJM_35 = tdData.get('出让面积平方米/亩')[_] if tdData.get('出让面积平方米/亩') else '' # 土地面积(平方米) TDMJPFM_36 = tdData.get(list(tdData.keys())[7])[_] if tdData.get(list(tdData.keys())[7]) else '' # 土地用途 TDYT_37 = tdData.get('用途')[_] if tdData.get('用途') else '' # 成交价(万元) CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else '' # 竞得人 JDR_39 = tdData.get('受让人')[_] if tdData.get('受让人') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield except: for tdData in table.find_all('tr')[2:]: # 宗地编号 ZDBH_31 = tdData.find_all('td')[4].string.strip() # 地块位置 DKWZ_33 = tdData.find_all('td')[5].string.strip() # 土地位置 TDWZ_34 = tdData.find_all('td')[5].string.strip() # 土地面积(亩) TDMJM_35 = tdData.find_all('td')[6].string.strip() # 土地面积(平方米) TDMJPFM_36 = tdData.find_all('td')[7].string.strip() # 土地用途 TDYT_37 = tdData.find_all('td')[8].string.strip() # 
成交价(万元) CJJ_38 = tdData.find_all('td')[9].string.strip() # 竞得人 JDR_39 = tdData.find_all('td')[3].string.strip() # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log( f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield elif '转让方' not in items: if len(table.find_all('tr')[1].find_all('td')) < 5: table.find_all('tr')[1].extract() table.find_all('tr')[0].find_all('td')[-1].extract() tdData = htmlTable.tableTrTdRegulationToList(table) for _ in range(len(list(tdData.values())[0])): # 宗地编号 ZDBH_31 = tdData.get('宗地编号')[_] if tdData.get('宗地编号') else '' # 编号 BH_32 = tdData.get('编号')[_] if tdData.get('编号') else '' # 地块位置 DKWZ_33 = tdData.get('地块位置')[_] if tdData.get('地块位置') else '' # 土地位置 TDWZ_34 = tdData.get('土地位置')[_] if tdData.get('土地位置') else '' # 土地面积(亩) TDMJM_35 = tdData.get('土地面积(亩)')[_] if tdData.get('土地面积(亩)') else '' # 土地面积(平方米) TDMJPFM_36 = tdData.get('土地面积(平方米)')[_] if tdData.get('土地面积(平方米)') else '' # 土地用途 TDYT_37 = tdData.get('土地用途')[_] if tdData.get('土地用途') else '' # 成交价(万元) CJJ_38 = tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else tdData.get('成交价(万元)')[_] if tdData.get('成交价(万元)') else '' # 竞得人 JDR_39 = tdData.get('竞得人')[_] if tdData.get('竞得人') else '' # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace(r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield elif '地块基本情况' in items: # 宗地编号 ZDBH_31 = reFunction('宗地编号\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 地块位置 DKWZ_33 = reFunction('地块位置\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地面积(亩) TDMJM_35 = reFunction('土地面积\(公顷\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_37 = reFunction('土地用途\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 成交价(万元) CJJ_38 = reFunction('成交价\(万元\)\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 竞得人 JDR_39 = reFunction('受让单位\s*([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, 
TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) elif '转让方' in items: # 编号 BH_32 = reFunction('不动产权登记证号:([()【】\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 地块位置 DKWZ_33 = reFunction('宗地位置:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地面积(平方米) TDMJPFM_36 = reFunction('面\s*积:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 土地用途 TDYT_37 = reFunction('土地用途:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 成交价(万元) # CJJ_38 # 竞得人 JDR_39 = reFunction('受让方:([()\w\.:: —\(\)〔〕㎡㎡≤≥《》\-\/\%,;,、\.﹪]*)\s', items) # 写入数据 if self.name in DUPLICATE_SWITCH_LIST: if self.redisClient.isExist(md5Mark): # 存在, 去重计数 self.duplicateUrl += 1 if self.duplicateUrl < 50: if TDYT_37: # 重复效验通过, 存储数据 csvFile = [ WJBT_27, SJ_28, LY_29, WJBT_30, ZDBH_31, BH_32, DKWZ_33, TDWZ_34, TDMJM_35, TDMJPFM_36, TDYT_37, CJJ_38, JDR_39, GSQ_40, LXDW_41, DWDZ_42, YZBM_43, LXDH_44, crawlingTime, url, md5Mark, ] results = '' for _ in csvFile: try: if _ and _ != '|' * len(_): results += _.replace(',', ' ').replace('\n', '').replace('\t', '').replace('\r', '').replace( r'\xa0', '').replace('\xa0', '') + ',' else: results += ',' except Exception as e: results += ',' self.log(f'{getVariableName(_).pop()}字段解析出错, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR) with open(self.pathDetail, 'a+') as fp: fp.write(results) fp.write('\n') self.log(f'数据获取成功', level=logging.INFO) yield else: self.crawler.engine.close_spider(self, 'response msg info %s, job duplicated!' % response.url) except Exception as e: print(response.url) self.log(f'详情页数据解析失败, 请求:{response.url}, 错误: {e}\n{traceback.format_exc()}', level=logging.ERROR)
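# md5Mark above is produced by encrypt_md5(url + WJBT_27 + SJ_28), a helper defined
# elsewhere in the project. A plain-hashlib sketch of such a de-duplication
# fingerprint (fingerprint is a hypothetical name, assumed equivalent only):
import hashlib


def fingerprint(*parts):
    return hashlib.md5(''.join(p or '' for p in parts).encode('utf-8')).hexdigest()


# e.g. fingerprint(url, WJBT_27, SJ_28) -> 32-character hex string checked in Redis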
def get_element(path, tree):
    # Parse the raw HTML once, then evaluate the XPath expression against it.
    sel = Selector(text=tree)
    return sel.xpath(path).extract()
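# A quick usage sketch for get_element: the XPath expression comes first, the raw HTML
# string second (the sample HTML is illustrative only).
# get_element('//title/text()', '<html><head><title>demo</title></head></html>')
# -> ['demo']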
def port_sample(sample, schemas=None, extractors=None): """Convert slybot samples made before slybot 0.13 to new format.""" if schemas is None: schemas = {} if extractors is None: extractors = {} container_id = gen_predictable_id(sample.get('id', 1), sample['page_id']) default_annotations = [_create_container('body', container_id)] if not sample.get('annotated_body') and not sample.get('plugins'): sample['plugins'] = { 'annotations-plugin': { 'extracts': default_annotations } } return sample if not sample.get('plugins'): sample['plugins'] = load_annotations(sample.get('annotated_body', u'')) else: repair_ids(sample) sample.pop('annotated_body', None) # Group annotations by type annotations = sample['plugins']['annotations-plugin']['extracts'] try: sel = Selector(text=add_tagids(sample['original_body'])) except KeyError: annotated = sample.get('annotated_body', u'') sample['original_body'] = annotated try: tagged = add_tagids(annotated) except KeyError: tagged = u'' sel = Selector(text=tagged) annotations = port_standard(annotations, sel, sample) standard_annos, generated_annos, variant_annos = [], [], [] for a in annotations: if a.get('generated'): generated_annos.append(a) elif a.get('variants', 0) > 0: variant_annos.append(a) else: standard_annos.append(a) if not annotations: sample['plugins'] = { 'annotations-plugin': { 'extracts': default_annotations } } return sample new_annotations = [] a = find_element(annotations[0], sel) for b in annotations[1:]: b = find_element(b, sel) a = find_common_parent(a, b) parent = a.getparent() container = _create_container( a if parent is None else parent, container_id, selector=sel) new_annotations.append(container) for a in standard_annos: a.pop('variant', None) new_annotations.extend(standard_annos) new_annotations.extend(port_generated(generated_annos, sel)) new_annotations.extend(port_variants(variant_annos, sel)) for a in new_annotations: if not (a.get('item_container') and a.get('container_id')): a['container_id'] = container_id a.pop('tagid', None) or a.pop('data-tagid', None) # Update annotations sample['plugins']['annotations-plugin']['extracts'] = new_annotations sample['version'] = SLYBOT_VERSION schema_id, schemas = guess_schema(sample, schemas) container['schema_id'] = schema_id return sample, schemas
def parseArticle(self, response): body = EncodeUtil.toUnicode(response.body) if False: self.logDao.info(u'访问过多被禁止') else: src_channel = response.meta['src_channel'] sub_channel = response.meta['sub_channel'] title = response.meta['title'] post_user = response.meta['post_user'] tags = response.meta['tags'] post_date = response.meta['post_date'] source_url = response.meta['source_url'] self.logDao.info(u'开始解析文章:' + title + ':' + post_date + ':' + source_url) selector = Selector(text=body) # 得到样式 styleUrls = selector.xpath( '//link[@rel="stylesheet"]/@href').extract() styleList = [] for styleUrl in styleUrls: if styleUrl.startswith('//'): styleUrl = 'http:' + styleUrl # 得到hash作为key styleUrlHash = EncryptUtil.md5(styleUrl) if not self.css.get(styleUrlHash): # 不存在则去下载 并保存 self.css[styleUrlHash] = CssUtil.downLoad(styleUrl) styleList.append(self.css[styleUrlHash]) styles = CssUtil.compressCss(styleList).replace('\'', '"').replace( '\\', '\\\\') # 替换样式里面的链接 styles = CssUtil.clearUrl(styles) content_html = selector.xpath('//*[@class="article"]') backHtml = selector.xpath('//*[@id="backsohucom"]').extract_first( '') if not len(content_html): self.logDao.info(u'不存在内容:' + source_url) return # 去除内部不需要的标签u'<p data-role="editor-name">责任编辑:<span></span></p>' # 完整案例:content_html.xpath('*[not(boolean(@class="entHdPic" or @class="ep-source cDGray")) and not(name(.)="script")]').extract() content_items = content_html.xpath( '*[not(name(.)="script") and not(name(.)="style") and not(name(.)="iframe") and not(boolean(@data-role="editor-name"))]' ) if not len(content_items): self.logDao.info(u'不存在内容:' + source_url) return # 得到纯文本 content_txt = [] for item in content_items: # 文本 allTxt = item.xpath('.//text()').extract() allTxt = ''.join(allTxt).replace('\t', '') # 加入 content_txt.append(allTxt) content_txt = '\n'.join(content_txt) # 组装新的内容标签 outHtml = """<div class="article-page"><article class="article">${++content++}</article></div>""" content_items = content_items.extract() content_items = ''.join(content_items) content_html = outHtml.replace('${++content++}', content_items) content_html = content_html.replace(backHtml, '') selector = Selector(text=content_html) # 解析文档中的所有图片url,然后替换成标识 image_urls = [] imgs = selector.xpath('descendant::img') for img in imgs: # 图片可能放在src 或者data-src image_url_base = img.xpath('@src').extract_first('') if image_url_base.startswith('//'): image_url = 'http:' + image_url_base else: image_url = image_url_base if image_url and image_url.startswith('http'): self.logDao.info(u'得到图片:' + image_url) image_urls.append({ 'url': image_url, }) content_html = content_html.replace( image_url_base, image_url) urlHash = EncryptUtil.md5(source_url.encode('utf8')) self.saveFile(urlHash, body) # 得到hashCode hash_code = self.checkDao.getHashCode(source_url) # 去除 image 的 alt title selector = Selector(text=content_html) imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract() # 处理提示块img的 alt title, 关注//img/@alt|//img/@title for imgAltTitle in imgAltTitles: if imgAltTitle.strip(' '): content_html = content_html.replace(imgAltTitle, '') contentItem = ContentItem() contentItem['content_txt'] = content_txt contentItem['image_urls'] = image_urls contentItem['title'] = title contentItem['source_url'] = source_url contentItem['post_date'] = post_date contentItem['sub_channel'] = sub_channel contentItem['post_user'] = post_user contentItem['tags'] = ','.join(tags) contentItem['styles'] = styles contentItem['content_html'] = content_html contentItem['hash_code'] = hash_code contentItem['info_type'] = 1 
            contentItem['src_source_id'] = 7
            # contentItem['src_account_id'] = 0
            contentItem['src_channel'] = src_channel
            contentItem['src_ref'] = '搜狐科技'
            return contentItem
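# parseArticle prefixes protocol-relative stylesheet and image URLs ("//host/a.css")
# with "http:". A small helper sketch for that recurring step; normalize_url is a
# hypothetical name, not part of the original spider.
def normalize_url(url, scheme='http'):
    if url.startswith('//'):
        return scheme + ':' + url
    return url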
def parse_neighborhood_info(self, response): basic_info = response.text city_name = response.meta['city_name'] block_name = '>'.join(Selector(text=basic_info).xpath('//div[@class="xiaoquDetailbreadCrumbs"]/div[@class="fl l-txt"]/a/text()').extract()) neighborhood_name='' if len(Selector(text=basic_info).xpath('//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/h1[@class="detailTitle"]/text()').extract())>0: neighborhood_name = Selector(text=basic_info).xpath('//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/h1[@class="detailTitle"]/text()').extract()[0] neighborhood_addr = Selector(text=basic_info).xpath( '//div[@class="xiaoquDetailHeader"]/div[@class="xiaoquDetailHeaderContent clear"]/div[@class="detailHeader fl"]/div[@class="detailDesc"]/text()').extract()[0] neighborhood_price = '' if len(Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquPrice clear"]//span[@class="xiaoquUnitPrice"]/text()').extract())>0: neighborhood_price = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquPrice clear"]//span[@class="xiaoquUnitPrice"]/text()').extract()[0] neighborhood_year = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][1]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_type = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][2]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_estate = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][3]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_property = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][4]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_company = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][5]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_builds = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][6]/span[@class="xiaoquInfoContent"]/text()').extract()[0] neighborhood_houses = Selector(text=basic_info).xpath( '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][7]/span[@class="xiaoquInfoContent"]/text()').extract()[0] item = LianjiaLoaderItem(item=LianjiaResultItem(), response=response) item.add_value('batch_date', self.batch_date) item.add_value('city_name', city_name) item.add_value('block_name', block_name) item.add_value('neighborhood_name', neighborhood_name) item.add_value('neighborhood_addr', neighborhood_addr) item.add_value('neighborhood_price', neighborhood_price) item.add_value('neighborhood_year', neighborhood_year) item.add_value('neighborhood_type', neighborhood_type) item.add_value('neighborhood_estate', neighborhood_estate) item.add_value('neighborhood_property', neighborhood_property) 
        item.add_value('neighborhood_company', neighborhood_company)
        item.add_value('neighborhood_builds', neighborhood_builds)
        item.add_value('neighborhood_houses', neighborhood_houses)
        item.add_value('table_name', 'spider.lianjia_result')
        yield item.load_item()
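# parse_neighborhood_info repeats one long XPath where only the xiaoquInfoItem index
# changes. A hypothetical helper that factors out the repetition (xiaoqu_info is an
# assumed name; the XPath is copied from the handler above):
def xiaoqu_info(sel, index):
    return sel.xpath(
        '//div[@class="xiaoquOverview"]/div[@class="xiaoquDescribe fr"]'
        '/div[@class="xiaoquInfo"]/div[@class="xiaoquInfoItem"][%d]'
        '/span[@class="xiaoquInfoContent"]/text()' % index
    ).extract_first('')


# e.g. neighborhood_year = xiaoqu_info(Selector(text=basic_info), 1)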
def spider(url, place): text = requests.get(url).content.decode("utf-8") sel = Selector(text=text) date_all = sel.xpath('//div[@id="forecast"]/div[@class="detail"]') day_id = 0 for date_item in date_all: if day_id <= 2: date_i = "".join( date_item.xpath( './/div[@class="today"]/table/tbody/tr[1]/td[2]/text()'). extract()).strip() else: date_i = "".join( date_item.xpath( './/div[@class="today"]/table/tbody/tr[1]/td[1]/text()'). extract()).strip() time_all = sel.xpath('//div[@id="hour3"]/div') time_item = time_all[day_id] for item in range(0, 8): time_i = "".join( (time_item.xpath('.//div[@class="row first"]/div/text()') )[item + 1].extract()).strip() temperature = "".join( (time_item.xpath('.//div[@class="row wd"]/div/text()') )[item + 1].extract()).strip() humidity = "".join( (time_item.xpath('.//div[@class="row xdsd"]/div/text()') )[item + 1].extract()).strip() if place == 'shenzhen': existed_data = Temperature.select().where(( Temperature.date == date_i) & (Temperature.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature() elif place == 'guangzhou': existed_data = Temperature2.select().where( (Temperature2.date == date_i) & (Temperature2.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature2() elif place == 'foshan': existed_data = Temperature3.select().where( (Temperature3.date == date_i) & (Temperature3.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature3() elif place == 'dongguan': existed_data = Temperature4.select().where( (Temperature4.date == date_i) & (Temperature4.time == time_i)) if existed_data: temperature_data = existed_data[0] else: temperature_data = Temperature4() temperature_data.date = date_i temperature_data.time = time_i temperature_data.temperature = temperature temperature_data.humidity = humidity temperature_data.save() day_id += 1
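# The four peewee branches in spider() differ only in the model class. A minimal
# refactoring sketch, assuming Temperature..Temperature4 are the same peewee models
# used above and share the date/time fields (PLACE_MODELS and get_or_create_record
# are hypothetical names):
PLACE_MODELS = {
    'shenzhen': Temperature,
    'guangzhou': Temperature2,
    'foshan': Temperature3,
    'dongguan': Temperature4,
}


def get_or_create_record(place, date_i, time_i):
    model = PLACE_MODELS[place]  # raises KeyError instead of silently skipping a place
    existed = model.select().where((model.date == date_i) & (model.time == time_i))
    return existed[0] if existed else model()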
def title_parse(self, response): selector = Selector(response) itemList = selector.xpath( "//div[@id = 'subcontent']/dl[contains(@class,'list_dl') and not(contains(@class,'bluebg'))]" ) for item in itemList: try: autohomeforumItem = AutohomeforumItem() autohomeforumItem['carId'] = re.findall( '-(\d+)-', response.url)[0] autohomeforumItem['iconName'] = item.xpath( "./dt/span/@class").extract_first() autohomeforumItem['title'] = item.xpath( "./dt/a[1]/text()").extract_first() autohomeforumItem['author'] = item.xpath( "./dd[1]/a/text()").extract_first() autohomeforumItem['authorId'] = item.xpath( "./dd[1]/a/@href").extract_first().split('/')[-1] autohomeforumItem['publishTime'] = item.xpath( "./dd[1]/span/text()").extract_first() autohomeforumItem['replyNum'] = item.xpath( "./dd[2]/span[1]/text()").extract_first() autohomeforumItem['clickNum'] = item.xpath( "./dd[2]/span[2]/text()").extract_first() autohomeforumItem['lastReplyer'] = item.xpath( "./dd[3]/a/text()").extract_first() autohomeforumItem['lastReplyTime'] = item.xpath( "./dd[3]/span/text()").extract_first() detialUrl = item.xpath("./dt/a[1]/@href").extract_first() autohomeforumItem['itemId'] = re.findall('/(\d+)-', detialUrl)[0] autohomeforumItem['url'] = self.baseUrl + detialUrl autohomeforumItem['contents'] = list() if self.isSavedInMongodb({ 'carId': autohomeforumItem['carId'], 'itemId': autohomeforumItem['itemId'] }) > 0: logging.warning('{carId}的{itemId}已经保存'.format( carId=autohomeforumItem['carId'], itemId=autohomeforumItem['itemId'])) continue yield SplashRequest( url=self.baseUrl + detialUrl, callback=self.detial_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta={'autohomeforumItem': autohomeforumItem}) except Exception as e: print(e) maxNumText = selector.xpath( "//span[@class='fr']/text()").extract_first() try: maxNum = re.findall("(\d+)", maxNumText)[0] except Exception as e: print(e) currentPageNum = selector.xpath( "//span[@class='cur']/text()").extract_first() if int(currentPageNum) < response.meta['page']: nextUrl = re.sub('\d+.html', str(int(currentPageNum) + 1) + '.html', response.url) yield SplashRequest(url=nextUrl, callback=self.title_parse, args={ 'wait': 1, 'timeout': 60, 'images': 0 }, meta=response.meta)
def duck_selector(self, response): base_url = "https://duckduckgo.com/" snippets = response \ .xpath("//div[@class='result results_links results_links_deep web-result ']") \ .extract() itemproc = self.crawler.engine.scraper.itemproc id_person = response.meta['id_person'] base_attr = response.meta['attr'] search = response.meta['search'] num_snippet = response.meta['num_snip'] for snippet in snippets: storage_item = UsmItem() num_snippet = num_snippet + 1 title = Selector(text=snippet).xpath("//div/h2/a/node()").extract() cite = Selector(text=snippet).xpath("//div/a/@href").extract() text = Selector(text=snippet).xpath("//div/a[@class='result__snippet']/node()").extract() if title.__len__() > 0: tmp = "" for text in title: for r in ["<b>", "</b>"]: text = text.replace(r, '') tmp = tmp + text title = tmp else: title = "" if cite.__len__() > 0: cite = cite[0] else: cite = "" if text.__len__() > 0: tmp = "" for txt in title: for r in ["<b>", "</b>"]: txt = txt.replace(r, '') tmp = tmp + txt text = tmp else: text = "" if cite != "": self.log("---------------------------------") self.log("------------TITLE----------------") self.log(title) self.log("------------CITE-----------------") self.log(cite) self.log("------------TEXT-----------------") self.log(text) self.log("-----------ID PERSON-----------------") self.log(id_person) self.log("-----------SEARCH----------------") self.log(search) self.log("--------------ATTR---------------") self.log(base_attr) self.log("-----------ENGINE SEARCH---------") self.log(self.browser) self.log("------------NUMBER SNIPPET-------") self.log(num_snippet) storage_item['title'] = title storage_item['cite'] = cite storage_item['text'] = text storage_item['id_person'] = id_person storage_item['search'] = search storage_item['attr'] = base_attr storage_item['engine_search'] = self.browser storage_item['number_snippet'] = num_snippet itemproc.process_item(storage_item, self)
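# In duck_selector the cleaning loops reuse the `text` name: "for text in title"
# overwrites the extracted snippet body, and the body loop then iterates over `title`
# (by then a plain string), so the stored text ends up wrong. A hedged sketch of the
# intended cleanup (strip_bold is a hypothetical name):
def strip_bold(nodes):
    # Join the extracted nodes and drop the <b> markers DuckDuckGo uses for highlighting.
    joined = ''.join(nodes)
    return joined.replace('<b>', '').replace('</b>', '')


# title = strip_bold(title_nodes); text = strip_bold(text_nodes); cite stays as-is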