# Shared imports for the spider callbacks below.
import scrapy
from scrapy.selector import Selector


def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="wrap mb "]/div[@class="list_z2 g3"]/div[@class="list_z g4 pslpx"]/ul[@class="ipic"]//li'
    ).extract()
    for content in contents:
        url = 'https://www.jiadiandm.com' + Selector(
            text=content).xpath('//li[1]/a/@href').extract()[0]
        img = Selector(text=content).xpath('//li/a/img/@src').extract()[0]
        # Prefix the site root only for relative image paths.
        if img and not img.startswith('http'):
            img = 'https://www.jiadiandm.com' + img
        name = Selector(text=content).xpath('//li/a/img/@alt').extract()[0]
        # The status badge is optional, so guard against an empty match.
        status = None
        if Selector(text=content).xpath('//li/a/span/text()').extract():
            status = Selector(text=content).xpath('//li/a/span/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'status': status},
                             dont_filter=True)
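# A minimal alternative sketch, not the original spider's code: Scrapy selectors
# can be iterated directly, which avoids re-parsing every <li> with
# Selector(text=...), and response.urljoin() resolves relative links against the
# page URL instead of a hard-coded host. parse_basic_info_alt is a hypothetical
# name; .get()/.getall() are the modern spellings of extract_first()/extract().
def parse_basic_info_alt(self, response):
    for li in response.xpath('//ul[@class="ipic"]//li'):
        href = li.xpath('./a/@href').get()
        if not href:
            continue
        img = li.xpath('./a/img/@src').get()
        if img and not img.startswith('http'):
            img = response.urljoin(img)
        yield scrapy.Request(url=response.urljoin(href),
                             callback=self.parse_detail_info,
                             meta={'img': img,
                                   'name': li.xpath('./a/img/@alt').get(),
                                   'status': li.xpath('./a/span/text()').get()},
                             dont_filter=True)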
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="list_body"]//div[@class="list_body_contain"]//div[@class="list_body_con"]'
    ).extract()
    for content in contents:
        # The cover image is lazy-loaded; the real URL lives in @data-original.
        img = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]//img/@data-original'
        ).extract()[0]
        name = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]//img/@alt'
        ).extract()[0]
        zh_name = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//div[@class="list_body_con_con"]/a/text()'
        ).extract()[0]
        url_tmp = Selector(text=content).xpath(
            '//div[@class="list_body_con"]//a[@class="list_body_con_img"]/@href'
        ).extract()[0]
        # Only site-relative hrefs point at detail pages; skip anything else.
        if not url_tmp.startswith('/'):
            continue
        url = 'http://down.ali213.net' + url_tmp
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'zh_name': zh_name},
                             dont_filter=True)
def parse_basic_info(self, response):
    contents = response.xpath('//div[@class="dmList clearfix"]/ul//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/img/@src').extract()[0]
        # Prefix the site root only for non-empty, relative image paths.
        if img and not img.startswith('http'):
            img = 'http://www.66mh.cc' + img
        name = Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/img/@alt').extract()[0]
        url = 'http://www.66mh.cc' + Selector(text=content).xpath(
            '//li/p[@class="fl cover"]/a[@class="pic"]/@href').extract()[0]
        status_part1 = ''
        if Selector(text=content).xpath(
                '//li/p[@class="fl cover"]/span/a/text()').extract():
            status_part1 = Selector(text=content).xpath(
                '//li/p[@class="fl cover"]/span/a/text()').extract()[0]
        # The update time is either a bare <span> or wrapped in <span><font>.
        if Selector(text=content).xpath('//li/dl/dd//p[1]/span/text()').extract():
            update_time = Selector(text=content).xpath(
                '//li/dl/dd//p[1]/span/text()').extract()[0]
        else:
            update_time = Selector(text=content).xpath(
                '//li/dl/dd//p[1]/span/font/text()').extract()[0]
        status_part2 = ''
        if Selector(text=content).xpath('//li/dl/dd//p[2]/span/text()').extract():
            status_part2 = Selector(text=content).xpath(
                '//li/dl/dd//p[2]/span/text()').extract()[0]
        status = status_part2 + status_part1
        type = None
        if Selector(text=content).xpath('//li/dl/dd//p[3]/a/text()').extract():
            type = Selector(text=content).xpath(
                '//li/dl/dd//p[3]/a/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             meta={'img': img, 'name': name,
                                   'update_time': update_time,
                                   'status': status, 'type': type},
                             callback=self.parse_detail_info,
                             dont_filter=True)
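# Hedged aside: the span-vs-font fallback for update_time above can usually be
# collapsed with an XPath union, assuming the date only ever appears in one of
# those two places. Inside the same loop:
#
#     update_time = Selector(text=content).xpath(
#         '//li/dl/dd//p[1]/span/text() | //li/dl/dd//p[1]/span/font/text()'
#     ).extract_first()
#
# extract_first() returns None instead of raising IndexError when nothing matches.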
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@class="list_con"]//ul[@class="game_list"]//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://dl.3dmgame.com' + img
        # Both names come from the same @alt attribute on this listing page.
        name = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@alt').extract()[0]
        zh_name = Selector(text=content).xpath(
            '//li//div[@class="img"]//a//img/@alt').extract()[0]
        url = Selector(text=content).xpath(
            '//li//div[@class="img"]//a/@href').extract()[0]
        introduction = Selector(text=content).xpath(
            '//li//div[@class="text"]/dl').extract()[0]
        type = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[1]/a/text()').extract()[0]
        update_time = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[2]/text()').extract()[0]
        size = Selector(text=content).xpath(
            '//li//div[@class="more_info"]/dl//dd[3]/text()').extract()[0]
        # The language cell is missing for some entries, so guard the lookup.
        language = ''
        if Selector(text=content).xpath(
                '//li//div[@class="more_info"]/dl//dd[4]/text()').extract():
            language = Selector(text=content).xpath(
                '//li//div[@class="more_info"]/dl//dd[4]/text()').extract()[0]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name, 'zh_name': zh_name,
                                   'introduction': introduction, 'type': type,
                                   'update_time': update_time, 'size': size,
                                   'language': language},
                             dont_filter=True)
def parse_basic_info(self, response):
    contents = response.xpath('//ul[@class="down_con downData"]//li').extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://img4.gamersky.com/Files/GamerSky/' + img
        name = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/@title').extract()[0]
        url = Selector(text=content).xpath(
            '//li//div[@class="img"]/a/@href').extract()[0]
        # Each "txt" div holds a "label:value" pair; keep only the value.
        update_time = Selector(text=content).xpath(
            '//li//div[@class="txt"][1]/text()').extract()[0]
        if update_time:
            update_time = update_time.split(':')[1]
        type = Selector(text=content).xpath(
            '//li//div[@class="txt"][2]/text()').extract()[0]
        if type:
            type = type.split(':')[1]
        language = Selector(text=content).xpath(
            '//li//div[@class="txt"][3]/text()').extract()[0]
        if language:
            language = language.split(':')[1]
        size = Selector(text=content).xpath(
            '//li//div[@class="txt"][4]/text()').extract()[0]
        if size:
            size = size.split(':')[1]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={'img': img, 'name': name,
                                   'update_time': update_time, 'type': type,
                                   'language': language, 'size': size},
                             dont_filter=True)
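# Hedged aside: split(':')[1] above raises IndexError if a cell lacks an ASCII
# colon, e.g. if the site uses the full-width Chinese colon '：' instead. A
# tolerant sketch; split_label_value is a hypothetical helper, not site code.
import re

def split_label_value(text):
    # Split on the first ASCII or full-width colon; return '' when absent.
    parts = re.split('[:：]', text, maxsplit=1)
    return parts[1].strip() if len(parts) > 1 else ''

# Usage, matching the pattern above:
#     update_time = split_label_value(update_time)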
def parse_basic_info(self, response):
    contents = response.xpath(
        '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]'
    ).extract()
    for content in contents:
        img = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/img/@src'
        ).extract()[0]
        if not img.startswith('http'):
            img = 'http://www.wenku8.com' + img
        url = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/@href'
        ).extract()[0]
        name = Selector(text=content).xpath(
            '//div[@style="width:373px;float:left;margin:5px 0px 5px 5px;"]/div[@style="width:95px;float:left;"]/a/@title'
        ).extract()[0]
        yield scrapy.Request(url=url,
                             meta={'img': img, 'name': name},
                             callback=self.parse_detail_info,
                             dont_filter=True)
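# Hedged aside: several callbacks above repeat the "check extract(), then index
# [0]" dance. A small hypothetical helper (first_or_default is not part of the
# original project) gives the same behavior in one call; Scrapy's own
# extract_first() does this too when a default value is acceptable.
def first_or_default(selector, xpath, default=None):
    """Return the first XPath match from a Selector, or `default` if none."""
    values = selector.xpath(xpath).extract()
    return values[0] if values else default

# Usage, matching the optional-status lookup pattern above:
#     status = first_or_default(Selector(text=content), '//li/a/span/text()')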
def company_info(self, response):
    # Pre-fill every field so the record always carries the same keys;
    # '/' on the page means "not provided" and is left as ''.
    company_data = {
        'unit_type': response.meta['unit_type'],
        'city': '',
        'start_date': '',
        'number': '',
        'authority': '',
        'type_of_registration': '',
        'business_area': '',
        'security_number': '',
        'capital': '',
        'unit_property': '',
        'social_registration': '',
        'registered_address': '',
        'registered__postal_code': '',
        'business_address': '',
        'business_postal_number': '',
        'legal_person': '',
        'website': '',
    }
    company_name = Selector(response=response).xpath(
        '//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    company_data['company_name'] = company_name
    unit_property = Selector(response=response).xpath(
        '//td[@style="width: 350px;padding-top: 9px;"]/text()').extract_first()
    if unit_property and unit_property.split():
        company_data['unit_property'] = unit_property.split()[0]
    capital = Selector(response=response).xpath(
        '//td[@colspan="3"]')[2].xpath('text()').extract_first()
    if capital is not None and capital != '/':
        company_data['capital'] = capital + '万元'  # registered capital, in units of 10,000 CNY
    city = Selector(response=response).xpath(
        '//td[@colspan="3"]')[1].xpath('text()').extract_first()
    if city and city.split():
        company_data['city'] = city.split()[0]
    start_company_data = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[3].xpath('text()').extract_first()
    if start_company_data and start_company_data.split():
        company_data['start_date'] = start_company_data.split()[0]
    number = Selector(response=response).xpath(
        '//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number and number.split():
        number = number.split()[0]
        # Keep only well-formed 18-character unified social credit codes.
        if len(number) == 18:
            company_data['number'] = number
    authority = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[5].xpath('text()').extract_first()
    if authority and authority.split():
        authority = authority.split()[0]
        if authority != '/':
            company_data['authority'] = authority
    type_of_registration = Selector(response=response).xpath(
        '//td[@colspan="5"]')[0].xpath('text()').extract_first()
    if type_of_registration and type_of_registration.split():
        company_data['type_of_registration'] = type_of_registration.split()[0]
    business_area = Selector(response=response).xpath(
        '//td[@colspan="5"]')[1].xpath('text()').extract_first()
    if business_area and business_area.split():
        business_area = business_area.split()[0]
        if business_area != '/':
            company_data['business_area'] = business_area
    security_number = Selector(response=response).xpath(
        '//td[@colspan="3"]')[4].xpath('text()').extract_first()
    if security_number and security_number.split():
        security_number = security_number.split()[0]
        if security_number != '/':
            company_data['security_number'] = security_number
    social_registration = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[9].xpath('text()').extract_first()
    if social_registration and social_registration.split():
        social_registration = social_registration.split()[0]
        if social_registration != '/':
            company_data['social_registration'] = social_registration
    registered_address = Selector(response=response).xpath(
        '//td[@colspan="3"]')[5].xpath('text()').extract_first()
    if registered_address and registered_address.split():
        registered_address = registered_address.split()[0]
        if registered_address != '/':
            company_data['registered_address'] = registered_address
    registered__postal_code = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[11].xpath('text()').extract_first()
    if registered__postal_code and registered__postal_code.split():
        registered__postal_code = registered__postal_code.split()[0]
        if registered__postal_code != '/':
            company_data['registered__postal_code'] = registered__postal_code
    # NOTE: this reads the same cell ([5]) as registered_address; verify the
    # index if the two fields are meant to differ.
    business_address = Selector(response=response).xpath(
        '//td[@colspan="3"]')[5].xpath('text()').extract_first()
    if business_address and business_address.split():
        business_address = business_address.split()[0]
        if business_address != '/':
            company_data['business_address'] = business_address
    business_postal_number = Selector(response=response).xpath(
        '//td[@style="width: 230px;padding-top: 9px;"]')[13].xpath('text()').extract_first()
    if business_postal_number and business_postal_number.split():
        business_postal_number = business_postal_number.split()[0]
        if business_postal_number != '/':
            company_data['business_postal_number'] = business_postal_number
    legal_person = Selector(response=response).xpath(
        '//td[@colspan="2"]/text()').extract_first()
    if legal_person and legal_person.split():
        legal_person = legal_person.split()[0]
        if legal_person != '/':
            company_data['legal_person'] = legal_person
    # The website row only exists when the table has a third colspan="5" cell.
    if len(Selector(response=response).xpath('//td[@colspan="5"]')) == 3:
        website = Selector(response=response).xpath(
            '//td[@colspan="5"]')[2].xpath('text()').extract_first()
        if website and website.split():
            website = website.split()[0]
            if website.startswith('www') or website.startswith('http'):
                company_data['website'] = website
    print('公司信息', company_data)  # "company info"
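# Hedged aside: company_info repeats "take the first whitespace-delimited token,
# treat '/' as missing" for almost every cell. A hypothetical helper (clean_cell
# is not in the original code) would centralize that rule:
def clean_cell(raw, placeholder='/'):
    """Return the first token of `raw`, or '' for None/blank/placeholder cells."""
    if raw is None:
        return ''
    parts = raw.split()
    if not parts or parts[0] == placeholder:
        return ''
    return parts[0]

# Usage, matching the per-field pattern above:
#     company_data['authority'] = clean_cell(authority)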