Python Selector.index примеры использования

Язык программирования: Python

Пространство имен/Пакет: scrapy.selector

Класс/Тип: Selector

Метод/Функция: index

Примеров на hotexamples.com: 10

Python Selector.index - 10 примеров найдено. Это лучшие примеры Python кода для scrapy.selector.Selector.index, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Selector(30)

strip(30)

split(30)

css(30)

select(30)

replace(30)

extract(30)

re(30)

extract_first(19)

remove_namespaces(16)

index(9)

rstrip(9)

lstrip(9)

encode(8)

register_namespace(6)

find(5)

remove(4)

append(4)

startswith(3)

rindex(3)

extend(3)

get(3)

re_first(2)

getall(2)

lower(2)

pop(1)

partition(1)

extract_unquoted(1)

__getattribute__(1)

rfind(1)

items(1)

decode(1)

find_all(1)

group(1)

__len__(1)

title(1)

to_csv(1)

url(1)

Пример #1

Показать файл

Файл: buscador.py Проект: davidcastilloc/ApiVE2

    def _get_registro_nacional_electoral(self):
        print("REGISTRO NACIONAL ELECTORAL")
        print(self.nacionalidad)
        print(self.cedula)

        html = ConsultarDatos(self.nacionalidad,
                              self.cedula).registro_nacional_electoral()
        data = Selector(text=html).xpath(
            self.registro_electoral_xpath).extract()
        if data[3].find("Registro") == 0:
            return self.CI_NO_REGISTRADA
        elif data[3] == " FALLECIDO (3)":
            raise CiudadanoException(
                message=
                f"Error! la cedula {self.nacionalidad}-{self.cedula} pertenece a un ciudadano fallecido...",
                code=self.CI_FALLECIDO)
        pn = ParseNombre(html)
        p = Parse()
        return Ciudadano(
            id=int(self.cedula),
            nacionalidad="Venezolano"
            if self.nacionalidad == "V" else "Extranjero",
            cedula=int(self.cedula),
            nombre_completo=pn.nombre_completo,
            nombres=pn.nombre_de_pila,
            apellidos=pn.apellidos,
            estado=p.parse_edo(data[data.index('Estado:') + 1]).title(),
            municipio=p.parse_mp(data[data.index('Municipio:') + 1]).title(),
            parroquia=p.parse_pq(data[data.index('Parroquia:') + 1]).title(),
            centro=p.parse_txt(data[data.index('Centro:') + 1]).title(),
            direccion=p.parse_txt(data[data.index('Dirección:') +
                                       1]).capitalize())

Пример #2

Показать файл

    def parse(self, response):
        contents = Selector(response).xpath('//div[@class="listing_title "]')
        img_contents = Selector(response).xpath(
            '//div[@class="photo_booking non_generic"]/a/img/@id').extract()
        lazy_img = Selector(response).xpath(
            '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/script[22]/text()').extract()

        for content in contents:
            item = AttractionItem()
            index = contents.index(content)
            item['name'] = content.xpath('a/text()').extract()[0]
            item['url'] = content.xpath('a/@href').extract()[0]
            item['url'] = "https://www.tripadvisor.com.vn/" + item['url']
            item['img_url'] = ""
            item['city_id'] = 0
            try:
                item['img_id'] = img_contents[index]
            except:
                pass
            yield scrapy.Request(item['url'],
                                 self.page_parse,
                                 meta={
                                     'splash': {
                                         'endpoint': 'render.html',
                                         'args': {
                                             'wait': 0.5
                                         },
                                     },
                                     'item': item,
                                 })

Пример #3

Показать файл

	def parse(self, response):
		contents = Selector(response).xpath('//div[@class="listing_title"]')
		img_contents = Selector(response).xpath('//div[@class="aspect  is-shown-at-tablet"]/div[@class="inner"]').extract()

		for content in contents:
			item = TripItem()
			index = contents.index(content)
			item['title'] = content.xpath('a/text()').extract()[0]
			item['url'] = content.xpath('a/@href').extract()[0]
			item['url'] = "https://www.tripadvisor.com.vn/" + item['url']
			item['city_id'] = 1
			try:
				regexRule = re.compile("\((.*?)\)")
				item['img_url'] = regexRule.findall(img_contents[index])[0]
			except:
				try:
					regexRule = re.compile("\"(.*?)\"")
					item['img_url'] = regexRule.findall(img_contents[index])[1]
				except:
					pass
			yield scrapy.Request(item['url'], self.page_parse, meta={
				'splash': {
				'endpoint': 'render.html',
				'args': {'wait': 0.5},
				},
				'item': item,
				})

Пример #4

Показать файл

 def parse(self, response, data):
     item = DispensaryItem()
     # item['State'] = str(Selector(response).xpath(
     #     '//*[@class="map-title__TitleWrapper-sc-8szojn-0 juiolJ"]/text()').get()).replace(" in ", "")
     item['StoreUrl'] = response.request.url
     item['ListingUrl'] = response.request.url
     item['DispensaryName'] = Selector(response).xpath(
         '//*[@class="styled-components__Name-soafp9-0 cWmvtr"]/text()'
     ).get()
     item['Type'] = Selector(response).xpath(
         '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()'
     ).get()
     item['Address'] = Selector(response).xpath(
         '//*[@class="styled-components__AddressRow-sc-1k0lbjf-2 dwPNra"]/text()'
     ).get()
     list_working_time = Selector(response).xpath(
         '//*[@class="src__Box-sc-1sbtrzs-0 open-hours__Range-xpgk3n-7 fCOJPV"]/text()'
     )
     for working_time in list_working_time:
         if list_working_time.index(working_time) == 0:
             item['Monday'] = working_time.get()
         elif list_working_time.index(working_time) == 1:
             item['Tuesday'] = working_time.get()
         elif list_working_time.index(working_time) == 2:
             item['Wednesday'] = working_time.get()
         elif list_working_time.index(working_time) == 3:
             item['Thursday'] = working_time.get()
         elif list_working_time.index(working_time) == 4:
             item['Friday'] = working_time.get()
         elif list_working_time.index(working_time) == 5:
             item['Saturday'] = working_time.get()
         elif list_working_time.index(working_time) == 6:
             item['Sunday'] = working_time.get()
     item['Phone'] = Selector(response).xpath(
         '//a[contains(@href, "tel:")]/text()').get()
     item['Email'] = Selector(response).xpath(
         '//a[contains(@href, "mailto:")]/text()').get()
     # item['Website'] = Selector(response).xpath('//a[contains(@href, "mailto:")]/text()').get()
     item['Instagram'] = Selector(response).xpath(
         '//a[contains(@href, "https://www.instagram.com")]/text()').get()
     item['Twitter'] = Selector(response).xpath(
         '//a[contains(@href, "https://twitter.com")]/text()').get()
     item['Facebook'] = Selector(response).xpath(
         '//a[contains(@href, "https://www.facebook.com")]/text()').get()
     item['StoreUrl'] = data[5]
     item['State'] = data[4]
     item['ListingUrl'] = data[2]
     item['Listings'] = data[3]
     item['ChildListingUrl'] = data[0]
     item['ChildListings'] = data[1]
     # item['Address'] = Selector(response).xpath(
     #     '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()  # styled-components__AddressRow-sc-1k0lbjf-2 dwPNra
     # item['Address'] = Selector(response).xpath(
     #     '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()  # styled-components__AddressRow-sc-1k0lbjf-2 dwPNra
     # item['Address'] = Selector(response).xpath(
     #     '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()  # styled-components__AddressRow-sc-1k0lbjf-2 dwPNra
     # item['Address'] = Selector(response).xpath(
     #     '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()  # styled-components__AddressRow-sc-1k0lbjf-2 dwPNra
     yield item

Пример #5

Показать файл

Файл: getdata.py Проект: IsmaelRLG/cnelib

    def registro_electoral(cls, nac, ci):
        html = cls.request_re(nac, ci)
        if html is None:
            return config.failed
        html = html.read().decode('utf-8')
        html = html.replace('\t', '').replace('\n', '').replace('\r', '')
        data = Selector(text=html).xpath(cls.re_xpath).extract()
        if not 'Nombre:' in data:
            return config.failed

        return {
            'ci': cls.get_ci(nac, ci),
            'nacionalidad': nac,
            'cedula': ci,
            'nombre': cls.parse_txt(data[data.index('Nombre:') + 1]).title(),
            'estado': cls.parse_edo(data[data.index('Estado:') + 1]),
            'municipio': cls.parse_mp(data[data.index('Municipio:') + 1]),
            'parroquia': cls.parse_pq(data[data.index('Parroquia:') + 1]),
            'centro': cls.parse_txt(data[data.index('Centro:') + 1]).capitalize(),
            'direccion': cls.parse_txt(data[data.index('Dirección:') + 1]).capitalize()
            }

Пример #6

Показать файл

    def registro_electoral(cls, nac, ci):
        html = cls.request_re(nac, ci)
        if html is None:
            return config.failed
        html = html.read().decode('utf-8')
        html = html.replace('\t', '').replace('\n', '').replace('\r', '')
        data = Selector(text=html).xpath(cls.re_xpath).extract()
        if not 'Nombre:' in data:
            return config.failed

        return {
            'ci':
            cls.get_ci(nac, ci),
            'nacionalidad':
            nac,
            'cedula':
            ci,
            'nombre':
            cls.parse_txt(data[data.index('Nombre:') + 1]).title(),
            'estado':
            cls.parse_edo(data[data.index('Estado:') + 1]),
            'municipio':
            cls.parse_mp(data[data.index('Municipio:') + 1]),
            'parroquia':
            cls.parse_pq(data[data.index('Parroquia:') + 1]),
            'centro':
            cls.parse_txt(data[data.index('Centro:') + 1]).capitalize(),
            'direccion':
            cls.parse_txt(data[data.index('Dirección:') + 1]).capitalize()
        }

Пример #7

Показать файл

Файл: maoyanmovies.py Проект: 919954807/Python001-class01

    def parse2(self, response):
        #详情页取定义存数据的item
        item = response.meta['item']
        #通过url取数据
        movies = Selector(
            response=response).xpath('//div[@class="movie-brief-container"]')

        #将取到的数据依次存入item
        for movie in movies:
            filmtitle = movie.xpath('./h1/text()')

            #filmtype不定，有多个，用循环取,暂未实现
            filmtypes = Selector(
                response=response).xpath('//li[@class="ellipsis"]')
            print("-------------打印filmtypes-------------------------")
            print(filmtypes)
            filmtypestr = ' '
            for filmtype in filmtypes:
                print("-------------打印filmtype-------------------------")
                print(filmtype)
                i = filmtypes.index(filmtype)
                #print("i:" + str(i))
                tempstr = filmtype.xpath(
                    '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()'
                ).extract_first().strip()
                print("-------------打印tempstr-------------------------")
                print(tempstr)
                filmtypestr.join(tempstr)
            print("-------------打印filmtypesstr-------------------------")
            print(filmtypestr)

            #filmtype只取了一个
            filmtype = movie.xpath(
                '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()')
            filmdate = movie.xpath(
                '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
            #打印调试
            print(
                "-------------------打印parse2中item的各项---------------------------"
            )
            print(filmtitle.extract_first().strip())
            print(filmtype.extract_first().strip())
            print(filmdate.extract_first().strip())
            #传入值
            item['filmtitle'] = filmtitle.extract_first().strip()
            item['filmtype'] = filmtype.extract_first().strip()
            item['filmdate'] = filmdate.extract_first().strip()
        #返回数据
        yield item

Пример #8

Показать файл

    def parse_item(self, response):
        nameList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
        linkList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/@href').extract()

        raw_costList = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
        ).extract()
        costList = []
        for htmlpart in raw_costList:  #extract first cost
            raw_cost = Selector(text=htmlpart).xpath(
                '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
            ).extract_first()
            try:
                result = raw_cost[raw_cost.index('$') + 1:]
            except:
                result = '-'
            costList.append(result)

        countryList_raw = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
        ).extract()
        countryList = []
        for cnt in countryList_raw:  # clear country list
            if cnt.isspace() == False:
                cnt = cnt[cnt.index('F'):]
                countryList.append(cnt)
            else:
                pass

        imgUrlList = response.xpath(
            '//*[@id="ListViewInner"]/li/div/div/a/img/@src').extract()

        for i in range(len(nameList)):
            try:
                yield {
                    "name": nameList[i],
                    "link": linkList[i],
                    "cost": costList[i],
                    "country": countryList[i],
                    "imgUrl": imgUrlList[i]
                }
            except:
                pass

Пример #9

Показать файл

    def page_parse(self, response):
        item = response.meta['item']
        item['location'] = ""
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="street-address"]/text()').extract()[0]
            item['location'] = item['location'] + locationFetch
        except:
            pass
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="locality"]/text()').extract()[0]
            item['location'] = item['location'] + " " + locationFetch
        except:
            pass
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="country-name"]/text()').extract()[0]
            item['location'] = item['location'] + " " + locationFetch
        except:
            pass
        item['location'] = re.sub(' +', ' ', item['location']).strip()

        features = Selector(response).xpath(
            '//div[@class="rating_and_popularity"]/span[@class="header_detail attraction_details"]/div[@class="detail"]/a'
        )
        item['features'] = ""
        features_list = []

        for feature in features:
            index = features.index(feature)
            features_list.append(feature.xpath('text()').extract()[0])
            item['features'] = item['features'] + feature.xpath(
                'text()').extract()[0]
            item['features'] = item['features'] + ", "

        item['features'] = item['features'][:-2]

        yield item

Пример #10

Показать файл

Файл: espider.py Проект: Raja1802/ebayCrawler

    def parse(self, response):
        nameList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
        linkList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/@href').extract()

        raw_costList = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
        ).extract()
        costList = []
        for htmlpart in raw_costList:  #extract first cost
            raw_cost = Selector(text=htmlpart).xpath(
                '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
            ).extract_first()
            try:
                result = raw_cost[raw_cost.index('$') + 1:]
            except:
                result = '-'
            costList.append(result)

        countryList_raw = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
        ).extract()
        countryList = []
        for cnt in countryList_raw:  # clear country list
            if cnt.isspace() == False:
                cnt = cnt[cnt.index('F'):]
                countryList.append(cnt)
            else:
                pass

    # Not all imgUrls pull correct  if use >
    #imgUrlList = response.xpath('//*[@id="ListViewInner"]/li/div/div/a/img/@src').extract()
        imgUrlList = []
        for link in linkList:
            #imgUrl example "http://thumbs.ebaystatic.com/images/g/B80AAOSwZQRYaBjw/s-l225.jpg"
            #part B80AAOSwZQRYaBjw extract from item link
            index = link.rindex(':')
            imgUrl = "http://thumbs.ebaystatic.com/images/g/" + link[
                index + 1:] + "/s-l225.jpg"
            imgUrlList.append(imgUrl)

        for i in range(len(nameList)):
            try:
                yield {
                    "name": nameList[i],
                    "link": linkList[i],
                    "cost": costList[i],
                    "country": countryList[i],
                    "imgUrl": imgUrlList[i]
                }
            except:
                pass

        crawledLinks = self.start_urls

        nextPageUrl = response.xpath(
            '//*[@class="gspr next"]/@href').extract()[0]
        print("NPU >>> ", nextPageUrl)
        if nextPageUrl != None and len(crawledLinks) < 5:
            crawledLinks.append(nextPageUrl)
            yield Request(response.urljoin(nextPageUrl), callback=self.parse)
        else:
            "nextPageUrl not defined"

        print(crawledLinks)