Example #1
    def _get_registro_nacional_electoral(self):
        print("REGISTRO NACIONAL ELECTORAL")
        print(self.nacionalidad)
        print(self.cedula)

        html = ConsultarDatos(self.nacionalidad,
                              self.cedula).registro_nacional_electoral()
        data = Selector(text=html).xpath(
            self.registro_electoral_xpath).extract()
        if data[3].startswith("Registro"):
            return self.CI_NO_REGISTRADA
        elif data[3] == " FALLECIDO (3)":
            raise CiudadanoException(
                message=f"Error! la cedula {self.nacionalidad}-{self.cedula} pertenece a un ciudadano fallecido...",
                code=self.CI_FALLECIDO)
        pn = ParseNombre(html)
        p = Parse()
        return Ciudadano(
            id=int(self.cedula),
            nacionalidad="Venezolano" if self.nacionalidad == "V" else "Extranjero",
            cedula=int(self.cedula),
            nombre_completo=pn.nombre_completo,
            nombres=pn.nombre_de_pila,
            apellidos=pn.apellidos,
            estado=p.parse_edo(data[data.index('Estado:') + 1]).title(),
            municipio=p.parse_mp(data[data.index('Municipio:') + 1]).title(),
            parroquia=p.parse_pq(data[data.index('Parroquia:') + 1]).title(),
            centro=p.parse_txt(data[data.index('Centro:') + 1]).title(),
            direccion=p.parse_txt(data[data.index('Dirección:') + 1]).capitalize())
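
The repeated data.index('Label:') + 1 lookups raise ValueError as soon as one label is missing from the page. A minimal sketch of a safer helper, assuming the same flat label/value list that extract() produces; the name field_after and the default value are illustrative, not part of the original code:

    from typing import List, Optional

    def field_after(data: List[str], label: str,
                    default: Optional[str] = None) -> Optional[str]:
        """Return the value that follows `label` in a flat label/value list."""
        try:
            return data[data.index(label) + 1]
        except (ValueError, IndexError):  # label missing, or nothing after it
            return default

    # usage with the kind of list Selector(...).extract() returns
    data = ['Estado:', 'EDO. MIRANDA', 'Municipio:', 'MP. SUCRE']
    print(field_after(data, 'Estado:'))       # 'EDO. MIRANDA'
    print(field_after(data, 'Centro:', '-'))  # '-' instead of a ValueError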
Example #2
    def parse(self, response):
        contents = Selector(response).xpath('//div[@class="listing_title "]')
        img_contents = Selector(response).xpath(
            '//div[@class="photo_booking non_generic"]/a/img/@id').extract()
        lazy_img = Selector(response).xpath(
            '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/script[22]/text()').extract()

        for index, content in enumerate(contents):
            item = AttractionItem()
            item['name'] = content.xpath('a/text()').extract()[0]
            item['url'] = content.xpath('a/@href').extract()[0]
            item['url'] = "https://www.tripadvisor.com.vn/" + item['url']
            item['img_url'] = ""
            item['city_id'] = 0
            try:
                item['img_id'] = img_contents[index]
            except IndexError:  # fewer images than listings on the page
                pass
            yield scrapy.Request(item['url'],
                                 self.page_parse,
                                 meta={
                                     'splash': {
                                         'endpoint': 'render.html',
                                         'args': {
                                             'wait': 0.5
                                         },
                                     },
                                     'item': item,
                                 })
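
If the scrapy-splash package is installed, the hand-built meta={'splash': ...} dict can be replaced with SplashRequest, which assembles the same payload internally; a minimal sketch under that assumption (the spider name and start URL are placeholders):

    import scrapy
    from scrapy_splash import SplashRequest  # requires the scrapy-splash package

    class AttractionSpider(scrapy.Spider):
        name = "attractions"
        start_urls = ["https://www.tripadvisor.com.vn/Attractions"]

        def parse(self, response):
            for href in response.xpath('//div[@class="listing_title "]/a/@href').getall():
                # builds the same meta={'splash': {'endpoint': ..., 'args': ...}}
                yield SplashRequest(response.urljoin(href),
                                    self.page_parse,
                                    endpoint='render.html',
                                    args={'wait': 0.5})

        def page_parse(self, response):
            self.logger.info("rendered %s", response.url)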
Example #3
	def parse(self, response):
		contents = Selector(response).xpath('//div[@class="listing_title"]')
		img_contents = Selector(response).xpath('//div[@class="aspect  is-shown-at-tablet"]/div[@class="inner"]').extract()

		for index, content in enumerate(contents):
			item = TripItem()
			item['title'] = content.xpath('a/text()').extract()[0]
			item['url'] = content.xpath('a/@href').extract()[0]
			item['url'] = "https://www.tripadvisor.com.vn/" + item['url']
			item['city_id'] = 1
			try:
				# image URL usually appears as url(...) in an inline style
				item['img_url'] = re.findall(r"\((.*?)\)", img_contents[index])[0]
			except IndexError:
				try:
					# fall back to the second quoted string in the markup
					item['img_url'] = re.findall(r"\"(.*?)\"", img_contents[index])[1]
				except IndexError:
					pass
			yield scrapy.Request(item['url'], self.page_parse, meta={
				'splash': {
				'endpoint': 'render.html',
				'args': {'wait': 0.5},
				},
				'item': item,
				})
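
The two regex fallbacks above can be compiled once at module level. A self-contained sketch of the same idea; the sample markup is invented, and unlike the original it simply takes the first match from whichever pattern hits:

    import re

    PAREN_RE = re.compile(r"\((.*?)\)")   # url(...) in an inline style
    QUOTE_RE = re.compile(r"\"(.*?)\"")   # any quoted attribute value

    def first_img_url(fragment: str) -> str:
        """Pull an image URL out of an HTML fragment, trying url(...) first."""
        hits = PAREN_RE.findall(fragment) or QUOTE_RE.findall(fragment)
        return hits[0] if hits else ""

    sample = '<div style="background-image:url(https://example.com/a.jpg)"></div>'
    print(first_img_url(sample))  # https://example.com/a.jpg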
Example #4
 def parse(self, response, data):
     item = DispensaryItem()
     # item['State'] = str(Selector(response).xpath(
     #     '//*[@class="map-title__TitleWrapper-sc-8szojn-0 juiolJ"]/text()').get()).replace(" in ", "")
     item['StoreUrl'] = response.request.url
     item['ListingUrl'] = response.request.url
     item['DispensaryName'] = Selector(response).xpath(
         '//*[@class="styled-components__Name-soafp9-0 cWmvtr"]/text()'
     ).get()
     item['Type'] = Selector(response).xpath(
         '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()'
     ).get()
     item['Address'] = Selector(response).xpath(
         '//*[@class="styled-components__AddressRow-sc-1k0lbjf-2 dwPNra"]/text()'
     ).get()
     list_working_time = Selector(response).xpath(
         '//*[@class="src__Box-sc-1sbtrzs-0 open-hours__Range-xpgk3n-7 fCOJPV"]/text()'
     )
     # opening hours appear in weekday order, one range per day
     days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
             'Saturday', 'Sunday']
     for day, working_time in zip(days, list_working_time):
         item[day] = working_time.get()
     item['Phone'] = Selector(response).xpath(
         '//a[contains(@href, "tel:")]/text()').get()
     item['Email'] = Selector(response).xpath(
         '//a[contains(@href, "mailto:")]/text()').get()
     # item['Website'] = Selector(response).xpath('//a[contains(@href, "mailto:")]/text()').get()
     item['Instagram'] = Selector(response).xpath(
         '//a[contains(@href, "https://www.instagram.com")]/text()').get()
     item['Twitter'] = Selector(response).xpath(
         '//a[contains(@href, "https://twitter.com")]/text()').get()
     item['Facebook'] = Selector(response).xpath(
         '//a[contains(@href, "https://www.facebook.com")]/text()').get()
     item['StoreUrl'] = data[5]
     item['State'] = data[4]
     item['ListingUrl'] = data[2]
     item['Listings'] = data[3]
     item['ChildListingUrl'] = data[0]
     item['ChildListings'] = data[1]
     # item['Address'] = Selector(response).xpath(
     #     '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()  # styled-components__AddressRow-sc-1k0lbjf-2 dwPNra
     yield item
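
The positional data[0]..data[5] lookups at the end are easy to mix up. A minimal sketch of carrying the same values by name with a NamedTuple; the field order is inferred from the assignments above and should be treated as an assumption:

    from typing import NamedTuple

    class ListingData(NamedTuple):
        # order mirrors data[0]..data[5] as used above (assumed)
        child_listing_url: str
        child_listings: str
        listing_url: str
        listings: str
        state: str
        store_url: str

    data = ListingData('child-url', '3', 'listing-url', '12', 'CA', 'store-url')
    print(data.state)      # 'CA' instead of the opaque data[4]
    print(data.store_url)  # 'store-url' instead of data[5]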
Example #5
    def registro_electoral(cls, nac, ci):
        html = cls.request_re(nac, ci)
        if html is None:
            return config.failed
        html = html.read().decode('utf-8')
        html = html.replace('\t', '').replace('\n', '').replace('\r', '')
        data = Selector(text=html).xpath(cls.re_xpath).extract()
        if 'Nombre:' not in data:
            return config.failed

        return {
            'ci': cls.get_ci(nac, ci),
            'nacionalidad': nac,
            'cedula': ci,
            'nombre': cls.parse_txt(data[data.index('Nombre:') + 1]).title(),
            'estado': cls.parse_edo(data[data.index('Estado:') + 1]),
            'municipio': cls.parse_mp(data[data.index('Municipio:') + 1]),
            'parroquia': cls.parse_pq(data[data.index('Parroquia:') + 1]),
            'centro': cls.parse_txt(data[data.index('Centro:') + 1]).capitalize(),
            'direccion': cls.parse_txt(data[data.index('Dirección:') + 1]).capitalize()
        }
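
The same label/value walk can be driven by a table so each 'Label:' is declared once. A minimal sketch where the str.title/str.strip/str.capitalize cleaners are stand-ins for the real cls.parse_* methods:

    FIELDS = {
        'nombre': ('Nombre:', str.title),
        'estado': ('Estado:', str.strip),
        'municipio': ('Municipio:', str.strip),
        'parroquia': ('Parroquia:', str.strip),
        'centro': ('Centro:', str.capitalize),
        'direccion': ('Dirección:', str.capitalize),
    }

    def parse_registry(data):
        """Build the result dict from whichever labels are present."""
        out = {}
        for key, (label, clean) in FIELDS.items():
            if label in data:
                out[key] = clean(data[data.index(label) + 1])
        return out

    print(parse_registry(['Nombre:', 'JUAN PEREZ', 'Estado:', 'EDO. MIRANDA']))
    # {'nombre': 'Juan Perez', 'estado': 'EDO. MIRANDA'}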
Example #6
    def parse2(self, response):
        # get the item (defined to store the data) passed in from the listing page
        item = response.meta['item']
        # fetch the data from the detail-page url
        movies = Selector(
            response=response).xpath('//div[@class="movie-brief-container"]')

        # store the scraped fields into the item one by one
        for movie in movies:
            filmtitle = movie.xpath('./h1/text()')

            # the number of film types varies, so collect them in a loop
            filmtypes = Selector(
                response=response).xpath('//li[@class="ellipsis"]')
            print("------------- filmtypes -------------------------")
            print(filmtypes)
            filmtypestr = ''
            for filmtype in filmtypes:
                print("------------- filmtype -------------------------")
                print(filmtype)
                tempstr = filmtype.xpath(
                    '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()'
                ).extract_first().strip()
                print("------------- tempstr -------------------------")
                print(tempstr)
                # str.join returns a new string (the original discarded it),
                # so accumulate explicitly
                filmtypestr += tempstr + ' '
            print("------------- filmtypestr -------------------------")
            print(filmtypestr)

            # filmtype keeps only the first genre
            filmtype = movie.xpath(
                '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()')
            filmdate = movie.xpath(
                '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
            # debug: print each item field from parse2
            print(
                "------------------- item fields in parse2 -------------------"
            )
            print(filmtitle.extract_first().strip())
            print(filmtype.extract_first().strip())
            print(filmdate.extract_first().strip())
            # assign the values
            item['filmtitle'] = filmtitle.extract_first().strip()
            item['filmtype'] = filmtype.extract_first().strip()
            item['filmdate'] = filmdate.extract_first().strip()
        # return the item
        yield item
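
Collecting a variable number of genres usually needs no inner loop at all: one relative XPath plus a single join does it. A self-contained sketch with invented sample markup (only the page's "ellipsis" class name is carried over):

    from scrapy import Selector

    html = '<ul><li class="ellipsis"><a>Drama</a><a>Romance</a></li></ul>'
    genres = Selector(text=html).xpath('//li[@class="ellipsis"]/a/text()').getall()
    filmtypestr = ' '.join(g.strip() for g in genres)
    print(filmtypestr)  # Drama Romance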
Example #7
    def parse_item(self, response):
        nameList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
        linkList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/@href').extract()

        raw_costList = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
        ).extract()
        costList = []
        for htmlpart in raw_costList:  # extract the first cost in each block
            raw_cost = Selector(text=htmlpart).xpath(
                '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
            ).extract_first()
            try:
                result = raw_cost[raw_cost.index('$') + 1:]
            except (ValueError, AttributeError):  # no '$', or no price at all
                result = '-'
            costList.append(result)

        countryList_raw = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
        ).extract()
        countryList = []
        for cnt in countryList_raw:  # keep only non-blank country entries
            if not cnt.isspace():
                # keep the text from "From ..." onwards
                countryList.append(cnt[cnt.index('F'):])

        imgUrlList = response.xpath(
            '//*[@id="ListViewInner"]/li/div/div/a/img/@src').extract()

        # zip stops at the shortest list, so ragged rows are skipped without
        # wrapping the indexing in a bare except
        for name, link, cost, country, imgUrl in zip(nameList, linkList,
                                                     costList, countryList,
                                                     imgUrlList):
            yield {
                "name": name,
                "link": link,
                "cost": cost,
                "country": country,
                "imgUrl": imgUrl
            }
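
When the parallel lists should not silently truncate at the shortest one, itertools.zip_longest keeps every row and fills the gaps. A minimal sketch with invented sample data, reusing the '-' placeholder the cost parser falls back to:

    from itertools import zip_longest

    names = ['Camera', 'Lens', 'Tripod']
    costs = ['120.00', '89.99']  # one price is missing

    for name, cost in zip_longest(names, costs, fillvalue='-'):
        print(name, cost)
    # Camera 120.00
    # Lens 89.99
    # Tripod -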
Example #8
    def page_parse(self, response):
        item = response.meta['item']
        item['location'] = ""
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="street-address"]/text()').extract()[0]
            item['location'] = item['location'] + locationFetch
        except:
            pass
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="locality"]/text()').extract()[0]
            item['location'] = item['location'] + " " + locationFetch
        except:
            pass
        try:
            locationFetch = Selector(response).xpath(
                '//span[@class="country-name"]/text()').extract()[0]
            item['location'] = item['location'] + " " + locationFetch
        except:
            pass
        item['location'] = re.sub(' +', ' ', item['location']).strip()

        features = Selector(response).xpath(
            '//div[@class="rating_and_popularity"]/span[@class="header_detail attraction_details"]/div[@class="detail"]/a/text()'
        ).extract()
        item['features'] = ", ".join(features)

        yield item
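
The optional address parts can also be fetched with extract_first's default argument, so a missing span never raises. A self-contained sketch with invented sample HTML:

    from scrapy import Selector

    html = ('<span class="street-address">12 Main St</span>'
            '<span class="country-name">Vietnam</span>')  # no "locality" span
    sel = Selector(text=html)

    parts = (sel.xpath(f'//span[@class="{name}"]/text()').extract_first(default='')
             for name in ('street-address', 'locality', 'country-name'))
    location = ' '.join(p for p in parts if p)
    print(location)  # 12 Main St Vietnam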
Example #9
    def parse(self, response):
        nameList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
        linkList = response.xpath(
            '//*[@id="ListViewInner"]/li/h3/a/@href').extract()

        raw_costList = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
        ).extract()
        costList = []
        for htmlpart in raw_costList:  # extract the first cost in each block
            raw_cost = Selector(text=htmlpart).xpath(
                '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
            ).extract_first()
            try:
                result = raw_cost[raw_cost.index('$') + 1:]
            except (ValueError, AttributeError):  # no '$', or no price at all
                result = '-'
            costList.append(result)

        countryList_raw = response.xpath(
            '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
        ).extract()
        countryList = []
        for cnt in countryList_raw:  # keep only non-blank country entries
            if not cnt.isspace():
                # keep the text from "From ..." onwards
                countryList.append(cnt[cnt.index('F'):])

        # Not all imgUrls are pulled correctly with the direct xpath below:
        # imgUrlList = response.xpath('//*[@id="ListViewInner"]/li/div/div/a/img/@src').extract()
        imgUrlList = []
        for link in linkList:
            # imgUrl example: "http://thumbs.ebaystatic.com/images/g/B80AAOSwZQRYaBjw/s-l225.jpg"
            # the B80AAOSwZQRYaBjw part is extracted from the item link
            index = link.rindex(':')
            imgUrl = ("http://thumbs.ebaystatic.com/images/g/" +
                      link[index + 1:] + "/s-l225.jpg")
            imgUrlList.append(imgUrl)

        # zip stops at the shortest list, so ragged rows are skipped without
        # wrapping the indexing in a bare except
        for name, link, cost, country, imgUrl in zip(nameList, linkList,
                                                     costList, countryList,
                                                     imgUrlList):
            yield {
                "name": name,
                "link": link,
                "cost": cost,
                "country": country,
                "imgUrl": imgUrl
            }

        crawledLinks = self.start_urls

        nextPageUrl = response.xpath(
            '//*[@class="gspr next"]/@href').extract_first()
        print("NPU >>> ", nextPageUrl)
        if nextPageUrl is not None and len(crawledLinks) < 5:
            crawledLinks.append(nextPageUrl)
            yield Request(response.urljoin(nextPageUrl), callback=self.parse)
        else:
            print("nextPageUrl not defined")

        print(crawledLinks)
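
Appending next-page links to start_urls works as a crude page budget, but a counter on the spider keeps the intent clearer. A minimal pagination sketch under that assumption; the spider name, start URL, and item extraction are placeholders:

    import scrapy

    class EbaySpider(scrapy.Spider):
        name = "ebay"
        start_urls = ["https://www.ebay.com/sch/i.html?_nkw=camera"]
        pages_left = 5  # page budget instead of len(start_urls) < 5

        def parse(self, response):
            # ... yield the items scraped from this page here ...
            self.pages_left -= 1
            next_href = response.xpath('//*[@class="gspr next"]/@href').extract_first()
            if next_href and self.pages_left > 0:
                # response.follow resolves relative hrefs against response.url
                yield response.follow(next_href, callback=self.parse)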