def _get_registro_nacional_electoral(self):
    html = ConsultarDatos(self.nacionalidad,
                          self.cedula).registro_nacional_electoral()
    data = Selector(text=html).xpath(self.registro_electoral_xpath).extract()
    # The fourth text node signals lookup failures: an unregistered cedula
    # or a deceased citizen.
    if data[3].startswith("Registro"):
        return self.CI_NO_REGISTRADA
    elif data[3] == " FALLECIDO (3)":
        raise CiudadanoException(
            message=f"Error! la cedula {self.nacionalidad}-{self.cedula} "
                    f"pertenece a un ciudadano fallecido...",
            code=self.CI_FALLECIDO)
    pn = ParseNombre(html)
    p = Parse()
    # Each label in the extracted node list ("Estado:", "Municipio:", ...)
    # is immediately followed by its value.
    return Ciudadano(
        id=int(self.cedula),
        nacionalidad="Venezolano" if self.nacionalidad == "V" else "Extranjero",
        cedula=int(self.cedula),
        nombre_completo=pn.nombre_completo,
        nombres=pn.nombre_de_pila,
        apellidos=pn.apellidos,
        estado=p.parse_edo(data[data.index('Estado:') + 1]).title(),
        municipio=p.parse_mp(data[data.index('Municipio:') + 1]).title(),
        parroquia=p.parse_pq(data[data.index('Parroquia:') + 1]).title(),
        centro=p.parse_txt(data[data.index('Centro:') + 1]).title(),
        direccion=p.parse_txt(data[data.index('Dirección:') + 1]).capitalize())
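# A minimal, self-contained sketch of the label/value lookup used above: the
# registro-electoral xpath yields a flat token list where each label is
# immediately followed by its value. The sample tokens below are made up for
# illustration.
def _demo_field(tokens, label):
    # Return the token right after the label, or None when the label is absent.
    try:
        return tokens[tokens.index(label) + 1].strip()
    except ValueError:
        return None

assert _demo_field(["Estado:", " DTTO. CAPITAL", "Municipio:", " LIBERTADOR"],
                   "Estado:") == "DTTO. CAPITAL"
assert _demo_field(["Estado:", " DTTO. CAPITAL"], "Centro:") is None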
def parse(self, response):
    contents = Selector(response).xpath('//div[@class="listing_title "]')
    img_contents = Selector(response).xpath(
        '//div[@class="photo_booking non_generic"]/a/img/@id').extract()
    # Script block holding the lazy-loaded image data; currently unused.
    lazy_img = Selector(response).xpath(
        '//*[@id="BODY_BLOCK_JQUERY_REFLOW"]/script[22]/text()').extract()
    for index, content in enumerate(contents):
        item = AttractionItem()
        item['name'] = content.xpath('a/text()').extract()[0]
        item['url'] = "https://www.tripadvisor.com.vn/" + \
            content.xpath('a/@href').extract()[0]
        item['img_url'] = ""
        item['city_id'] = 0
        try:
            item['img_id'] = img_contents[index]
        except IndexError:
            # Listings without a photo have no matching image id.
            pass
        yield scrapy.Request(item['url'], self.page_parse, meta={
            'splash': {
                'endpoint': 'render.html',
                'args': {'wait': 0.5},
            },
            'item': item,
        })
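# Why the loop above pairs listings with image ids via enumerate() rather
# than contents.index(content): list.index rescans the list on every call
# and returns the first match, so duplicate entries all map to the same
# position. A toy demonstration:
_items = ["a", "b", "a"]
assert [_items.index(x) for x in _items] == [0, 1, 0]  # duplicates mispair
assert [i for i, _ in enumerate(_items)] == [0, 1, 2]  # positions are correct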
def parse(self, response):
    contents = Selector(response).xpath('//div[@class="listing_title"]')
    img_contents = Selector(response).xpath(
        '//div[@class="aspect is-shown-at-tablet"]/div[@class="inner"]').extract()
    for index, content in enumerate(contents):
        item = TripItem()
        item['title'] = content.xpath('a/text()').extract()[0]
        item['url'] = "https://www.tripadvisor.com.vn/" + \
            content.xpath('a/@href').extract()[0]
        item['city_id'] = 1
        try:
            # The image URL is usually inlined as url(...) in a style attribute.
            item['img_url'] = re.findall(r"\((.*?)\)", img_contents[index])[0]
        except IndexError:
            try:
                # Fallback: the second quoted string in the markup is the URL.
                item['img_url'] = re.findall(r"\"(.*?)\"", img_contents[index])[1]
            except IndexError:
                pass
        yield scrapy.Request(item['url'], self.page_parse, meta={
            'splash': {
                'endpoint': 'render.html',
                'args': {'wait': 0.5},
            },
            'item': item,
        })
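# A standalone sketch of the two regex fallbacks above, run against made-up
# markup of the shape the scraper appears to expect (an inline
# background-image first, then a quoted src attribute):
import re

_style = '<div style="background-image:url(https://example.com/a.jpg)"></div>'
assert re.findall(r"\((.*?)\)", _style)[0] == "https://example.com/a.jpg"
_img = '<img class="photo" src="https://example.com/b.jpg">'
assert re.findall(r"\"(.*?)\"", _img)[1] == "https://example.com/b.jpg"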
def parse(self, response, data):
    item = DispensaryItem()
    sel = Selector(response)
    item['DispensaryName'] = sel.xpath(
        '//*[@class="styled-components__Name-soafp9-0 cWmvtr"]/text()').get()
    item['Type'] = sel.xpath(
        '//*[@class="styled-components__Capitalize-soafp9-10 gihoQE"]/text()').get()
    item['Address'] = sel.xpath(
        '//*[@class="styled-components__AddressRow-sc-1k0lbjf-2 dwPNra"]/text()').get()
    # Opening hours are listed in weekday order, one range per day.
    list_working_time = sel.xpath(
        '//*[@class="src__Box-sc-1sbtrzs-0 open-hours__Range-xpgk3n-7 fCOJPV"]/text()')
    days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
            'Friday', 'Saturday', 'Sunday')
    for day, working_time in zip(days, list_working_time):
        item[day] = working_time.get()
    item['Phone'] = sel.xpath('//a[contains(@href, "tel:")]/text()').get()
    item['Email'] = sel.xpath('//a[contains(@href, "mailto:")]/text()').get()
    item['Instagram'] = sel.xpath(
        '//a[contains(@href, "https://www.instagram.com")]/text()').get()
    item['Twitter'] = sel.xpath(
        '//a[contains(@href, "https://twitter.com")]/text()').get()
    item['Facebook'] = sel.xpath(
        '//a[contains(@href, "https://www.facebook.com")]/text()').get()
    # Listing metadata is passed in by the caller and takes precedence over
    # anything scraped from the page itself.
    item['ChildListingUrl'] = data[0]
    item['ChildListings'] = data[1]
    item['ListingUrl'] = data[2]
    item['Listings'] = data[3]
    item['State'] = data[4]
    item['StoreUrl'] = data[5]
    yield item
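# The weekday mapping above in isolation: zip pairs the extracted ranges
# with day names in order and stops at the shorter sequence, so a page with
# fewer than seven ranges simply leaves the remaining days unset. Toy data:
_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
         'Friday', 'Saturday', 'Sunday')
_hours = ['9am-9pm', '9am-9pm', '9am-10pm']
assert dict(zip(_days, _hours)) == {
    'Monday': '9am-9pm', 'Tuesday': '9am-9pm', 'Wednesday': '9am-10pm'}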
def registro_electoral(cls, nac, ci):
    html = cls.request_re(nac, ci)
    if html is None:
        return config.failed
    html = html.read().decode('utf-8')
    # Flatten layout whitespace so the xpath text nodes come out clean.
    html = html.replace('\t', '').replace('\n', '').replace('\r', '')
    data = Selector(text=html).xpath(cls.re_xpath).extract()
    if 'Nombre:' not in data:
        return config.failed
    # Each label in the node list is immediately followed by its value.
    return {
        'ci': cls.get_ci(nac, ci),
        'nacionalidad': nac,
        'cedula': ci,
        'nombre': cls.parse_txt(data[data.index('Nombre:') + 1]).title(),
        'estado': cls.parse_edo(data[data.index('Estado:') + 1]),
        'municipio': cls.parse_mp(data[data.index('Municipio:') + 1]),
        'parroquia': cls.parse_pq(data[data.index('Parroquia:') + 1]),
        'centro': cls.parse_txt(data[data.index('Centro:') + 1]).capitalize(),
        'direccion': cls.parse_txt(data[data.index('Dirección:') + 1]).capitalize()
    }
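# The whitespace flattening above in isolation: stripping tabs, newlines,
# and carriage returns before building the Selector keeps layout characters
# out of the extracted text nodes.
_raw = "Nombre:\n\tJUAN\r"
assert _raw.replace('\t', '').replace('\n', '').replace('\r', '') == "Nombre:JUAN"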
def parse2(self, response):
    # Pull the item created in the list-page callback out of the request meta.
    item = response.meta['item']
    movies = Selector(response=response).xpath(
        '//div[@class="movie-brief-container"]')
    # Store the extracted fields into the item one by one.
    for movie in movies:
        filmtitle = movie.xpath('./h1/text()')
        # A film can carry several genres, so they are collected in a loop.
        # str.join returns a new string and joins the *elements* of its
        # argument, so genres are gathered into a list and joined once.
        filmtypes = Selector(response=response).xpath('//li[@class="ellipsis"]')
        genre_parts = []
        for filmtype in filmtypes:
            tempstr = filmtype.xpath(
                '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()'
            ).extract_first()
            if tempstr:
                genre_parts.append(tempstr.strip())
        filmtypestr = ' '.join(genre_parts)
        # The absolute xpath above always resolves to the first genre link,
        # so only one genre is actually captured for now.
        filmtype = movie.xpath(
            '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/a[1]/text()')
        filmdate = movie.xpath(
            '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')
        item['filmtitle'] = filmtitle.extract_first().strip()
        item['filmtype'] = filmtype.extract_first().strip()
        item['filmdate'] = filmdate.extract_first().strip()
        yield item
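# The join pitfall above in isolation: calling accumulator.join(s) does not
# append s to the accumulator; it returns a new string with the accumulator
# inserted between the characters of s, leaving the accumulator unchanged.
_acc = ' '
assert _acc.join("Drama") == 'D r a m a'  # result discarded in the buggy form
assert _acc == ' '                         # accumulator never grows
assert ' '.join(["Drama", "Comedy"]) == "Drama Comedy"  # the working pattern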
def parse_item(self, response):
    nameList = response.xpath(
        '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
    linkList = response.xpath(
        '//*[@id="ListViewInner"]/li/h3/a/@href').extract()
    raw_costList = response.xpath(
        '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
    ).extract()
    costList = []
    for htmlpart in raw_costList:
        # Extract the first price and keep only the amount after the '$'.
        raw_cost = Selector(text=htmlpart).xpath(
            '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
        ).extract_first()
        try:
            result = raw_cost[raw_cost.index('$') + 1:]
        except (TypeError, ValueError):
            # extract_first() returned None, or the text carried no '$'.
            result = '-'
        costList.append(result)
    countryList_raw = response.xpath(
        '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
    ).extract()
    countryList = []
    for cnt in countryList_raw:
        # Drop whitespace-only entries and keep the text from "From ..." onward.
        if not cnt.isspace():
            countryList.append(cnt[cnt.index('F'):])
    imgUrlList = response.xpath(
        '//*[@id="ListViewInner"]/li/div/div/a/img/@src').extract()
    for i in range(len(nameList)):
        try:
            yield {
                "name": nameList[i],
                "link": linkList[i],
                "cost": costList[i],
                "country": countryList[i],
                "imgUrl": imgUrlList[i]
            }
        except IndexError:
            # The per-field lists can differ in length; skip ragged rows.
            pass
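# A guard-free variant of the '$' slicing above using str.partition, which
# never raises when the symbol is missing. The price strings are made up
# but follow the "US $12.50" shape the scraper handles:
def _demo_cost(raw):
    if raw is None:
        return '-'
    _, sep, amount = raw.partition('$')
    return amount if sep else '-'

assert _demo_cost("US $12.50") == "12.50"
assert _demo_cost("Free shipping") == '-'
assert _demo_cost(None) == '-'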
def page_parse(self, response):
    item = response.meta['item']
    # Assemble the address from its street/locality/country fragments; any
    # of them may be missing from the page.
    parts = []
    for xpath in ('//span[@class="street-address"]/text()',
                  '//span[@class="locality"]/text()',
                  '//span[@class="country-name"]/text()'):
        fetched = Selector(response).xpath(xpath).extract_first()
        if fetched:
            parts.append(fetched)
    item['location'] = re.sub(' +', ' ', ' '.join(parts)).strip()
    features = Selector(response).xpath(
        '//div[@class="rating_and_popularity"]/span[@class="header_detail attraction_details"]/div[@class="detail"]/a'
    )
    # Join the feature texts once rather than concatenating with a trailing
    # separator and trimming it afterwards.
    item['features'] = ", ".join(
        feature.xpath('text()').extract()[0] for feature in features)
    yield item
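# The comma-joined feature string above in isolation: building a sequence
# and joining once handles the empty case for free, with no separator to
# trim off the end.
_features = ["Museums", "Sights & Landmarks"]
assert ", ".join(_features) == "Museums, Sights & Landmarks"
assert ", ".join([]) == ""  # no features -> empty string, nothing to trim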
def parse(self, response):
    nameList = response.xpath(
        '//*[@id="ListViewInner"]/li/h3/a/text()').extract()
    linkList = response.xpath(
        '//*[@id="ListViewInner"]/li/h3/a/@href').extract()
    raw_costList = response.xpath(
        '//*[@id="ListViewInner"]/li/ul[@class="lvprices left space-zero"]'
    ).extract()
    costList = []
    for htmlpart in raw_costList:
        # Extract the first price and keep only the amount after the '$'.
        raw_cost = Selector(text=htmlpart).xpath(
            '//ul[@class="lvprices left space-zero"]/li[@class="lvprice prc"]/span/text()'
        ).extract_first()
        try:
            result = raw_cost[raw_cost.index('$') + 1:]
        except (TypeError, ValueError):
            result = '-'
        costList.append(result)
    countryList_raw = response.xpath(
        '//*[@id="ListViewInner"]/li/ul[@class="lvdetails left space-zero full-width"]/li/text()'
    ).extract()
    countryList = []
    for cnt in countryList_raw:
        # Drop whitespace-only entries and keep the text from "From ..." onward.
        if not cnt.isspace():
            countryList.append(cnt[cnt.index('F'):])
    # Not all image URLs resolve correctly when pulled straight from @src, so
    # rebuild each thumbnail URL from the id embedded in the item link, e.g.
    # "http://thumbs.ebaystatic.com/images/g/B80AAOSwZQRYaBjw/s-l225.jpg",
    # where B80AAOSwZQRYaBjw comes from the item link.
    imgUrlList = []
    for link in linkList:
        img_id = link[link.rindex(':') + 1:]
        imgUrlList.append(
            "http://thumbs.ebaystatic.com/images/g/" + img_id + "/s-l225.jpg")
    for i in range(len(nameList)):
        try:
            yield {
                "name": nameList[i],
                "link": linkList[i],
                "cost": costList[i],
                "country": countryList[i],
                "imgUrl": imgUrlList[i]
            }
        except IndexError:
            pass
    # Re-use start_urls as a crude visited-page list to cap pagination.
    crawledLinks = self.start_urls
    # extract_first() returns None on the last page instead of raising
    # IndexError, which is what the None check below relies on.
    nextPageUrl = response.xpath(
        '//*[@class="gspr next"]/@href').extract_first()
    print("NPU >>> ", nextPageUrl)
    if nextPageUrl is not None and len(crawledLinks) < 5:
        crawledLinks.append(nextPageUrl)
        yield Request(response.urljoin(nextPageUrl), callback=self.parse)
    else:
        print("nextPageUrl not defined")
    print(crawledLinks)
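# The thumbnail derivation above in isolation: the image id is assumed to be
# the last ':'-separated segment of the listing link. The link below is made
# up but follows the shape described in the comment inside parse():
_link = "http://www.ebay.com/itm/widget/g:B80AAOSwZQRYaBjw"
_img_id = _link.rsplit(':', 1)[1]
assert ("http://thumbs.ebaystatic.com/images/g/%s/s-l225.jpg" % _img_id
        == "http://thumbs.ebaystatic.com/images/g/B80AAOSwZQRYaBjw/s-l225.jpg")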