def parse(self, response):
    """Scrape one listing page and store it through ``rubedo.insertContent``.

    Text fields are read with CSS/XPath selectors and encoded as UTF-8.
    A field whose selector matches nothing is passed as ``""``. ``lat`` and
    ``lon`` are the midpoint of the four bounding-box attributes read from
    the ``resume__map_new`` element; both are ``0`` when any of the four is
    missing or is not a valid number.

    Args:
        response: Response object exposing ``css()`` and ``xpath()``.

    Raises:
        AttributeError: If the page has no ``<title>`` text (see note below).
    """

    def first_utf8(query, default=""):
        # First XPath match encoded as UTF-8, or `default` when nothing
        # matches (extract_first() yields None in that case).
        value = response.xpath(query).extract_first()
        if value is None:
            return default
        return value.encode('utf-8')

    def hidden_input(name):
        # Value of the hidden <input> called `name`, or "" when absent.
        return first_utf8(
            '//input[(@type="hidden") and (@name="%s")]/@value' % name
        )

    # NOTE(review): unlike every other field, a missing <title> is not
    # defaulted; extract_first() returning None raises AttributeError here.
    # Behaviour kept as-is -- confirm whether that is intended.
    title = response.css('title::text').extract_first().encode('utf-8')

    subtitle = first_utf8('//*[@class="detail-title_subtitle"]/text()')
    price = first_utf8('//*[@id="price"]/text()')
    description = hidden_input('description')
    # The photo URL is passed on as extracted: not encoded, None when absent.
    photo = response.xpath(
        '//input[(@type="hidden") and (@name="urlphoto")]/@value'
    ).extract_first()
    ville = hidden_input('ville')
    codepostal = hidden_input('codepostal')
    typebien = hidden_input('typebien')
    surface = hidden_input('surface')

    # Attribute names (including the "boudingbox" spelling) are kept exactly
    # as queried by the original code.
    corner_query = '//div[@id="resume__map_new"]/@data-boudingbox-%s'
    corners = [
        first_utf8(corner_query % corner, default=None)
        for corner in (
            'northeast-latitude',
            'northeast-longitude',
            'southwest-latitude',
            'southwest-longitude',
        )
    ]

    # All-or-nothing: one missing or malformed corner zeroes both values.
    lat = 0
    lon = 0
    if not any(corner is None for corner in corners):
        try:
            ne_lat, ne_lon, sw_lat, sw_lon = [float(c) for c in corners]
        except ValueError:
            pass  # keep the 0/0 fallback
        else:
            lat = (ne_lat + sw_lat) / 2
            lon = (ne_lon + sw_lon) / 2

    rubedo.insertContent(
        title, subtitle, price, description, photo, ville, codepostal,
        typebien, surface, lat, lon,
    )
def parse(self, response):
    """Import one page as content through ``rubedo.insertContent``.

    The content id is the first run of digits found in the originally
    requested URL. When that URL contains no digits, the URL is printed and
    nothing is inserted.

    Args:
        response: Response whose ``meta['index']`` selects the entries of
            ``self.type``, ``self.taxo`` and ``self.workspaces`` passed
            along with the content.

    Returns:
        None.
    """
    item_index = response.meta['index']

    request = response.request
    # 'redirect_urls' is only set in meta when the request was actually
    # redirected; otherwise the request's own URL is the original one.
    redirect_urls = request.meta.get('redirect_urls')
    original_url = redirect_urls[0] if redirect_urls else request.url

    match = re.search(r'\d+', original_url)
    if match is None:
        # No numeric id in the URL: report it and skip the page.
        print(original_url)
        return
    content_id = match.group(0)

    title = response.css('h1::text').extract_first()

    # Start from the element with id "content", then narrow in two steps:
    # child elements that are not forms, then their child elements that
    # are not (and are not inside a div with) id "outils".
    content = response.xpath('//*[@id="content"]')
    content = content.xpath('*[not(self::form or ancestor::form)]')
    content = content.xpath('*[not(@id="outils" or ancestor::div/@id="outils")]')
    texte = "".join(content.extract())

    visuel = response.xpath('//img[contains(@src, "arton")]/@src').extract_first()

    # The title is intentionally passed twice (second and third argument),
    # exactly as the original call did.
    rubedo.insertContent(
        content_id, title, title, texte, visuel,
        self.type[item_index], self.taxo[item_index],
        self.workspaces[item_index],
    )