class DrivySpider(scrapy.Spider): name = "lamachineduvoisin" category = "daily" subcategory = "washing" allowed_domains = ["http://www.lamachineduvoisin.fr"] # scrap lamachineduvoisin par villes France = France() cities = France.cities start_urls = list(map(lambda x: "http://www.lamachineduvoisin.fr/fr/find/"+str(x), cities)) def parse(self, response): for sel in response.xpath('//div[@data-car-id]'): item = AdItem() empty = "" item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath("div[@class='search_card_content car_content']/a[@class='car_title']/@title").extract()[0] except: item['title'] = empty try: item['media'] = sel.xpath('div[@class="search_card_aside car_photo"]/img/@src').extract()[0] except: item['media'] = empty try: item['url'] = sel.xpath('div[@class="search_card_content car_content"]/a[@class="car_title"]/@href').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath('div[@class="search_card_content car_content"]/div[@class="car_subtitle"]/text()').extract()[0] except: item['description'] = empty try: item['location'] = sel.xpath('div[@class="search_card_content car_content"]/div[@class="car_location"]/text()[2]').extract()[0] except: item['location'] = empty item['latitude'] = empty item['longitude'] = empty try: item['price'] = sel.xpath('div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/strong/text()').extract()[0].encode('utf-8').strip('€') item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath('div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/text()').extract()[0] except: item['period'] = empty yield item
class EloueBricoSpider(scrapy.Spider): name = "zealguide" category = "leisure" subcategory = "visiting" France = France() allowed_domains = ["https://www.zealguide.com"] start_urls = [ "https://www.zealguide.com/fr?q=france&transaction_type=offering&view=list" ] def parse(self, response): for sel in response.xpath("//div[@class='home-list-item']"): item = AdItem() empty = "" item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath("div[2]/h2/a/text()").extract()[0] item['location'] = self.France.city_from_title(item['title']) except: item['title'] = empty item['location'] = empty try: item['media'] = sel.xpath('a/img/@src').extract()[0] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath( 'a/@href').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath( 'div[3]/div[2]/a/text()').extract()[0] except: item['description'] = empty item['latitude'] = empty item['longitude'] = empty try: item['price'] = sel.xpath('div/div/div/text()').extract( )[0].encode('utf-8').strip('€') item['currency'] = "€" except: item['price'] = empty item['currency'] = empty item['period'] = "day" yield item
class SharedparkingSpider(scrapy.Spider):
    name = "sharedparking"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.sharedparking.fr/"]
    France = France()
    cities = France.cities
    urls = list(map(lambda x: "http://www.sharedparking.fr/search?sc-cat=2&w=" + str(x), cities))
    start_urls = [url + "&page=" + str(i) for url in urls for i in range(1, 4)]

    def parse(self, response):
        for sel in response.xpath('//table[@class="annonces"]/tr'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('td/a/@title').extract()[0]
            except:
                item['title'] = empty
            item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('td/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('td[3]/text()').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('td[2]/span/span/text()').extract()[0]
            except:
                item['location'] = empty
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                price = sel.xpath('td[@style="text-align: right;"]/text()').extract()[0].split('/')
                item['price'] = price[0].encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
                item['period'] = empty
            yield item

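# Several spiders hand-split "<amount> €/<period>" price labels. A sketch of
# a shared parser (`split_price_period` is hypothetical, not in the original
# code); it assumes labels like u"12 €/jour" and degrades to empty strings.
def split_price_period(label):
    """Split u'12 €/jour' into ('12', 'jour'); ('', '') if unparseable."""
    try:
        amount, period = label.split('/', 1)
        return amount.replace(u'\u20ac', '').strip(), period.strip()
    except (ValueError, AttributeError):
        return '', ''
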
class BricolibSpider(scrapy.Spider):
    name = "bricolib"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.bricolib.net"]
    # scrape bricolib listing pages 1-199
    start_urls = list(map(lambda x: "http://www.bricolib.net/annonces/page/" + str(x), range(1, 200)))
    France = France()
    geo = France.geo

    def parse(self, response):
        for sel in response.xpath('//div[@class="post-block"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div[@class="post-left"]/a/@title').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = "https:" + sel.xpath('div[@class="post-left"]/a/@data-rel').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('div[@class="post-left"]/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="post-right"]/p[@class="post-desc"]/text()').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[@class="post-right"]/p[@class="post-meta"]/span[@class="cp_city"]/text()').extract()[0]
            except:
                item['location'] = empty
            try:
                item['postal_code'] = sel.xpath('div[@class="post-right"]/p[@class="post-meta"]/span[@class="cp_zipcode"]/text()').extract()[0]
            except:
                item['postal_code'] = 0
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                price = sel.xpath('div[@class="post-right"]/div[@class="price-wrap"]/p[@class="post-price"]/text()').extract()[0].split('/')
                item['price'] = price[0].strip(' ').encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty
            item['evaluations'] = empty
            yield item

class EloueBricoSpider(scrapy.Spider): name = "eloue" category = "daily" subcategory = "brico" allowed_domains = ["https://www.e-loue.com"] # scrap zilok by categories start_urls0 = list(map(lambda x: "https://www.e-loue.com/location/page/%s/?r=9"%str(x), range(1,20))) France = France() cities = France.cities start_urls= [url+'&l='+city for url in start_urls0 for city in cities ] def parse(self, response): for sel in response.xpath('//ol[@class="product-layout"]/li'): item = AdItem() empty = "" item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath("@name").extract()[0] except: item['title'] = empty try: item['media'] = "https:"+sel.xpath('div/div/a/img/@style').extract()[0].split(')')[0].split(':')[-1] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath('div/div/a/@href').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath('div/div[@class="info"]/p[@class="full_description"]/text()').extract()[0] except: item['description'] = empty try: item['location'] = sel.xpath('div/div[@class="info"]/p/text()').extract()[0] item['postal_code'] = int(item['location'].split(', ')[1]) except: item['location'] = empty item['postal_code'] = 0 try: item['latitude'] = sel.xpath("@locationx").extract()[0] except: item['latitude'] = empty try: item['longitude'] = sel.xpath("@locationy").extract()[0] except: item['longitude'] = empty try: price = sel.xpath('div/div/span[@class="badge price"]/text()').extract()[0].split('/') item['price'] = price[0].strip(' ').encode('utf-8').strip('€') item['period'] = price[1] item['currency'] = "€" except: item['price'] = empty item['period'] = empty item['currency'] = empty item['evaluations'] = empty yield item
class HousetripSpider(scrapy.Spider): name = "housetrip" category = "housing" subcategory = "apartment" allowed_domains = ["http://www.housetrip.fr"] France = France() geo = France.geo cities = geo.keys() start_urls_0 = list( map( lambda x: "http://www.housetrip.fr/fr/chercher-appartements-vacances/" + str( x), cities)) start_urls = [ url + "?page=" + str(x) for url in start_urls_0 for x in range(100) ] def parse(self, response): for sel in response.xpath('//li[@data-element-id]'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath( 'div[2]/div[1]/h3/a/text()').extract()[0] except: item['title'] = empty try: item['media'] = sel.xpath('div[1]/@style').extract()[0].split( '(')[1].split(')')[0].strip("'") except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath( 'div[2]/div[1]/h3/a/@href').extract()[0] except: item['url'] = empty try: desc0 = sel.xpath('div[2]/div/ul[1]/li[1]/text()').extract()[0] desc1 = sel.xpath('div[2]/div/ul[1]/li[2]/text()').extract()[0] #desc2 = sel.xpath('div[2]/div/ul[2]/li/text()').extract()[0] item['description'] = desc0 + " " + desc1 + " " except: item['description'] = empty try: item['location'] = sel.xpath( 'div[2]/div[1]/h4/text()').extract()[0] except: item['location'] = empty item['postal_code'] = empty item['evaluations'] = empty url_city = response.url.split('?')[0].split('/')[-1] try: item['latitude'] = float(self.geo[url_city]['lat']) except: item['latitude'] = empty try: item['longitude'] = float(self.geo[url_city]['lon']) except: item['longitude'] = empty try: item['price'] = sel.xpath('div[2]/div[3]/p/text()').extract( )[0].strip('\n').encode('utf-8').strip('€') item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath( 'div[2]/div[3]/p[2]/text()').extract()[0] except: item['period'] = empty yield item
class ParkadomSpider(scrapy.Spider):
    name = "parkadom"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.parkadom.com"]
    #start_urls = list(map(lambda x: "http://www.parkadom.com/location-parking/resultat-de-recherche?page"+str(x), range(1,52)))
    start_urls = ["http://www.parkadom.com/location-parking/resultat-de-recherche?group=100"]
    pattern = re.compile(r"\d{1,}")
    France = France()
    geo = France.geo

    def parse(self, response):
        for sel in response.xpath('//div[@class="box-parking-dispo"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div/span[@class="title-parking"]/text()').extract()[0]
                item['postal_code'] = searchZip(item['title'])
            except:
                item['title'] = empty
                item['postal_code'] = 0
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath('div/div/div[@class="detail-parking-left"]/div/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/div[@class="detail-parking-right"]/div[2]/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div/div[@class="detail-parking-left"]/div/img/@alt').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div/div/div/div/h1/span/text()').extract()[0]
            except:
                item['location'] = empty
            try:
                item['latitude'] = self.geo[item['location'].split(',')[-2].strip(' ')]['lat']
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location'].split(',')[-2].strip(' ')]['lon']
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div/div/div[@class="detail-parking-right"]/div/span/span/text()').extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath('div/div/div[@class="detail-parking-right"]/div/span/text()').extract()[0].strip('/')
            except:
                item['period'] = empty
            try:
                item['evaluations'] = re.search(self.pattern, sel.xpath('div/div/div[@class="detail-parking-left"]/div/div/span/text()').extract()[0]).group()
            except:
                item['evaluations'] = empty
            yield item

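# searchZip() is used above and below but defined elsewhere in the project.
# A plausible sketch of what it does, assuming it extracts the first 5-digit
# French postal code from free text and falls back to 0 (the name
# search_zip_sketch is deliberately distinct from the real helper).
FR_ZIP_RE = re.compile(r'\b\d{5}\b')

def search_zip_sketch(text):
    """Return the first 5-digit postal code found in `text`, or 0."""
    match = FR_ZIP_RE.search(text)
    return int(match.group()) if match else 0
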
class SailsharingSpider(scrapy.Spider): name = "sailsharing" category = "leisure" subcategory = "boat" allowed_domains = ["http://www.sailsharing.com"] # scrap zilok by categories start_urls = list(map(lambda x: "http://www.sailsharing.com/fr/location-bateau/search?page="+str(x), range(1,36))) France = France() geo = France.geo def parse(self, response): for sel in response.xpath('//div[@class="block"]'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath('div/h2/a/text()').extract()[0].strip("\n ") except: item['title'] = empty try: item['media'] = self.allowed_domains[0] + sel.xpath('a/img/@src').extract()[0] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath('a/@href').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath('div/div[@class="boat-info"]/text()').extract()[0].strip("\n ") except: item['description'] = empty try: item['evaluations'] = sel.xpath('div/div[@class="boat-skipper"]/div[@class="nb-commentaires"]/span[@class="nb-com"]/text()').extract()[0].strip("\n ") except: item['evaluations'] = empty try: item['location'] = sel.xpath('div/div/h4/strong/text()').extract()[0].strip(' -') except: item['location'] = empty try: item['latitude'] = self.geo[item['location']]['lat'] except: item['latitude'] = empty try: item['longitude'] = self.geo[item['location']]['lon'] except: item['longitude'] = empty try: item['price'] = sel.xpath('div[@class="hosting-meta"]/div/span/strong/text()').extract()[0].encode('utf-8').strip('€') item['currency'] = '€' except: item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath('div[3]/span/text()').extract()[0] except: item['period'] = empty item['postal_code'] = empty yield item
class ZilokManutentionSpider(scrapy.Spider):
    # NOTE: crawls housetrip.fr search pages but labels items as "zilok".
    name = "zilokmanutention"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.housetrip.fr"]
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "http://www.housetrip.fr/fr/rechercher/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(100)]

    def parse(self, response):
        for sel in response.xpath('//div[@data-element-id]'):
            item = AdItem()
            empty = ''
            item['source'] = "zilok"
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div[2]/h3/a/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div[1]/@style').extract()[0].split('(')[1].split(')')[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('div[2]/h3/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                desc0 = sel.xpath('div[2]/div/ul[1]/li[1]/text()').extract()[0]
                desc1 = sel.xpath('div[2]/div/ul[1]/li[2]/text()').extract()[0]
                desc2 = sel.xpath('div[2]/div/ul[2]/li/text()').extract()[0]
                item['description'] = desc0 + " " + desc1 + " " + desc2
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[2]/h4/text()').extract()[0]
            except:
                item['location'] = empty
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div[3]/div/p/text()').extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath('div[3]/div/p[2]/text()').extract()[0]
            except:
                item['period'] = empty
            yield item

class ClickandboatSpider(scrapy.Spider):
    name = "clickandboat"
    category = "leisure"
    subcategory = "boat"
    allowed_domains = ["https://www.clickandboat.com"]
    France = France()
    cities = France.cities
    urls = list(map(lambda x: "https://www.clickandboat.com/location-bateau/search?where=" + str(x), cities))
    start_urls = [url + "&_page=" + str(i) for i in range(30) for url in urls]

    def parse(self, response):
        for sel in response.xpath('//ul[@id="results"]/li'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div/div[2]/a/h2[@class="titre-annonce"]/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath('div/div/a/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = "capacite " + sel.xpath('div/div[2]/div/div/div[2]/div[2]/p/span/text()').extract()[0] + " personnes"
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div/div[2]/div/div/div[1]/div[2]/p/span/text()').extract()[0]
            except:
                item['location'] = empty
            item['postal_code'] = 0
            try:
                item['latitude'] = sel.xpath('div/input[@class="annonce_lat"]/@value').extract()[0]
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('div/input[@class="annonce_ltd"]/@value').extract()[0]
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div/div[3]/h2/b/span[@class="prix"]/text()').extract()[0]
                item['currency'] = '€'
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['evaluations'] = sel.xpath('div/div[2]/div/div/div[4]/div[2]/p/span[1]').extract()[0]
            except:
                item['evaluations'] = empty
            try:
                item['period'] = sel.xpath('div/div[3]/h2/small[2]/sup/text()').extract()[0].strip('/')
            except:
                item['period'] = empty
            yield item

class MonsieurParkingSpider(scrapy.Spider):
    name = "monsieurparking"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.monsieurparking.com"]
    # scrape by cities
    France = France()
    geo_cities = France.geo
    cities = geo_cities.keys()
    start_urls = list(map(lambda x: "http://www.monsieurparking.com/location/" + str(x) + ".html", cities))

    def parse(self, response):
        print response.url
        for sel in response.xpath("//div[@id='loginbox']"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div/div/div/div/p/a/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['location'] = response.url.split('/')[-1].split('.')[0]
            except:
                item['location'] = empty
            item['postal_code'] = 0
            try:
                item['media'] = sel.xpath('div[@class="detail"]/img/@src').extract()[0]
            except:
                item['media'] = self.allowed_domains[0] + "/images/parking-orange-26x26.png"
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath("div/div/div/div/p/a/@href").extract()[0]
            except:
                item['url'] = empty
            try:
                desc0 = sel.xpath('div/div/div/div/span/text()').extract()[0]
                desc1 = sel.xpath('div/div/div/div/span[2]/text()').extract()[0]
                item['description'] = desc0 + ", " + desc1
            except:
                item['description'] = empty
            try:
                item['latitude'] = float(self.geo_cities[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo_cities[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath("div/div/div/div/span[3]/text()").extract()[0].split('/')[0].encode('utf-8').split('€')[0]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath("div/div/div/div/span[3]/text()").extract()[0].split('/')[1]
            except:
                item['period'] = empty
            item['evaluations'] = empty
            yield item

class HousetripSpider(scrapy.Spider): name = "pretersonjardin" category = "eating" subcategory = "gardens" allowed_domains = ["http://www.pretersonjardin.com"] pages = 18 * range(1, 1000) start_urls = list( map( lambda x: "http://www.pretersonjardin.com/annonces/toutes-les-annonces/Page-%s.html" % str(x), range(1, 100))) France = France() geo = France.geo def parse(self, response): for sel in response.xpath('//tr'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath( 'td[@id="colonne4"]/div[@id="title_ad"]/a/text()').extract( )[0].strip(' ').title() except: item['title'] = empty item['media'] = empty try: item['url'] = sel.xpath( 'td[@id="colonne4"]/div[@id="title_ad"]/a/@href').extract( )[0] except: item['url'] = empty try: item['description'] = sel.xpath( 'td[@id="colonne4"]/div[@id="text_ad"]/a/text()').extract( )[0] except: item['description'] = empty try: item['location'] = sel.xpath('td[@id="colonne3"]/text()' ).extract()[0].strip(' ').title() except: item['location'] = empty try: item['latitude'] = self.geo[item['location']]['lat'] except: item['latitude'] = empty try: item['longitude'] = self.geo[item['location']]['lon'] except: item['longitude'] = empty item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath( 'td[@id="colonne5"]/div/text()').extract()[0] except: item['period'] = empty item['postal_code'] = empty item['evaluations'] = empty yield item
class OuicarSpider(scrapy.Spider):
    name = 'ouicar'
    category = 'moving'
    subcategory = "car"
    allowed_domains = ["http://www.ouicar.fr"]
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "http://www.ouicar.fr/car/search?where=" + str(x), cities))
    start_urls = [url + "&page=" + str(x) for url in start_urls_0 for x in range(100)]

    def parse(self, response):
        for sel in response.xpath('//tr[@data-dpt]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('td/div/a/h3/text()').extract()[0] + sel.xpath('td/div/a/h3/small/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = "https:" + sel.xpath('td/span/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('td/div/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                desc0 = sel.xpath('td/div/p[@class="ZAuto_content"]/text()').extract()[0]
                desc1 = sel.xpath('td/div/div[@class="z-car-search-livraison"]/text()').extract()[0]
                item['description'] = desc0 + "\n" + desc1
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('@data-city').extract()[0]
            except:
                item['location'] = empty
            try:
                item['latitude'] = sel.xpath('@data-lat').extract()[0]
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('@data-lng').extract()[0]
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('td[2]/p/text()').extract()[0].encode('utf-8').split('€')[0].strip('\n\t')
                item['currency'] = '€'
            except:
                item['price'] = empty
                item['currency'] = empty
            item['period'] = "jour"
            try:
                res = sel.xpath('td/div/p[@class="ZAuto_location"]/text()').extract()[0]
                item['postal_code'] = searchZip(res)
            except:
                item['postal_code'] = empty
            try:
                item['evaluations'] = sel.xpath('td/div/a/h3/small[@class="ZAuto_title_ratings"]/text()').extract()[0].strip('( )')
            except:
                item['evaluations'] = empty
            yield item

class ZilokSpider(scrapy.Spider):
    # Queries the zilok search API (XML) once per city in the geo table.
    name = "zilok"
    category = "daily"
    subcategory = "brico"
    allowed_domains = ["http://www.zilok.fr"]
    France = France()
    cities = France.geo
    start_urls = []
    for k, v in cities.items():
        url = "http://fr.zilok.com/apiv2/index.php/item/search/api/?action=item.search&api_key=akaka12JHKLAs455saasasa54sJLJLA&distance=15000&language=2&lat=" + str(v["lat"]) + "&limit=1000&lng=" + str(v["lon"]) + "&real_search=1&where=" + k
        start_urls.append(url)
    print start_urls

    def parse(self, response):
        for sel in response.xpath('//item[@id]'):
            item = AdItem()
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            empty = ""
            try:
                item['title'] = sel.xpath('title/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('image/palm/@url').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('link/text()').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('subtitle/text()').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('location/locality/text()').extract()[0]
            except:
                item['location'] = empty
            try:
                item['postal_code'] = sel.xpath('location/postal_code/text()').extract()[0]
            except:
                item['postal_code'] = empty
            try:
                item['latitude'] = sel.xpath('location/lat/text()').extract()[0] if len(sel.xpath('location/lat/text()').extract()[0]) > 1 else sel.xpath('/search/lat/text()').extract()[0]
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('location/lng/text()').extract()[0] if len(sel.xpath('location/lng/text()').extract()[0]) > 1 else sel.xpath('/search/lng/text()').extract()[0]
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('price/text()').extract()[0]
            except:
                item['price'] = empty
            try:
                item['currency'] = sel.xpath('price/@currency').extract()[0]
            except:
                item['currency'] = empty
            try:
                item['evaluations'] = sel.xpath('evaluation_number/text()').extract()[0]
            except:
                item['evaluations'] = empty
            item['period'] = "jour"
            yield item

class EzilizeSpider(scrapy.Spider):
    name = "ezilize"
    categories = {
        "bricolage": {"category": "daily", "subcategory": "brico"},
        "evenements": {"category": "meet", "subcategory": "events"},
        "mode-vetements": {"category": "daily", "subcategory": "dressing"},
        "sports-loisirs": {"category": "leisure", "subcategory": "sport"},
        "vehicules": {"category": "moving", "subcategory": "car"},
    }
    allowed_domains = ["https://ezilize.fr"]
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "https://ezilize.fr/location/" + str(x), categories))
    start_urls = [url + "?p=" + str(x) for url in start_urls_0 for x in range(10)]

    def parse(self, response):
        for sel in response.xpath('//div[@itemtype]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            category = response.url.split('?')[0].split('/')[-1]
            item['category'] = self.categories[category]["category"]
            item['subcategory'] = self.categories[category]["subcategory"]
            try:
                item['title'] = sel.xpath('div/div[@class="nsadtitle"]/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = "https:" + sel.xpath('div/div/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div[@class="nsadprice"]/div/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div[@class="nsadsub"]/text()').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[2]/div[3]/span[2]/text()').extract()[0]
                item['postal_code'] = int(item['location'].split(' - ')[0])
            except:
                item['location'] = empty
                item['postal_code'] = 0
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div[@class="nsadprice"]/div[@class="nsofferamount"]/text()').extract()[0].encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            item['period'] = "jour"
            item['evaluations'] = empty
            yield item

class WimduSpider(scrapy.Spider):
    name = "wimdu"
    category = "housing"
    subcategory = "apartment"
    allowed_domains = ["http://www.wimdu.fr"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "http://www.wimdu.fr/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(10)]

    def parse(self, response):
        for sel in response.xpath("//ul[@id='results']/li"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath("div/div[2]/div[@class='offer__details']/h3/a/text()").extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div/div/a/img[2]/@data-src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath("div/div[2]/div[@class='offer__details']/div[@class='offer__description']/text()").extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath("div/div[2]/div[@class='offer__details']/div[@class='offer__subtitle']/text()").extract()[0]
            except:
                item['location'] = empty
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath("div/div[2]/div[@class='price price--mini js-price-per-night']/div/text()[2]").extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath("div/div[2]/div[@class='price price--mini js-price-per-night']/div[2]/text()").extract()[0]
            except:
                item['period'] = empty
            yield item

class OwlcampSpider(scrapy.Spider):
    name = "owlcamp"
    category = "housing"
    subcategory = "camping"
    allowed_domains = ["http://owlcamp.com"]
    France = France()
    geo = France.geo
    start_urls = list(map(lambda x: "http://owlcamp.com/fre/gardens/all/page:%s" % str(x), range(2, 15)))
    start_urls.append("http://owlcamp.com/fre/gardens/all")

    def parse(self, response):
        for sel in response.xpath('//div[@class="garden-card"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div[@class="garden-card__location"]/text()').extract()[0].strip(' \n')
            except:
                item['title'] = empty
            try:
                item['media'] = self.allowed_domains[0] + sel.xpath('a[@rel]/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('a[@rel]/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="garden-card__location"]/text()').extract()[0].strip(' \n')
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[@class="garden-card__location"]/text()').extract()[0].strip(' \n')
            except:
                item['location'] = empty
            try:
                item['latitude'] = str(self.geo[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = str(self.geo[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                price = sel.xpath('div[@class="garden-card__price"]/div/text()').extract()[0].strip(' ')
                if price == "gratuit":
                    item['price'] = 0
                else:
                    item['price'] = price.split('/')[0]
                    item['period'] = price.split('/')[-1]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty
            item['postal_code'] = empty
            item['evaluations'] = empty
            yield item

class CookeningSpider(scrapy.Spider):
    name = "cookening"
    category = "eating"
    subcategory = "meals"
    allowed_domains = ["https://www.cookening.com"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls = list(map(lambda x: "https://www.cookening.com/fr/explore/" + str(x), cities))
    geo = France.geo

    def parse(self, response):
        for sel in response.xpath("//ul[@id='MealCards']/li"):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath("a/div[@id='myCarouselGroup']/div[@class='Title myCarousel']/div[@class='Info']/h3/text()").extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('a/div/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                desc0 = sel.xpath("a/div[@class='Host']/span[@class='Name']/text()").extract()[0]
                desc1 = sel.xpath("a/div[@class='Host']/span[@class='Bio']/text()").extract()[0]
                item['description'] = desc0 + " " + desc1
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath("a/div[2]/div[2]/div/span[@class='Place']/text()").extract()[0]
            except:
                item['location'] = empty
            item['postal_code'] = 0
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath("a/div[2]/div[2]/div/span[@class='Contribution']/strong/text()").extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath("a/div[2]/div[2]/div/span[@class='Contribution']/span/text()").extract()[0]
            except:
                item['period'] = empty
            item['evaluations'] = empty
            yield item

class SailsharingSpider(scrapy.Spider): name = "wikicampers" category = "moving" subcategory = "camping car" allowed_domains = ["http://www.wikicampers.fr"] # scrap zilok by categories France = France() geo = France.geo cities = geo.keys() start_urls = list( map( lambda x: "http://www.wikicampers.fr/annonces-location-camping-car/" + str( x), cities)) def parse(self, response): for sel in response.xpath('//div[@class="annonces"]'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath( 'div/h3/a/text()').extract()[0].strip("\n ") except: item['title'] = empty try: item['media'] = self.allowed_domains[0] + sel.xpath( 'div/a/img/@src').extract()[0].split('..')[-1] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath( 'div/a/@href').extract()[0] except: item['url'] = empty try: item['evaluations'] = int( sel.xpath('div/h3/a[2]/text()[2]').extract()[0].strip(' ')) except: item['evaluations'] = empty try: item['description'] = sel.xpath( 'div[@class="grid_inner annonce"]/p/text()').extract( )[0].strip("\n ") except: item['description'] = empty try: item['location'] = sel.xpath( 'div/div[@class="city"]/text()').extract()[0].strip("\n ") except: item['location'] = empty try: item['latitude'] = float(self.geo[item['location']]['lat']) except: item['latitude'] = empty try: item['longitude'] = float(self.geo[item['location']]['lon']) except: item['longitude'] = empty try: item['price'] = sel.xpath('div/span/text()').extract( )[0].strip("\n ").split(' ')[0].encode('utf-8').strip('€') item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath( 'div/span/text()').extract()[0].strip("\n ").split(' ')[-1] except: item['period'] = empty item['postal_code'] = empty yield item
class MobyparkSpider(scrapy.Spider):
    # Queries the Mobypark JSON API per city.
    name = "mobypark"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["http://www.mobypark.fr"]
    France = France()
    cities = France.cities
    start_urls = list(map(lambda x: "https://www.mobypark.fr/api/offers?format=json&distance=15&radius=15&q=" + str(x), cities))

    def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        result = jsonresponse["result"]
        if 'offers' in result:
            results = result["offers"]
            for sel in results:
                item = AdItem()
                empty = ""
                item['source'] = self.name
                item['category'] = self.category
                item['subcategory'] = self.subcategory
                try:
                    item['title'] = sel['car_park']['location']['formatted_address']
                except:
                    item['title'] = empty
                try:
                    item['media'] = sel["car_park"]["first_picture"]["url"]
                    print item['media']
                except:
                    item['media'] = empty
                try:
                    url_id = sel["car_park"]["id"]
                    item['url'] = self.allowed_domains[0] + "/carpark/" + str(url_id) + "/show"
                except:
                    item['url'] = empty
                try:
                    item['description'] = sel["car_park"]["description"]
                except:
                    item['description'] = empty
                try:
                    item['location'] = sel['car_park']['location']['formatted_address']
                    item['postal_code'] = searchZip(item['location'])
                except:
                    item['location'] = empty
                    item['postal_code'] = 0
                try:
                    item['latitude'] = sel['car_park']['location']['latitude']
                except:
                    item['latitude'] = empty
                try:
                    item['longitude'] = sel['car_park']['location']['longitude']
                except:
                    item['longitude'] = empty
                try:
                    item['price'] = sel["car_park"]["day_rate"]
                    item['currency'] = "€"
                except:
                    item['price'] = empty
                    item['currency'] = empty
                try:
                    item['period'] = sel["car_park"]["minimal_duration"]
                except:
                    item['period'] = empty
                item['evaluations'] = empty
                yield item

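# MobyparkSpider wraps every JSON key lookup in try/except. A sketch of a
# small path walker expressing the same fallback with one helper; the
# payload shape is assumed from the fields the spider already reads, and
# `json_path` is a hypothetical name, not part of the original project.
def json_path(node, *keys):
    """Walk nested dicts by key; return '' if any step is missing."""
    for key in keys:
        if not isinstance(node, dict) or key not in node:
            return ''
        node = node[key]
    return node
# e.g. json_path(sel, 'car_park', 'location', 'latitude')
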
class YoopiesSpider(scrapy.Spider):
    name = "yoopies"
    category = "daily"
    subcategory = "babysitting"
    allowed_domains = ["https://yoopies.fr"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls = list(map(lambda x: "https://yoopies.fr/recherche-baby-sitting/results?c=" + str(x), cities))

    def parse(self, response):
        for sel in response.xpath('//article'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('a/div[2]/header/h1/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('a/aside/figure/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('a/div[2]/p[@class="description"]/text()').extract()[0].strip('\n')
            except:
                item['description'] = empty
            try:
                item['latitude'] = sel.xpath('a/@data-latitude').extract()[0]
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('a/@data-longitude').extract()[0]
            except:
                item['longitude'] = empty
            try:
                item['location'] = sel.xpath('a/aside/div[@class="user-city"]/text()').extract()[0]
            except:
                item['location'] = empty
            item['postal_code'] = empty
            item['evaluations'] = empty
            item['price'] = empty
            item['currency'] = empty
            item['period'] = empty
            yield item

class HousetripSpider(scrapy.Spider): name = "cavientdujardin" category = "eating" subcategory = "vegetables" allowed_domains = ["http://www.cavientdujardin.com"] start_urls = list( map( lambda x: "http://www.cavientdujardin.com/petites-annonces/0-0-0-0-%s.html" % str(x), range(1, 10))) France = France() geo = France.geo def parse(self, response): for sel in response.xpath('//div[@class="LigneAnnonce"]'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath( 'div[@class="ListDet"]/a[@class="ListTitre1"]/text()' ).extract()[0] except: item['title'] = empty try: item['media'] = sel.xpath( 'div[@class="ListImg"]/img/@src').extract()[0] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath( 'div[@class="ListDet"]/a/@href').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath( 'div[@class="ListDet"]/a[@class="ListTitre"]/text()' ).extract()[0] except: item['description'] = empty try: item['location'] = sel.xpath( 'div[@class="ListDet"]/span[@class="ville"]/text()' ).extract()[0] except: item['location'] = empty item['postal_code'] = empty try: item['latitude'] = float(self.geo[item['location']]['lat']) except: item['latitude'] = empty try: item['longitude'] = float(self.geo[item['location']]['lon']) except: item['longitude'] = empty try: item['price'] = sel.xpath( 'div[@class="ListDet"]/span[@class="ListPrix"]/text()' ).extract()[0] item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: item['period'] = sel.xpath( 'div[@class="ListCol1"]/text()').extract()[0] except: item['period'] = empty item['evaluations'] = empty yield item
class HousetripSpider(scrapy.Spider): name = "chambrealouer" category = "housing" subcategory = "room" allowed_domains = ["http://fr.chambrealouer.com"] France = France() cities = France.cities start_urls = list(map(lambda x: "http://fr.chambrealouer.com/location/FR-France/"+str(x), cities)) def parse(self, response): for sel in response.xpath('//div[@class="rentResult ad-list-item"]'): item = AdItem() empty = '' item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath('div[@class="detail"]/img/@alt').extract()[0] except: item['title'] = empty try: item['media'] = sel.xpath('div[@class="detail"]/img/@src').extract()[0] except: item['media'] = empty try: item['url'] = sel.xpath('div[@class="detail"]/meta/@content').extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath('div[@class="detail"]/div/p/span/text()').extract()[0] except: item['description'] = empty try: item['location'] = sel.xpath('//div[@class="rentResult ad-list-item"]/div[@class="detail"]/div/div[@itemprop="address"]/span[@class="location"]/span/text()').extract()[0] except: item['location'] = empty item['postal_code'] = 0 item['latitude'] = sel.xpath('div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="latitude"]/@content').extract()[0] item['longitude'] = sel.xpath('div[@class="detail"]/div/div[@itemprop="geo"]/meta[@itemprop="longitude"]/@content').extract()[0] try: price0 = sel.xpath('table/tr[2]/td[1]/text()').extract()[0].encode('utf-8').strip('€') price1 = sel.xpath('table/tr[2]/td[2]/text()').extract()[0].encode('utf-8').strip('€') price2 = sel.xpath('table/tr[2]/td[3]/text()').extract()[0].encode('utf-8').strip('€') item['price'] = price0 + ", " + price1 + ", " + price2 item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: period0 = sel.xpath('table/tr/td[1]/text()').extract()[0] period1 = sel.xpath('table/tr/td[2]/text()').extract()[0] period2 = sel.xpath('table/tr/td[3]/text()').extract()[0] item['period'] = period0 + ", " + period1 + ", " + period2 except: item['period'] = empty item['evaluations'] = empty yield item
class BandbikeSpider(scrapy.Spider):
    name = "bandbike"
    category = "moving"
    subcategory = "velo"
    allowed_domains = ["http://bandbike.com"]
    start_urls = []
    France = France()
    geo = France.geo
    cities = geo.keys()
    # NOTE: these lookups run synchronously at import time; see the deferred
    # sketch after this class for the usual Scrapy alternative.
    for city in cities:
        url = "http://bandbike.com/ref/city/" + city
        req = requests.get(url=url)
        res = json.loads(req.text)
        for r in res:
            url = "http://bandbike.com/ad/search?terms=%s+(%s)&searchCityId=%s" % (r['name'], r['zipcode'], r['id'])
            urls = [url + "&currentPage=" + str(x) for x in xrange(1, 10)]
            start_urls += urls

    def parse(self, response):
        for sel in response.xpath('//div[@class="row"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath("div/div/div/h4/text()").extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div/div/div/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/div/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div/div/div/div/div/h5/text()').extract()[0]
            except:
                item['description'] = empty
            item['location'] = response.url.split('terms=')[1].split('+')[0]
            item['postal_code'] = response.url.split('terms=')[1].split('+')[1].split(')')[0].strip('(')
            item['evaluations'] = empty
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                price = sel.xpath('div/div/div/div/div[3]/h5/text()').extract()[0].split('/')
                item['price'] = price[0].strip(' ').encode('utf-8').strip('€')
                item['period'] = price[1]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['period'] = empty
                item['currency'] = empty
            yield item

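# A sketch of the deferred alternative noted in BandbikeSpider: build the
# city-lookup requests in start_requests() instead of at import time, so a
# network failure no longer breaks module import. Method names and the
# spider name suffix are illustrative, not from the original project.
class BandbikeDeferredSketch(scrapy.Spider):
    name = "bandbike_deferred_sketch"

    def start_requests(self):
        for city in France().geo.keys():
            yield scrapy.Request("http://bandbike.com/ref/city/" + city,
                                 callback=self.parse_city_refs)

    def parse_city_refs(self, response):
        for r in json.loads(response.body_as_unicode()):
            url = ("http://bandbike.com/ad/search?terms=%s+(%s)&searchCityId=%s"
                   % (r['name'], r['zipcode'], r['id']))
            yield scrapy.Request(url + "&currentPage=1", callback=self.parse)

    def parse(self, response):
        pass  # item extraction would mirror BandbikeSpider.parse above
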
class VizeatSpider(scrapy.Spider):
    name = "vizeat"
    category = "eating"
    subcategory = "meals"
    allowed_domains = ["https://fr.vizeat.com"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls = list(map(lambda x: "https://fr.vizeat.com/events/search?q=" + str(x), cities))

    def parse(self, response):
        for sel in response.xpath('//div[@class="itemInside event-box p15"]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('a/img/@title').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('a/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div/div/h2/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="dateHeureEvent"]/a/text()').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[@class="author"]/div[@class="authorRight"]/a[2]/text()').extract()[0]
            except:
                item['location'] = empty
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath("div/div[2]/div/text()").extract()[0].strip('\n').encode('utf-8').strip('€')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath('div[@class="dateHeureEvent"]/a/text()').extract()[0]
            except:
                item['period'] = empty
            yield item

class DrivySpider(scrapy.Spider): name = "drivy" category = "moving" subcategory = "car" allowed_domains = ["https://www.drivy.com"] # scrap zilok by categories France = France() geo = France.geo urls = [] for k, v in geo.items(): url = "https://www.drivy.com/search?latitude=" + str( v["lat"]) + "&longitude=" + str( v["lon"]) + "&city_display_name=" + k + "&area_type=city" urls.append(url) start_urls = [ url + "&page=" + str(i) for url in urls for i in xrange(1, 51) ] def parse(self, response): for sel in response.xpath('//div[@data-car-id]'): item = AdItem() empty = "" item['source'] = self.name item['category'] = self.category item['subcategory'] = self.subcategory try: item['title'] = sel.xpath( "div[@class='search_card_content car_content']/a[@class='car_title']/@title" ).extract()[0] except: item['title'] = empty try: item['media'] = sel.xpath( 'div[@class="search_card_aside car_photo"]/img/@src' ).extract()[0] except: item['media'] = empty try: item['url'] = self.allowed_domains[0] + sel.xpath( 'div[@class="search_card_content car_content"]/a[@class="car_title"]/@href' ).extract()[0] except: item['url'] = empty try: item['description'] = sel.xpath( 'div[@class="search_card_content car_content"]/div[@class="car_subtitle"]/text()' ).extract()[0] except: item['description'] = empty try: item['location'] = sel.xpath( 'div[@class="search_card_content car_content"]/div[@class="car_location"]/text()[2]' ).extract()[0].strip('\n') except: item['location'] = response.url.split( 'city_display_name=')[1].split('&')[0] try: item['evaluations'] = float( sel.xpath('div[2]/div[3]/div/span/text()').extract()[0]) except: item['evaluations'] = empty item['postal_code'] = empty url_city = response.url.split('city_display_name=')[1].split( '&')[0] try: item['latitude'] = float(self.geo[url_city]['lat']) except: item['latitude'] = empty try: item['longitude'] = float(self.geo[url_city]['lon']) except: item['longitude'] = empty try: item['price'] = sel.xpath( 'div[@class="search_card_content car_content"]/span[@class="js_car_price car_price"]/strong/text()' ).extract()[0].encode('utf-8').strip('€') item['currency'] = "€" except: item['price'] = empty item['currency'] = empty try: item['period'] = "jour" except: item['period'] = empty yield item
class OuistockSpider(scrapy.Spider):
    name = "ouistock"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["https://www.ouistock.fr"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "https://www.ouistock.fr/s/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(100)]

    def parse(self, response):
        for sel in response.xpath('//ul[@id="results"]/li'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/h3[@class="resultUserName"]/text()').extract()[0].strip('\n ')
            except:
                item['title'] = empty
            try:
                item['media'] = "https:" + sel.xpath('div[@class="resultContainer"]/div[@class="resultImgContainer"]/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('div[@class="resultContainer"]/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                desc0 = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultType"]/text()').extract()[0].strip('\n ')
                desc1 = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()').extract()[0].strip('\n ')
                item['description'] = desc0 + " " + desc1
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[@class="resultContainer"]/div[@class="resultInfos"]/span[@class="resultUsefull"]/text()').extract()[0].strip('\n ').split(' ')[-1]
            except:
                item['location'] = empty
            item['latitude'] = empty
            item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()').extract()[0].encode('utf-8').strip('\n €')
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath('div[@class="resultContainer"]/div[@class="priceSpan"]/div[@class="innerSpan"]/i/text()').extract()[0].strip("\n' /")
            except:
                item['period'] = empty
            yield item

class CostockageSpider(scrapy.Spider):
    name = "costockage"
    category = "storing"
    subcategory = "space"
    allowed_domains = ["https://www.costockage.fr"]
    France = France()
    cities = France.cities
    geo = France.geo
    start_urls_0 = list(map(lambda x: "https://www.costockage.fr/garde-meuble/%s-5&plus-proche=10" % str(x), cities))
    start_urls = [url + "&" + "page=" + str(x) for url in start_urls_0 for x in range(10)]

    def parse(self, response):
        for sel in response.xpath('//div[@itemtype="http://schema.org/Product"]'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('@title').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div[1]/div[@class="customer_name_search"]/p/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = sel.xpath('@id').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[1]/div[@class="address"]/text()[2]').extract()[0]
            except:
                item['description'] = empty
            try:
                item['location'] = sel.xpath('div[1]/div[@class="address"]/a/text()').extract()[0]
                item['postal_code'] = int(item['location'].split('- ')[1])
            except:
                item['location'] = empty
                item['postal_code'] = 0
            try:
                item['latitude'] = float(self.geo[item['location']]['lat'])
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = float(self.geo[item['location']]['lon'])
            except:
                item['longitude'] = empty
            try:
                item['price'] = sel.xpath('div[3]/div[@class="price_div"]/div[@class="new_price"]/b/text()').extract()[0].encode('utf-8').split('€')[0]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            try:
                item['period'] = sel.xpath('div[3]/div[@class="price_div"]/div[@class="new_price"]/text()[2]').extract()[0].strip('/')
            except:
                item['period'] = empty
            item['evaluations'] = empty
            yield item

class PrendsmaplaceSpider(scrapy.Spider):
    name = "prendsmaplace"
    category = "parking"
    subcategory = "parking"
    allowed_domains = ["http://www.prendsmaplace.fr"]
    pattern = re.compile(r'\d{2}')
    France = France()
    cities = France.cities
    geo = France.geo
    start_urls = list(map(lambda x: "http://www.prendsmaplace.fr/page/%s/?s&geo-radius=100&geo-lat&geo-lng&categories=0&locations=0&dir-search=yes" % str(x), range(1, 25)))

    def parse(self, response):
        for sel in response.xpath('//ul[@class="items"]/li'):
            item = AdItem()
            empty = ""
            item['source'] = self.name
            item['category'] = self.category
            item['subcategory'] = self.subcategory
            try:
                item['title'] = sel.xpath('div[@class="description"]/h3/a/text()').extract()[0]
            except:
                item['title'] = empty
            try:
                item['location'] = item['title'].split(' (')[0].split(' ')[-1]
            except:
                item['location'] = empty
            try:
                item['media'] = sel.xpath('div[@class="thumbnail"]/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['evaluations'] = sel.xpath('div[@class="thumbnail"]/div[@class="comment-count"]/text()').extract()[0]
            except:
                item['evaluations'] = empty
            try:
                item['url'] = sel.xpath('div[@class="description"]/h3/a/@href').extract()[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[@class="description"]/text()[3]').extract()[0]
            except:
                item['description'] = empty
            try:
                item['latitude'] = self.geo[item['location']]['lat']
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = self.geo[item['location']]['lon']
            except:
                item['longitude'] = empty
            item['price'] = empty
            item['currency'] = empty
            item['period'] = empty
            try:
                item['postal_code'] = re.search(self.pattern, item['title']).group()
            except:
                item['postal_code'] = empty
            yield item

class AirbnbSpider(scrapy.Spider):
    name = "airbnb"
    category = "housing"
    #subcategory = "room"
    allowed_domains = ["https://www.airbnb.com"]
    # scrape by cities
    France = France()
    cities = France.cities
    start_urls_0 = list(map(lambda x: "https://www.airbnb.fr/s/" + str(x), cities))
    start_urls = [url + "?page=" + str(x) for url in start_urls_0 for x in range(10)]

    def parse(self, response):
        for sel in response.xpath('//div[@data-id]'):
            item = AdItem()
            empty = ''
            item['source'] = self.name
            item['category'] = self.category
            try:
                item['title'] = sel.xpath('@data-name').extract()[0]
            except:
                item['title'] = empty
            try:
                item['media'] = sel.xpath('div/a/div/img/@src').extract()[0]
            except:
                item['media'] = empty
            try:
                item['url'] = self.allowed_domains[0] + sel.xpath('@data-url').extract()[0].split('?')[0]
            except:
                item['url'] = empty
            try:
                item['description'] = sel.xpath('div[2]/div/div[@itemprop="description"]/a/text()').extract()[0]
            except:
                item['description'] = sel.xpath('@data-name').extract()[0]
            if "Chambre" in item['description']:
                item['subcategory'] = "room"
            else:
                item['subcategory'] = "apartment"
            try:
                item['evaluations'] = 0
                find = re.search(pattern, item['description'])
                if find:
                    item['evaluations'] = int(find.group())
            except:
                item['evaluations'] = 0
            try:
                item['latitude'] = sel.xpath('@data-lat').extract()[0]
            except:
                item['latitude'] = empty
            try:
                item['longitude'] = sel.xpath('@data-lng').extract()[0]
            except:
                item['longitude'] = empty
            try:
                item['location'] = urllib.unquote(response.url.split('?')[0].split('s/')[-1])
            except:
                item['location'] = empty
            item['postal_code'] = 0
            try:
                item['price'] = sel.xpath('div/a[2]/div/span/text()').extract()[0]
                item['currency'] = "€"
            except:
                item['price'] = empty
                item['currency'] = empty
            item['period'] = "nuit"
            yield item

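# To run one of these spiders outside the `scrapy crawl <name>` CLI, a
# CrawlerProcess can drive it directly. A minimal sketch; the settings dict
# is illustrative, not the project's real configuration.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(AirbnbSpider)
    process.start()  # blocks until the crawl finishes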