class DartySpider(scrapy.Spider):
    name = "darty"
    allowed_domains = ["darty.com"]
    base_url = "https://www.darty.com"
    start_urls = [
        base_url + '/nav/extra/list?p=200&s=topa&cat=756'
        # base_url + '/nav/extra/list?p=200&s=topa&cat=790',
        # base_url + '/nav/extra/list?p=200&s=topa&cat=43554',
        # base_url + '/nav/extra/list?p=200&s=topa&cat=12453&fa=17010-135552-42552',
        # base_url + '/nav/extra/list?cat=98054&s=prix_asc&p=200&aff=GRID&fa=767'
    ]
    already_crawled = u.get_already_crawled()

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath(
            '//body[@id="darty_liste_produit"]//div[@id="main_pagination_top"]/div['
            + u.x_class('darty_product_list_pages_list') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './a[text()=" Page suivante"][last()]/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(),
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath(
            '//body[@id="darty_liste_produit"]//div[@id="main_products_list"]')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('infos_container') +
                                ']/h2/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//body[@id="page_product"]')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath('//ul[@id="dartyCom_fil_ariane"]')
            main_category = x_categories.xpath(
                './li[2]/a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()
            categories = x_categories.xpath(
                './li[position() >= 3 and position() < last()]/a/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            brand = response.xpath(
                '//a[@id="darty_product_brand"]/text()').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name
            name = re.sub(
                ' +', ' ',
                ''.join(
                    response.xpath('//h1[' + u.x_class('product_head') +
                                   ']//div[' + u.x_class('product_name') +
                                   ']/span//text()').extract()).replace(
                                       '\n', '').replace('\r', '').strip())

            # Price
            price, price_old, currency = p.get_darty_prices(response)

            # Image
            src = response.xpath(
                '//div[' + u.x_class('darty_product_picture_main_pic_container')
                + ']/div[1]//img/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis
            x_avis = response.xpath('//div[' + u.x_class('bloc_reviews_resume')
                                    + ']')
            rate = x_avis.xpath(
                '//meta[@itemprop="ratingValue"]/@content').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())
            max_rate = x_avis.xpath('//div[' + u.x_class('bloc_reviews_note') +
                                    ']/sub/text()').extract_first()
            if max_rate is not None:
                max_rate = int(re.sub(r'\D', '', max_rate.strip()))
            nb_avis = x_avis.xpath(
                '//meta[@itemprop="ratingCount"]/@content').extract_first()
            if nb_avis is not None:
                nb_avis = int(re.sub(r'\D', '', nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item

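# All of the spiders in this file lean on a small project-local helper module
# imported as "u". Its implementation is not shown here; the sketch below is
# only an assumption inferred from how the helpers are called: u.x_class()
# builds an XPath predicate matching an element's class attribute, and
# u.generate_open_ssl_hash() turns a product URL into a stable identifier used
# for deduplication and image file names.
import hashlib


def x_class_sketch(class_names):
    """Hypothetical equivalent of u.x_class(): class-attribute predicate."""
    return ('contains(concat(" ", normalize-space(@class), " "), " ' +
            class_names + ' ")')


def generate_open_ssl_hash_sketch(url):
    """Hypothetical equivalent of u.generate_open_ssl_hash(): URL hex digest."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()
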
class FnacSpider(scrapy.Spider):
    name = "fnac"
    allowed_domains = ["fnac.com"]
    base_url = "https://www.fnac.com"
    start_urls = [
        base_url + '/Tous-les-ordinateurs-portables/Ordinateurs-portables/nsh154425/w-4?PageIndex=1'
        # base_url + '/Tous-les-PC-de-bureau/Ordinateur-de-bureau/nsh51426/w-4?PageIndex=1',
        # base_url + '/Toutes-les-tablettes/Toutes-les-tablettes/nsh227099/w-4?PageIndex=1',
        # base_url + '/Tous-les-disques-durs/Disque-Dur/nsh119663/w-4?PageIndex=1',
        # base_url + '/Memoire-ordinateur-de-bureau/Composants/nsh181559/w-4#bl=MICComposantsARBO',
    ]
    src_no_image = "https://www4-fr.fnac-static.com/Nav/Images/Noscan/noscan_340x340.gif"
    already_crawled = u.get_already_crawled()

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath('//ul[' +
                                      u.x_class('Pager bottom-toolbar') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './/a[' + u.x_class('prevnext actionNext') +
                ']/@href').extract_first()
            if url_next_page is not None:
                yield Request(url_next_page, callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//ul[' + u.x_class('articleList') + ']')
        if x_list:
            urls = x_list.xpath('.//p[' + u.x_class('Article-desc') +
                                ']/a/@href').extract()
            for url in urls:
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//div[' + u.x_class('f-productPage') + ']')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath('//ul[' + u.x_class('f-breadcrumb') +
                                          ']')
            main_category = x_categories.xpath(
                './li[2]/a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()
            categories = x_categories.xpath(
                './li[position() >= 3]/a/text()').extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Name
            name = response.xpath('//h1[' + u.x_class('f-productHeader-Title')
                                  + ']/text()').extract_first().strip()

            # Price
            price, price_old, currency = p.get_fnac_prices(response)

            # Image
            src = response.xpath('//img[' +
                                 u.x_class('f-productVisuals-mainMedia') +
                                 ']/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis
            x_avis = response.xpath('//div[' + u.x_class('f-review-header') +
                                    ']')
            rate = x_avis.xpath('.//div[' + u.x_class('f-review-headerRate') +
                                ']/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())
            max_rate = x_avis.xpath('.//span[' +
                                    u.x_class('f-review-headerRateTotal') +
                                    ']/text()').extract_first()
            if max_rate is not None:
                max_rate = u.string_to_float(max_rate.strip().replace("/", ""))
            nb_avis = response.xpath(
                '//div[' + u.x_class('f-productHeader-review') + ']//span[' +
                u.x_class('f-productHeader-reviewLabel') +
                ']/text()').extract_first()
            if nb_avis is not None:
                nb_avis = u.string_to_float(re.sub(r'\D', '', nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = None
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            if src == self.src_no_image:
                copyfile(
                    "data/default.jpg",
                    "data/" + self.name + "/img/" + item["image_name"] + ".jpg")

            yield item

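# Each spider connects spider_closed to u.update_already_crawled() so that the
# list of URL hashes loaded at start-up through u.get_already_crawled() survives
# between runs. Those helpers are project-local and not shown in this file; a
# minimal file-backed sketch (the "already_crawled.txt" path is an assumption)
# might look like this:
import os


def get_already_crawled_sketch(path="already_crawled.txt"):
    """Load previously crawled URL hashes, one per line."""
    if not os.path.isfile(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


def update_already_crawled_sketch(hashes, path="already_crawled.txt"):
    """Persist the deduplication list back to disk."""
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(hashes))
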
class AuchanSpider(scrapy.Spider):
    name = "auchan"
    allowed_domains = ["auchan.fr"]
    base_url = "https://www.auchan.fr"
    start_urls = [
        base_url + '/informatique/ordinateur-portable/c-7638110'
        # base_url + '/informatique/ordinateur-de-bureau/c-7638112',
        # base_url + '/informatique/tablette-tactile/c-7328319',
        # base_url + '/informatique/stockage/disque-dur-interne/c-6721660',
        # base_url + '/informatique/stockage/disque-dur-externe/c-8216',
        # base_url + '/informatique/composant-assemblage/barrette-memoire/c-201612291119'
    ]
    already_crawled = u.get_already_crawled()

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath('//nav[' + u.x_class('ui-pagination') +
                                      ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './/a[' + u.x_class('ui-pagination--next') +
                ']/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(),
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//div[' + u.x_class('product-list--container')
                                + ']')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('product-item--wrapper')
                                + ']/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//div[' + u.x_class('product-detail') + ']')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath(
                '//div[' + u.x_class('ui-breadcrumb--scroller') + ']/nav')
            main_category = x_categories.xpath(
                './span[2]/meta[@itemprop="name"]/@content').extract_first()
            if main_category is not None:
                main_category = main_category.strip()
            categories = x_categories.xpath(
                './span[position() >= 3 and position() < last()]/meta[@itemprop="name"]/@content'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            x_brand_name = response.xpath(
                '//div[' + u.x_class('product-detail--wrapper') + ']')
            brand = x_brand_name.xpath(
                './meta[@itemprop="brand"]/@content').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name
            name = x_brand_name.xpath(
                './h1[' + u.x_class('product-detail--title') +
                ']/text()').extract_first().replace('\n', '').replace(
                    '\r', '').strip()

            # Price
            price, price_old, currency = p.get_auchan_prices(response)

            # Image
            src = response.xpath('//div[' + u.x_class('x-scroller') +
                                 ']/label[1]//img/@src').extract_first()
            if src is not None:
                src = src.strip()

            # Avis
            x_avis = response.xpath('//div[' +
                                    u.x_class('product-detail--rating') + ']')
            rate = x_avis.xpath(
                './/meta[@itemprop="ratingValue"]/@content').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())
            nb_avis = x_avis.xpath(
                './/meta[@itemprop="reviewCount"]/@content').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())
            max_rate = x_avis.xpath('.//span[' +
                                    u.x_class('ui-rating--background') +
                                    ']/i[' + u.x_class('icon-auchan-82') +
                                    ']').extract()
            max_rate = len(max_rate) if max_rate else None

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item

class CdiscountSpider(scrapy.Spider):
    name = "cdiscount"
    allowed_domains = ["cdiscount.com"]
    base_url = "https://www.cdiscount.com"
    start_urls = [
        base_url + '/informatique/ordinateurs-pc-portables/pc-portables/l-1070992.html'
        # base_url + '/informatique/achat-pc-ordinateur/tous-les-pc-de-bureau/l-1070840.html',
        # base_url + '/informatique/disques-durs/disque-dur-externe-ssd/l-1073610.html',
        # base_url + '/informatique/disques-durs/disques-durs-externes/l-1073602.html',
        # base_url + '/informatique/disques-durs/disque-dur/l-1073632.html',
        # base_url + '/informatique/memoire-ram/l-10716.html'
    ]
    first = True
    already_crawled = u.get_already_crawled()
    nb_crawled = 0

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath('//ul[@id="PaginationForm_ul"]')
        if x_pagination and self.first:
            self.first = False
            nb_page = x_pagination.xpath(
                './li[last()]/a/text()').extract_first()
            if nb_page is not None:
                for x in range(1, int(nb_page.strip())):
                    yield Request(
                        response.url[:-5] + "-" + str(x) + response.url[-5:],
                        callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//ul[@id="lpBloc"]')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('prdtBILDetails') +
                                ']/a/@href').extract()
            for url in urls:
                url = url.strip()
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if (open_ssl_hash not in self.already_crawled
                        and self.nb_crawled < 300):
                    self.nb_crawled += 1
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//h1[@itemprop="name"]')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath('//div[@id="bc"]')
            main_category = x_categories.xpath(
                './/li[3]//span/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()
            categories = x_categories.xpath(
                './/li[position() >= 4 and position() < last()]//span/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            brand = response.xpath(
                '//table[' + u.x_class('fpDescTb fpDescTbPub') +
                ']//span[@itemprop="brand"]//span[@itemprop="name"]/text()'
            ).extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name
            name = re.sub(' +', ' ',
                          x_product.xpath('./text()').extract_first().strip())

            # Price
            price, price_old, currency = p.get_cdiscount_prices(response)

            # Image
            src = response.xpath('//div[' + u.x_class('fpMainImg') +
                                 ']/a[@itemprop="image"]/@href').extract_first()
            if src is not None:
                src = src.strip()

            # Avis
            x_avis = response.xpath('//div[' + u.x_class('topMainRating') + ']')
            rate = x_avis.xpath(
                '//span[@itemprop="ratingValue"]/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())
            nb_avis = x_avis.xpath(
                '//span[@itemprop="ratingCount"]/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = 5
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item

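# For reference, the page-number splicing in CdiscountSpider.parse() rewrites
# the listing URL as shown below (illustrative values; the resulting URL scheme
# is whatever Cdiscount accepted when this spider was written):
def cdiscount_pagination_example():
    """Illustrates how CdiscountSpider builds paginated listing URLs."""
    listing_url = ("https://www.cdiscount.com/informatique"
                   "/ordinateurs-pc-portables/pc-portables/l-1070992.html")
    page = 2
    # Insert "-<page>" just before the ".html" suffix,
    # giving ".../pc-portables/l-1070992-2.html".
    return listing_url[:-5] + "-" + str(page) + listing_url[-5:]
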
class BoulangerSpider(scrapy.Spider):
    name = "boulanger"
    allowed_domains = ["boulanger.com"]
    base_url = "https://www.boulanger.com"
    start_urls = [
        base_url + '/c/tous-les-ordinateurs-portables'
        # base_url + '/c/tous-les-ordinateurs-de-bureau',
        # base_url + '/c/toutes-les-tablettes-tactiles',
        # base_url + '/c/disque-dur-externe',
        # base_url + '/c/disque-ssd',
        # base_url + '/c/memoire-vive'
    ]
    already_crawled = u.get_already_crawled()

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath('//div[' + u.x_class('navigationListe') +
                                      ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './/span[' + u.x_class('navPage navPage-right') +
                ']/a/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page.strip(),
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//div[' + u.x_class('productListe') + ']')
        if x_list:
            urls = x_list.xpath('.//div[' + u.x_class('designations') +
                                ']/h2/a/@href').extract()
            for url in urls:
                url = self.base_url + url.strip()
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//h1[@itemprop="name"]')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath('//div[@id="filAriane"]')
            main_category = x_categories.xpath(
                './/li[2]//a/text()').extract_first()
            if main_category is not None:
                main_category = main_category.strip()
            categories = x_categories.xpath(
                './/li[position() >= 3 and position() <= last()]//a/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Name
            name = re.sub(
                ' +', ' ',
                ''.join(x_product.xpath('./text()').extract()).replace(
                    '\n', '').replace('\r', '').strip())

            # Price
            price, price_old, currency = p.get_boulanger_prices(response)

            # Image
            src = response.xpath(
                '//span[@itemprop="gtin13"]/text()').extract_first()
            if src is not None:
                src = ("https://boulanger.scene7.com/is/image/Boulanger/" +
                       src.strip() + "_h_f_l_0")

            # Avis
            x_avis = response.xpath('//div[' + u.x_class('top') + ']/div[' +
                                    u.x_class('right') + ']//span[' +
                                    u.x_class('rating') + ']')
            rate = x_avis.xpath('./@class').extract_first()
            if rate is not None:
                rate = re.sub(r'\D', '', rate.strip())
                if rate != "0":
                    # The rating digits come from the class attribute; re-insert
                    # the French decimal comma (e.g. "45" -> "4,5") before parsing.
                    if len(rate) > 1:
                        rate = rate[:1] + "," + rate[1:]
                    rate = u.string_to_float(rate)
                else:
                    rate = None
            nb_avis = x_avis.xpath('./span[' + u.x_class('link') +
                                   ']/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(re.sub(r'\D', '', nb_avis.strip()))

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = main_category
            item['categories'] = categories
            item['brand'] = None
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = 5
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item

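# BoulangerSpider re-inserts a decimal comma before calling u.string_to_float,
# and the other spiders pass it French-formatted numbers as well, so that helper
# presumably accepts a comma as decimal separator. Its real implementation is
# not shown in this file; a plausible sketch:
def string_to_float_sketch(value):
    """Hypothetical equivalent of u.string_to_float(): parse '4,5' or '4.5'."""
    return float(value.strip().replace(",", "."))
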
class MaterielNetSpider(scrapy.Spider):
    name = "materiel_net"
    allowed_domains = ["materiel.net"]
    base_url = "https://www.materiel.net"
    start_urls = [
        base_url + '/pc-portable/?p=1'
        # base_url + '/ordinateur/?p=1',
        # base_url + '/tablette-tactile/?p=1',
        # base_url + '/disque-ssd/?p=1',
        # base_url + '/disque-dur/?p=1',
        # base_url + '/disque-dur-externe/?p=1',
        # base_url + '/barrette-memoire-pour-pc/?p=1',
        # base_url + '/barrette-memoire-pour-pc-portable/?p=1'
    ]
    already_crawled = u.get_already_crawled()

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        u.update_already_crawled(self.already_crawled)

    def parse(self, response):
        # Yield list pages.
        x_pagination = response.xpath(
            '//ul[' + u.x_class('pagination pagination-sm') + ']')
        if x_pagination:
            url_next_page = x_pagination.xpath(
                './li[position() = last()]/a/@href').extract_first()
            if url_next_page is None:
                url_next_page = x_pagination.xpath(
                    './li[position() = (last() - 1)]/a/@href').extract_first()
            if url_next_page is not None:
                yield Request(self.base_url + url_next_page,
                              callback=self.parse)

        # Yield product pages.
        x_list = response.xpath('//table[' + u.x_class('ProdList') + ']')
        if x_list:
            urls = x_list.xpath('.//td[' + u.x_class('Photo') +
                                ']/span/@data-href').extract()
            for url in urls:
                url = self.base_url + url
                open_ssl_hash = u.generate_open_ssl_hash(url)
                if open_ssl_hash not in self.already_crawled:
                    self.already_crawled.append(open_ssl_hash)
                    yield Request(url, callback=self.parse)

        # Yield product.
        x_product = response.xpath('//div[@id="prod"]')
        if x_product:
            item = Product()

            # Categories
            x_categories = response.xpath('//nav[@id="breadcrumb"]')
            categories = x_categories.xpath(
                './/li[position() >= 3 and position() < last()]/a/text()'
            ).extract()
            if categories:
                for i, category in enumerate(categories):
                    categories[i] = category.strip()

            # Brand
            brand = x_categories.xpath('.//li[2]/a/text()').extract_first()
            if brand is not None:
                brand = brand.strip()

            # Name
            name = re.sub(
                ' +', ' ',
                ''.join(
                    response.xpath(
                        '//h1[@id="ProdTitle"]//text()').extract()).replace(
                            '\n', '').replace('\r', '').strip())

            # Price
            price, price_old, currency = p.get_materiel_net_prices(response)

            # Image
            src = response.xpath('//div[' + u.x_class('swiper-wrapper') +
                                 ']//a/@data-zoom-image').extract_first()
            if src is None:
                src = response.xpath(
                    '//div[@id="container-image"]/@data-zoom-image'
                ).extract_first()
            if src is not None:
                src = src.strip()

            # Avis
            x_avis = response.xpath('//div[' + u.x_class('headerAvisClients') +
                                    ']')
            rate = x_avis.xpath('.//span[' + u.x_class('noteUser') +
                                ']/text()').extract_first()
            if rate is not None:
                rate = u.string_to_float(rate.strip())
            max_rate = x_avis.xpath(
                './/span[' + u.x_class('noteUser') +
                ']/following-sibling::span[1]/text()').extract_first()
            if max_rate is not None:
                max_rate = u.string_to_float(max_rate.strip())
            nb_avis = x_avis.xpath(
                './/span[@id="avisCount"]/span/text()').extract_first()
            if nb_avis is not None:
                nb_avis = int(nb_avis.strip())

            item['store'] = self.name
            item['url'] = response.url
            item['main_category'] = "Informatique"
            item['categories'] = categories
            item['brand'] = brand
            item['openssl_hash'] = u.generate_open_ssl_hash(item['url'])
            item['name'] = name
            item['price_old'] = price_old
            item['price'] = price
            item['currency'] = currency
            item["image_urls"] = [src]
            item["image_name"] = item['openssl_hash']
            item["rate"] = rate
            item["max_rate"] = max_rate
            item["nb_avis"] = nb_avis
            item["price_history"] = [{
                'date': time.strftime("%Y/%m/%d"),
                'price_old': price_old,
                'price': price,
                'currency': currency
            }]

            yield item

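# One way to run these spiders programmatically; the usual alternative is the
# scrapy CLI ("scrapy crawl darty", etc.). get_project_settings() assumes this
# file lives inside a standard Scrapy project.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(DartySpider)
    process.crawl(FnacSpider)
    process.start()  # blocks until every scheduled crawl has finished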