class EcarteleraSpider(CrawlSpider):
    """Crawl ecartelera.com's best-films list and scrape one item per film."""
    name = 'cartelera'
    item_count = 0
    # FIX: the attribute is `allowed_domains` (plural) and takes bare domain
    # names; the misspelled `allowed_domain` holding a full URL was silently
    # ignored by Scrapy's offsite filtering.
    allowed_domains = ['www.ecartelera.com']
    start_urls = ['https://www.ecartelera.com/listas/mejores-peliculas/']

    # FIX: `rules` was a set literal `{...}`; rule order is significant in
    # CrawlSpider, so it must be an ordered tuple.
    rules = (
        # Pagination: follow the last link of the pagination bar.
        Rule(LinkExtractor(allow=(),
                           restrict_xpaths='//div[@class="pagination"]/a[last()]'),
             callback='process', process_links='appendDummy', follow=True),
        # Film detail pages inside the global list.
        Rule(LinkExtractor(allow=(),
                           restrict_xpaths='//*[@id="listaglobal"]//a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract film metadata from a detail page; yields one CarteleraItem."""
        ecartelera_item = CarteleraItem()
        # Film info. normalize-space() always yields exactly one string, so
        # extract() returns a one-element list; the fallback must test for an
        # EMPTY STRING, not an empty list (the original `len(...) == 0` check
        # could never fire).
        ecartelera_item['titulo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[2]/text())').extract()
        if not ecartelera_item['titulo'] or not ecartelera_item['titulo'][0]:
            ecartelera_item['titulo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[3]/text())').extract()
        ecartelera_item['tituloOriginal'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[2]/text())').extract()
        ecartelera_item['anyo'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[1]/span/text())').extract()
        ecartelera_item['pais'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[3]/text())').extract()
        ecartelera_item['duraccion'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[4]/span)').extract()
        ecartelera_item['presupuesto'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[5]/text())').extract()
        ecartelera_item['genero'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[6]/span)').extract()
        ecartelera_item['estudio'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[1]/div/p[7]/span)').extract()
        ecartelera_item['ranking'] = response.xpath('normalize-space(//*[@id="bloc1"]/div[3]/div/div/p[2]/strong[1])').extract()
        yield ecartelera_item
class MercadoSpider(CrawlSpider):
    """Scrape Xiaomi phone listings from mercadolibre.com.co (max 20 items)."""
    name = 'mercado'
    item_count = 0
    # FIX: the attribute name is `allowed_domains` (plural); the misspelled
    # `allowed_domain` was ignored by Scrapy.
    allowed_domains = ['www.mercadolibre.com.co']
    start_urls = ['https://listado.mercadolibre.com.co/celulares-xiaomi#D[A:celulares%20xiaomi]']

    # FIX: rules must be an ordered tuple, not a set (rule order matters).
    rules = (
        # Pagination arrow.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//li[@class="andes-pagination__arrow-title"]/a')),
        # Product detail pages.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//h2[contains(@class,"main-title")]/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract product data; closes the spider after 20 items."""
        ml_item = MercadoItem()
        # Product info.
        ml_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary"]/text())').extract_first()
        ml_item['precio'] = response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract()
        ml_item['envio'] = response.xpath('normalize-space(//p[contains(@class, "shipping-method-title shipping-text")]/text())').extract()
        # FIX: the original XPath `//[(@class = "item-conditions")]` lacked a
        # node test and raised ValueError; `//*[...]` is the valid form.
        ml_item['vendido'] = response.xpath('normalize-space(//*[@class = "item-conditions"]/text())').extract()
        ml_item['opiniones'] = response.xpath('normalize-space(//span[@class="average-legend"]/text())').extract()
        # Product images.
        ml_item['image_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract()
        ml_item['image_name'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first()
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
class CumbriaSpider(CrawlSpider):
    """Crawl University of Cumbria undergraduate course search results."""
    name = 'Cumbria'
    allowed_domains = ['www.cumbria.ac.uk']
    start_urls = [
        'https://www.cumbria.ac.uk/study/courses/course-search/?level=ug-full-time-degree&level=ug-sandwich-placement&page=1'
    ]
    rules = (
        # Search-result pagination.
        Rule(LinkExtractor(allow=r'page=\d*'), follow=True),
        # Course detail pages.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="articles-wrapper"]/article/a'),
             follow=False, callback='parse_item'),
    )

    def parse_item(self, response):
        """Parse a course page: title, degree type, UCAS code and modules."""
        print('--------------------', response.url)
        titles = response.xpath('//h1//text()').extract()
        titles = ''.join(titles)
        # FIX: raw string for the regex — '\s' / '\(' in a plain string are
        # invalid escape sequences (SyntaxWarning on modern Python).
        degree_type = re.findall(r'[A-Za-z]*\s\([a-zA-Z]{0,6}\)', titles)
        degree_type = ''.join(degree_type)
        programme = titles.replace(degree_type, '').strip()
        ucas_code = response.xpath(
            '//div[@class="ucas-code"]//text()').extract()
        ucas_code = ''.join(ucas_code).replace('Course code', '').strip()
        modules = response.xpath(
            '//div[@id="course-outline"]//text()').extract()
        modules = ''.join(modules)
        # NOTE(review): none of the extracted fields are yielded, so no items
        # reach the pipelines — presumably unfinished; confirm intent before
        # relying on this spider's output.
class Ecspider(CrawlSpider):
    """Crawl zol.com.cn phone pages and forum threads, recording URLs via Redis."""
    name = 'zgcspider'

    # Route scraped items through the Redis pipeline for this spider only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'zgc.pipelines.RedisPipeline': 300,
        }
    }

    start_urls = [
        'http://mobile.zol.com.cn/',
        'http://bbs.zol.com.cn/sjbbs/p1.html#c',
    ]

    # Link extractors for the different page families.
    page_link_a = LinkExtractor(allow=(r'/detail_\d+/',))             # product detail pages
    link_b = LinkExtractor(allow=(r'/more/\d+_\d+.shtml',))           # intermediate listings
    page_link_b = LinkExtractor(allow=(r'/more/\d+_\d+_\d+.shtml',))  # deeper listings
    page_link_c = LinkExtractor(allow=(r'/sjbbs/p\d+.html#c',))       # forum pagination

    rules = (
        Rule(page_link_a, callback="parse_item", follow=True),
        Rule(link_b),
        Rule(page_link_b, callback="parse_item", follow=True),
        Rule(page_link_c, callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        """Yield a RedisItem recording the page URL."""
        item = RedisItem()
        item['url'] = response.url
        yield item
class MeishijieSpider(CrawlSpider):
    """Crawl meishij.net snack listings, counting pages and recipe hits."""
    name = 'meishijie'
    allowed_domains = ['meishij.net']
    start_urls = ['http://www.meishij.net']

    # Running counters maintained by the callbacks below.
    num = 0
    next_page = 0

    rules = (
        # Landing page of the snack (xiaochi) category.
        Rule(LinkExtractor(allow=(r'http://www.meishij.net/china-food/xiaochi/$',)),
             callback='test', follow=True),
        # Category pagination.
        Rule(LinkExtractor(allow=(r'http://www.meishij.net/china-food/xiaochi/\?&page=\d+$',)),
             callback='next', follow=True),
        # Individual recipe pages.
        Rule(LinkExtractor(allow=(r'http://www.meishij.net/zuofa/\w+\.html',)),
             callback='save'),
    )

    def next(self, response):
        """Count a visited pagination page and log the running total."""
        self.next_page = self.next_page + 1
        print("next++++++++++++++++++++++++++++++++++++++++++", self.next_page)

    def save(self, response):
        """Count a recipe page and print its title alongside the counter."""
        self.num = self.num + 1
        title = response.xpath(".//*[@id='tongji_title']/text()").extract()
        print(title, self.num)
class Furgoneta1Spider(CrawlSpider):
    """Scrape van (furgoneta) listings from mercadolibre.com.ar, up to MAX_ITEMS."""
    name = "furgoneta1"
    item_count = 1  # starts at 1 so generated ids begin at crawler1_1
    MAX_ITEMS = 2500
    # FIX: the attribute is `allowed_domains` (plural); the misspelled name
    # was ignored by Scrapy's offsite filtering.
    allowed_domains = ['www.mercadolibre.com.ar']
    start_urls = [
        'https://autos.mercadolibre.com.ar/_VEHICLE*BODY*TYPE_452750#VEHICLE_BODY_TYPE'
    ]

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # "Next page" button.
        Rule(LinkExtractor(allow=(), restrict_xpaths="//li[contains(@class, 'andes-pagination__button andes-pagination__button--next')]/a")),
        # Listing detail pages.
        Rule(LinkExtractor(allow=(), restrict_xpaths="//div[contains(@class, 'rowItem item item--grid item--has-row-logo new')]/a"),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Build one VehiculosItem per listing; stops after MAX_ITEMS."""
        item = VehiculosItem()
        item['id'] = "crawler1_" + str(self.item_count)
        item['categoria'] = "furgoneta"
        item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first()
        item['imagen_urls'] = response.xpath('//figure[contains(@class, "gallery-image-container")]/a/img/@src').extract()
        if self.item_count > self.MAX_ITEMS:
            raise CloseSpider("Scraping terminado con " + str(self.item_count - 1) + " vehiculos analizados.")
        self.item_count += 1
        yield item
class MercadoSpider(CrawlSpider):
    """Scrape video-game-console listings from mercadolibre.com.mx (max 20)."""
    name = 'mercado'
    item_count = 0
    # FIX: `allowed_domains` (plural); the misspelled attribute was ignored.
    allowed_domains = ['www.mercadolibre.com.mx']
    # FIX: the attribute must be `start_urls`; with the misspelled
    # `start_url` the spider had no start requests and crawled nothing.
    start_urls = [
        'https://listado.mercadolibre.com.mx/consolas-videojuegos#D[A:consolas-videojuegos,B:5]'
    ]

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # Pagination "next" link.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//li[@class="pagination__next"]/a')),
        # Product detail pages.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//h2[contains(@class,"item__title")]/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract the product title; closes the spider after 20 items."""
        product = MercadoItem()
        # FIX: `extract_firts` was a typo that raised AttributeError at runtime.
        product['titulo'] = response.xpath(
            'normalize-space(//h1[@class="item-title__primary"]/text())'
        ).extract_first()
        # Cap the number of scraped products.
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield product
class ComputadorasSpider(CrawlSpider):
    """Scrape computer listings from mercadolibre.com.mx (max 40 items)."""
    name = 'computadoras'
    item_count = 0
    # FIX: `allowed_domains` (plural); the misspelled attribute was ignored.
    allowed_domains = ['www.mercadolibre.com.mx']
    start_urls = ['https://listado.mercadolibre.com.mx/computadoras#D[A:computadoras,L:1]']

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # Pagination "next" link.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//li[@class="pagination__next"]/a')),
        # Product detail pages.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//h2[contains(@class,"item__title")]/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract product and seller info; closes the spider after 40 items."""
        computadora_item = ProyectoItem()
        # Product info.
        computadora_item['titulo'] = response.xpath('normalize-space(//h1[@class="item-title__primary "]/text())').extract_first()
        computadora_item['modelo'] = response.xpath('normalize-space(//*[@id="root-app"]/div[2]/div[1]/div[1]/section[3]/div/section/ul/li[3]/span)').extract()
        computadora_item['marca'] = response.xpath('normalize-space(//*[@id="root-app"]/div[2]/div[1]/div[1]/section[3]/div/section/ul/li[1]/span)').extract()
        computadora_item['precio'] = response.xpath('normalize-space(//span[@class="price-tag-fraction"]/text())').extract()
        computadora_item['condicion'] = response.xpath('normalize-space(//div[@class="item-conditions"]/text())').extract()
        computadora_item['opiniones'] = response.xpath('normalize-space(//span[@class="review-summary-average"]/text())').extract()
        # Store / seller info.
        computadora_item['tipo_vendedor'] = response.xpath('normalize-space(//p[contains(@class, "power-seller")]/text())').extract()
        computadora_item['ventas_vendedor'] = response.xpath('normalize-space(//dd[@class="reputation-relevant"]/strong/text())').extract()
        self.item_count += 1
        if self.item_count > 40:
            raise CloseSpider('item_exceeded')
        yield computadora_item
class MMspider(CrawlSpider):
    """Crawl mzitu.com gallery pages and collect their image URLs."""
    name = 'mzitu'
    allowed_domains = ['www.mzitu.com']
    start_urls = ['http://www.mzitu.com/']
    rules = (
        # Category pagination pages: follow only, no callback.
        Rule(LinkExtractor(allow=(r'/xinggan/page/\d+')), follow=True),
        # Gallery pages (numeric path); deny per-image sub-pages (/NNN/NNN).
        Rule(LinkExtractor(allow=(r'/\d{1,6}', ), deny=(r'/\d{1,6}/\d{1,6}')),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Desktop Chrome user agent for the follow-up request issued below.
        header = {
            "User-agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        item = MzituItem()
        item["name"] = response.css(".main-title::text").extract()
        item["url"] = response.url
        item['image_urls'] = response.css(
            ".main-image img::attr(src)").extract()
        # NOTE(review): time.sleep blocks Scrapy's single-threaded reactor for
        # the whole process — DOWNLOAD_DELAY / AUTOTHROTTLE is the
        # non-blocking way to rate-limit; confirm before keeping this.
        time.sleep(random.randint(3, 6))
        # NOTE(review): re-requesting the same URL without dont_filter=True is
        # normally dropped by the duplicate filter — confirm intent.
        yield Request(response.url, headers=header)
        yield item
class Synsam(CrawlSpider):
    """Crawl synsam.no contact-lens products via price-sorted listing pages."""
    name = 'specsavers_no-synsam'
    allowed_domains = ['synsam.no']
    start_urls = ['https://www.synsam.no/kontaktlinser']

    def _page_url(value):
        # Plain function (not a method): consumed by the LinkExtractor below
        # at class-definition time. FIX: the original lambda called
        # re.search(...).group(1) unguarded and raised AttributeError for any
        # pagination href without an `fr=` parameter; such links are now
        # dropped by returning None.
        match = re.search('fr=(.*)&?', value)
        if match is None:
            return None
        return ('https://www.synsam.no/ArticleFilter/CL/?'
                'sort=price&sortOrder=asc&from=' + match.group(1))

    products = LinkExtractor(restrict_css='.product-list-products')
    pagination = LinkExtractor(restrict_css='.paging-navigation',
                               process_value=_page_url)
    rules = (Rule(products, callback='parse_product'), Rule(pagination))

    def parse_product(self, response):
        """Build a Product item from a product page; skips own-brand items."""
        if response.xpath('//h5[contains(., "under varemerket")]'):
            return
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@id="articleId"]/@value').extract_first(
            ) or response.xpath('//input[@id="skuId"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        breadcrumbs = response.css('.breadcrumbs a::text').extract()[1:]
        # Last breadcrumb is the product name; preceding ones the category.
        loader.add_value('name', breadcrumbs.pop())
        loader.add_value('category', breadcrumbs[-3:])
        loader.add_xpath('price', '//h3[@itemprop="price"]/@content')
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_css('brand', '.product-hero-brand img::attr(alt)')
        # FIX: guard against a missing price — `None < 1000` raises TypeError.
        price = loader.get_output_value('price')
        if price is not None and price < 1000:
            loader.add_value('shipping_cost', 49)
        yield loader.load_item()
class YourLens(CrawlSpider):
    """Crawl yourlenses.nl lens product pages."""
    name = 'specsavers_nl-yourlens'
    allowed_domains = ['yourlenses.nl']
    start_urls = ['https://www.yourlenses.nl/lenses']

    products = LinkExtractor(restrict_css='.product-list-item')
    pages = LinkExtractor(restrict_css='.prodList-pagination :not(.disabled)')
    rules = (Rule(pages), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        """Build a Product item; pages without a product id are skipped."""
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@id="prodid"]/@value').extract_first()
        if not identifier:
            self.logger.warning('No identifier for %s' % response.url)
            return
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', 'div.infotitle h1::text')
        loader.add_css('price', '.inline.price::text')
        loader.add_value('sku', identifier)
        image_url = response.css('.photo::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        brand = response.xpath(
            '//meta[@itemprop="brand"]/@content').extract_first()
        if not brand:
            try:
                raw = response.xpath('//script/text()').re(
                    '"manufacturer":"(.*?)"')[0]
                # FIX: `str.decode` does not exist on Python 3; round-trip
                # through latin-1 to undo the JSON \uXXXX escapes instead.
                brand = raw.encode('latin-1').decode('unicode-escape')
            except IndexError:
                pass
        loader.add_value('brand', brand)
        yield loader.load_item()
class Aprendiendo(CrawlSpider):
    """Scrape products from degustam.com's Christmas-special category (max 5)."""
    name = "aprendiendo"
    item_count = 0
    # FIX: `allowed_domains` (plural) with a bare domain; the misspelled
    # attribute holding a full URL was ignored by Scrapy.
    allowed_domains = ['www.degustam.com']
    start_urls = ("https://www.degustam.com/366-especial-navidad.html", )

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # Pagination "next" link.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//div[@class="next"]/a')),
        # Product detail pages.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//h5/a'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract product fields; closes the spider after 5 items."""
        ml_item = AprendiendoItem()
        ml_item['titulo'] = response.xpath(
            '//h1[@class="col-xs-12 page-title product-name"]/text()').extract()
        ml_item['descripcion'] = response.xpath('//p/text()').extract()
        ml_item['precio'] = response.xpath(
            '//span[@itemprop="price"]/text()').extract()
        ml_item['image_urls'] = response.xpath(
            '//figure[contains(@class, "col-sm-4 col-xs-12 product-img2")]/a/img/@src'
        ).extract()
        ml_item['image_name'] = response.xpath(
            '//h1[@class="col-xs-12 page-title product-name"]/text()'
        ).extract_first()
        self.item_count += 1
        if self.item_count > 5:
            raise CloseSpider('item_exceeded')
        yield ml_item
class MercadoSpider(CrawlSpider):
    """Scrape printer listings from mercadolibre.com.ve (max 30 items)."""
    name = "mercado"
    item_count = 0
    # FIX: `allowed_domains` (plural); the misspelled attribute was ignored.
    allowed_domains = ['www.mercadolibre.com.ve']
    start_urls = ['https://listado.mercadolibre.com.ve/impresoras']

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # Pagination link.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//*[@id="results-section"]/div[2]/ul/li[12]/a')),
        # Result cards.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//li[@class="results-item article grid item-info-height-117"]'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Yield title/price for one listing; stops after 30 items."""
        yield {
            'titulo': response.xpath(
                'normalize-space(//*[@id="short-desc"]/div/header/h1)').extract(),
            'precio': response.xpath(
                'normalize-space(//*[@id="productInfo"]/fieldset[1]/span/span[2])').extract(),
        }
        # Runs once the generator is advanced past the yield.
        self.item_count += 1
        if self.item_count > 30:
            raise CloseSpider('item_exceeded')
class NewsSpider(CrawlSpider):
    """Crawl nikkei.com news categories and yield a Page per article."""
    name = 'news'
    allowed_domains = ['www.nikkei.com']
    start_urls = ['http://www.nikkei.com/news/category/']

    rules = [
        # Category index pages (followed, no callback).
        Rule(LinkExtractor(allow=r'/news/category/[a-zA-Z]+/$')),
        # Article pages.
        Rule(LinkExtractor(allow=r'/article/[a-zA-Z\d_]+/$'),
             callback='parse_articles'),
    ]

    def parse_articles(self, response):
        """Extract title, body, publish date and category; yield a Page."""
        title = response.css('.cmnc-middle ::text').extract_first()
        body = response.css('.cmn-article_text').xpath('string()').extract_first().strip()
        publish_date = response.css('.cmnc-publish ::text').extract_first()
        category = response.css('.cmn-topic_path').xpath('string()').extract_first().strip()
        # FIX: removed a dangling `"""` that followed this method (an
        # unterminated triple-quoted string breaking the module) along with
        # dead commented-out code.
        yield Page(
            url=response.url,
            key=extract_key(response.url),
            html=response.text,
            title=title,
            body=body,
            date=publish_date,
            category=category,
        )
class ListSpider(CrawlSpider):
    """Crawl news.cnblogs.com list pages and parse article metadata."""
    # Spider name.
    name = "tutorial"
    # Download delay in seconds.
    download_delay = 1
    # Allowed domains.
    allowed_domains = ["news.cnblogs.com"]
    # Start URL.
    start_urls = ["https://news.cnblogs.com"]

    # Crawl rules. NOTE(review): SgmlLinkExtractor was removed in Scrapy 1.3;
    # on a modern Scrapy these must become scrapy.linkextractors.LinkExtractor.
    rules = (
        Rule(SgmlLinkExtractor(
            allow=(r'https://news.cnblogs.com/n/page/\d', ))),
        Rule(SgmlLinkExtractor(allow=(r'https://news.cnblogs.com/n/\d+', )),
             callback='parse_content'),
    )

    def parse_content(self, response):
        """Extract title, author and release date from an article page."""
        item = TutorialItem()
        # FIX: dropped the `.decode('utf-8')` calls — XPath extraction already
        # returns `str` on Python 3, and `str` has no `.decode()` method.
        title = response.selector.xpath(
            '//div[@id="news_title"]')[0].extract()
        item['title'] = title
        author = response.selector.xpath(
            '//div[@id="news_info"]/span/a/text()')[0].extract()
        item['author'] = author
        releasedate = response.selector.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()')[0].extract()
        item['releasedate'] = releasedate
        yield item
class TabelogSpoider(CrawlSpider):
    """Crawl Tokyo lunch listings on tabelog.com and emit Restaurant items."""
    name = 'tabelog'
    allowed_domains = ["tabelog.com"]
    start_urls = [
        'https://tabelog.com/tokyo/rstLst/lunch/?LstCosT=2&RdoCosTp=1'
    ]

    rules = [
        # Listing pagination.
        Rule(LinkExtractor(allow=r'/\w+/rstLst/lunch/\d/')),
        # Restaurant detail pages.
        Rule(LinkExtractor(allow=r'/\w+/A\d+/A\d+/\d+/$'),
             callback='parse_restaurant'),
    ]

    def parse_restaurant(self, response):
        """Extract name, address, coordinates, station and rating score."""
        # Coordinates are embedded in the lazy-loaded static-map image URL.
        latitude, longitude = response.css(
            'img.js-map-lazyload::attr("data-original")').re(
                r'markers=.*?%7C([\d.]+),([\d.]+)')

        def text_of(css_selector):
            # Whole-subtree text of the first node matching the selector.
            return response.css(css_selector).xpath('string()').extract_first()

        return Restaurant(
            name=text_of('.display-name').strip(),
            address=text_of('[class="rstinfo-table__address"]'),
            latitude=latitude,
            longitude=longitude,
            station=text_of('[class="linktree__parent-target-text"]'),
            score=text_of('[class="rdheader-rating__score-val-dtl"]'),
        )
class BandQ(CrawlSpider):
    """Crawl B&Q bedding / bed-and-mattress categories for product pages."""
    name = 'e-bedding-bandq'
    allowed_domains = ['diy.com']
    start_urls = (
        'http://www.diy.com/rooms/bedroom/bedding/DIY822487.cat',
        'http://www.diy.com/rooms/bedroom/beds-mattresses/DIY822423.cat')

    categories = LinkExtractor(restrict_css='#content .menu')
    pages = LinkExtractor(restrict_css='.paginator')
    products = LinkExtractor(restrict_css='#product-listing h3')
    rules = (Rule(categories), Rule(pages),
             Rule(products, callback='parse_product'))

    def parse_product(self, response):
        """Build a Product item from a product detail page."""
        loader = ProductLoader(Product(), response=response)
        # FIX: raw string for the regex ('\d' is an invalid escape otherwise).
        identifier = re.search(r'(\d+)_BQ', response.url).group(1)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', '.product-summary h1.product-title::text')
        loader.add_css('price', '.product-price::attr(content)')
        loader.add_css('sku', 'dl.product-code dd::text')
        loader.add_value('category', 'Bedroom')
        category = response.css('.breadcrumb').xpath(
            './/li/a/text()').extract()[-1]
        loader.add_value('category', category)
        image_url = response.css('.main-img img::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath('brand',
                         '//th[text()="Brand"]/following-sibling::td/text()')
        # FIX: guard against a missing price — `None < 50` raises TypeError.
        price = loader.get_output_value('price')
        if price is not None and price < 50:
            loader.add_value('shipping_cost', 5)
        yield loader.load_item()
class MercadoSpider(CrawlSpider):
    """Scrape printer listings from mercadolibre.com.pe (max 5 items)."""
    name = 'mercado'
    item_count = 0
    # FIX: `allowed_domains` (plural) with a bare domain name; the misspelled
    # attribute holding a URL with a scheme was ignored by Scrapy.
    allowed_domains = ['www.mercadolibre.com.pe']
    start_urls = [
        'https://listado.mercadolibre.com.pe/impresoras#D[A:impresoras]'
    ]

    # FIX: rules must be an ordered tuple, not a set.
    rules = (
        # Pagination "next" link.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//li[@class="pagination__next"]/a')),
        # Listing titles.
        Rule(LinkExtractor(allow=(), restrict_xpaths='//*[@class="item__title list-view-item-title" ]'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract product and seller info; closes the spider after 5 items."""
        ml_item = MercadoItem()
        # Product info.
        ml_item['titulo'] = response.xpath(
            'normalize-space(//*[@class="item-title__primary"]/text())'
        ).extract_first()
        ml_item['folio'] = response.xpath(
            'normalize-space(//*[@class="item-info__id-number"]/text())'
        ).extract()
        ml_item['precio'] = response.xpath(
            'normalize-space(//span[@class="price-tag-fraction"]/text())'
        ).extract()
        ml_item['envio'] = response.xpath(
            'normalize-space(//*[@class="shipping-method-title"]/text())'
        ).extract()
        ml_item['ubicacion'] = response.xpath(
            'normalize-space(//*[@class="custom-address"]//text())').extract()
        ml_item['ventas_producto'] = response.xpath(
            'normalize-space(//*[@class="item-conditions"]/text())').extract()
        # Store / seller info.
        ml_item['vendedor_url'] = response.xpath(
            '//*[@class="reputation-view-more card-block-link"]/@href'
        ).extract()
        # FIX: both expressions below were missing their closing parenthesis,
        # which raised ValueError ("invalid XPath") at runtime.
        ml_item['ventas_vendedor'] = response.xpath(
            'normalize-space(//*[@class="reputation-relevant"][2]/strong)'
        ).extract()
        ml_item['reputacion'] = response.xpath(
            'normalize-space(//*[@class="reputation-relevant"][1]/strong)'
        ).extract()
        self.item_count += 1
        if self.item_count > 5:
            raise CloseSpider('item_exceeded')
        yield ml_item
class RutlandcyclingSpider(CrawlSpider):
    """Crawl rutlandcycling.com, yielding one Product per attribute variant."""
    name = 'zyro-rutlandcycling.com'
    allowed_domains = ['rutlandcycling.com']
    start_urls = ('http://www.rutlandcycling.com',)

    rules = (
        # Category navigation plus the "next" paging link.
        Rule(LinkExtractor(restrict_css='.ctrNavigation, #lnkNextTop')),
        # Product microdata containers.
        Rule(LinkExtractor(restrict_xpaths='//div[@itemtype="http://schema.org/Product"]'),
             callback='parse_product')
    )

    def _manual_parse(self, response):
        # Legacy manual crawling, superseded by `rules`.
        # FIX: renamed from `_parse` — recent Scrapy defines Spider._parse
        # internally, and overriding it with a different signature breaks
        # request scheduling.
        for url in response.css('.ctrNavigation a::attr(href)').extract():
            yield Request(response.urljoin(url), callback=self.parse)
        for url in response.xpath('//div[@itemtype="http://schema.org/Product"]//a/@href').extract():
            yield Request(response.urljoin(url), callback=self.parse_product)

    def parse_product(self, response):
        """Yield one Product per row of the attribute (variant) grid."""
        # FIX: removed the unused HtmlXPathSelector instance.
        base_url = get_base_url(response)
        name = response.xpath('//h1/text()').extract()[0]
        identifier = response.xpath('//*[@id="currentProduct"]/@value').extract()[0]
        sku = response.xpath('//p[contains(., "Code")]/span[@class="seasonCode"]/text()').extract()
        sku = sku[0] if sku else ''
        brand = response.xpath('//p[contains(., "Brand")]/span[@class="seasonCode"]/text()').extract()
        brand = brand[0] if brand else ''
        image_url = response.css('.mainImages ::attr(data-image)').extract()
        category = response.xpath('//div[@class="breadcrumbs"]//a/text()').extract()[1:-1]
        products = response.xpath('//div[@class="clAttributeGridContainer"]/div')
        for product in products:
            product_loader = ProductLoader(item=Product(), selector=product)
            # FIX: Selector.select() was removed from Scrapy; use .xpath().
            p_name = product.xpath('div[@id="attName"]/div/text()').extract()[0]
            p_name = name + ' ' + p_name.replace('On Sale - ', '')
            p_identifier = product.xpath('div[@id="attCode"]/text()').extract()[0]
            price = product.xpath('div[@id="attPrice"]/span[@id]/text()').extract()[0]
            price = extract_price(price)
            out_of_stock = product.xpath('div[@id="attStockMessage"]/span[@class="OutofStockCSS"]').extract()
            product_loader.add_value('identifier', identifier + '_' + p_identifier)
            product_loader.add_value('name', p_name)
            product_loader.add_value('sku', sku)
            if image_url:
                product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
            product_loader.add_value('price', price)
            if price < 20:
                product_loader.add_value('shipping_cost', 3.49)
            if out_of_stock:
                product_loader.add_value('stock', 0)
            product_loader.add_value('category', category)
            product_loader.add_value('brand', brand)
            product_loader.add_value('url', response.url)
            yield product_loader.load_item()
class BMStores(CrawlSpider):
    """Crawl B&M toys & games categories; pagination is fetched via AJAX.

    FIX: the class inherited from plain Spider, which neither processes
    `rules` nor defines parse(), so the crawl rules were dead and every start
    response raised NotImplementedError. CrawlSpider activates them.
    """
    name = 'toymonitor-bmstores'
    allowed_domains = ['bmstores.co.uk']
    start_urls = ['http://www.bmstores.co.uk/products/toys-and-games']

    categories = LinkExtractor(restrict_css='ul.aside-list')
    products = LinkExtractor(restrict_css='a.product')
    rules = (Rule(categories, callback='parse_pages', follow=True),
             Rule(products, callback='parse_product'))

    def parse_pages(self, response):
        """Turn the pagination widget into AJAX product-filter requests."""
        category_id = response.xpath('//script/text()').re(
            "categoryID: '(.+)'")[0]
        for page in response.css(
                'div.pagination ::attr(data-pageto)').extract():
            url = 'http://www.bmstores.co.uk/hpcProduct/productbyfilter/ajaxmode/1?categoryID=%s&sort=datehigh&perPage=36&pageNum=%s' % (
                category_id, page)
            yield Request(url, self.parse_page)

    def parse_page(self, response):
        """Parse one AJAX page: queue further pages and product requests."""
        data = json.loads(response.body)
        if not data['success']:
            self.logger.warning('Failed pagination %s' % response.url)
        selector = Selector(text=data['paginationLink'])
        for page in selector.css(
                'div.pagination ::attr(data-pageto)').extract():
            url = add_or_replace_parameter(response.url, 'pageNum', page)
            yield Request(url, self.parse_page)
        selector = Selector(text=data['pageHTML'])
        for url in selector.css('a.product::attr(href)').extract():
            yield Request(response.urljoin(url), self.parse_product)

    def parse_product(self, response):
        """Build a Product item; products behind a login page are skipped."""
        if 'login.cfm' in response.url:
            return
        loader = ProductLoader(Product(), response=response)
        identifier = response.url.split('/')[-1]
        # FIX: hashlib.md5 requires bytes on Python 3; encode first.
        identifier = hashlib.md5(identifier.encode('utf-8')).hexdigest()
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_css('name', 'h1.content-title::text')
        loader.add_xpath('price', '//script/text()', re='price": "(.+)"')
        loader.add_xpath('sku', '//script/text()', re='sku": "(.+)"')
        category = response.xpath(
            '//ul[@id="breadcrumbs"][1]//a/text()').extract()[1:-1]
        loader.add_value('category', category)
        image_url = response.css(
            'div.product-detail-feature-img img::attr(src)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath('brand', '//meta[@property="og:brand"]/@content')
        stock = response.xpath('//script/text()').re('availability": "(.+)"')
        if stock and stock[0] != 'In stock':
            loader.add_value('stock', 0)
        yield loader.load_item()
class AqiSpider(CrawlSpider):
    """Crawl aqistudy.cn historical AQI data down to per-day tables."""
    name = 'aqi_crawl'
    allowed_domains = ['aqistudy.cn']
    # Entry point: the historical-data index page.
    start_urls = ['https://www.aqistudy.cn/historydata/']

    rules = (
        # City month-overview pages: extracted and followed automatically.
        Rule(LinkExtractor(allow=r'monthdata\.php')),
        # Day-detail pages: parsed for daily rows, not followed further.
        Rule(LinkExtractor(allow=r"daydata\.php"), callback="parse_day",
             follow=False),
    )

    # Mapping of item field -> relative XPath inside one table row.
    _ROW_FIELDS = (
        ('date', './td[1]/text()'),     # date
        ('aqi', './td[2]/text()'),      # AQI
        ('level', './td[3]//text()'),   # quality level
        ('pm2_5', './td[4]/text()'),    # PM2.5
        ('pm10', './td[5]/text()'),     # PM10
        ('so_2', './td[6]/text()'),     # sulphur dioxide
        ('co', './td[7]/text()'),       # carbon monoxide
        ('no_2', './td[8]/text()'),     # nitrogen dioxide
        ('o3', './td[9]/text()'),       # ozone
    )

    def parse_day(self, response):
        """Yield one AqiItem per daily table row (engine -> pipeline)."""
        item = AqiItem()
        # The page title embeds the city name between fixed prefix/suffix text.
        title = response.xpath('//*[@id="title"]/text()').extract_first()
        item['city_name'] = title[8:-11]
        # Skip the header row, then fill the item from each data row.
        for row in response.xpath('//tr')[1:]:
            for field, xpath in self._ROW_FIELDS:
                item[field] = row.xpath(xpath).extract_first()
            yield item
class Coverbrands(CrawlSpider):
    """Crawl coverbrands.no products, expanding Magento config options."""
    name = "blivakker-coverbrands"
    allowed_domains = ['coverbrands.no']
    start_urls = ['http://www.coverbrands.no/']

    rules = (
        Rule(LinkExtractor(restrict_xpaths='//ul[@id="nav"]',
                           restrict_css='.pages')),
        Rule(LinkExtractor(restrict_css='.products-grid',
                           process_value=url_query_cleaner),
             callback='parse_product'))

    def parse_product(self, response):
        """Build Product items from the Nosto metadata block on the page."""
        loader = ProductLoader(item=Product(), response=response)
        css = '.nosto_product .%s ::text'
        loader.add_css('identifier', css % 'product_id')
        loader.add_css('sku', css % 'product_id')
        for field in ('url', 'name', 'image_url', 'brand'):
            loader.add_css(field, css % field)
        list_price = response.css(css % 'list_price').extract_first()
        sales_price = response.css(css % 'price').extract_first()
        loader.add_value('price', list_price)
        # FIX: extract_first() may return None; `'InStock' not in None`
        # raises TypeError. Treat a missing availability as out of stock.
        availability = response.css(css % 'availability').extract_first() or ''
        if 'InStock' not in availability:
            loader.add_value('stock', 0)
        category = response.css(css % 'category').extract_first()
        loader.add_value('category', category.split('/')[-1])
        options_data = response.xpath('//script/text()').re(
            'Product.Config.({.+})')
        if not options_data:
            # Simple product: record the discount (if any) and finish.
            item = loader.load_item()
            if sales_price != list_price:
                item['metadata'] = {'SalesPrice': Decimal(sales_price)}
            yield item
            return
        options_data = json.loads(options_data[0])
        if len(options_data['attributes']) > 1:
            self.log('More than one options attributes found on %s' %
                     response.url)
            return
        price = loader.get_output_value('price')
        name = loader.get_output_value('name')
        sales_price = Decimal(sales_price)
        # FIX: dict.values() is a non-indexable view on Python 3; take the
        # single attribute via an iterator instead of `.values()[0]`.
        attribute = next(iter(options_data['attributes'].values()))
        for option in attribute['options']:
            new_price = sales_price + Decimal(option['price'])
            loader.replace_value('price', price + Decimal(option['oldPrice']))
            loader.replace_value('name', name + ' ' + option['label'])
            loader.replace_value('identifier', option['products'][0])
            loader.replace_value('sku', option['products'][0])
            loader.replace_xpath(
                'image_url',
                '//li[@id="simple-product-image-%s"]/a/@href' %
                option['products'][0])
            item = loader.load_item()
            if price + Decimal(option['oldPrice']) != new_price:
                item['metadata'] = {'SalesPrice': new_price}
            yield item
class SinaSpeicalSpider(CrawlSpider):
    """Crawl the Rio 2016 sina.com.cn medal table and related news pages."""
    name = "sina_special"
    #allowed_domains = ["www.51job.com"]
    start_urls = ('http://match.2016.sina.com.cn/medals/', )

    rules = (
        # China news articles.
        Rule(LinkExtractor(
            allow=(r'2016.sina.com.cn/china/[0-9\-]*/doc-if[a-z0-9]*.shtml', )),
             callback='parse_one_news', follow=True),
        # Brazil / sidebar news articles.
        Rule(LinkExtractor(
            allow=(r'2016.sina.com.cn/brazil/[0-9\-]*/doc-if[a-z0-9]*.shtml',
                   r'2016.sina.com.cn/side/[0-9\-]*/doc-if[a-z0-9]*.shtml')),
             callback='parse_one_news', follow=True),
        # Everything else on the site except PHP endpoints and video pages.
        Rule(LinkExtractor(allow=('2016.sina.com.cn', ), deny=(
            'php$',
            'php?',
            'video.sina.com.cn', )), follow=True),
    )

    def parse(self, response):
        """Parse the medal-table rows into SpecItem objects."""
        try:
            rows = response.css("table[class='tb_02 tb_04'] tr[class='sub']")
            for row in rows:
                item = SpecItem()
                item['url'] = response.url
                item['kind'] = 1
                item['rank'] = row.css(
                    "td[class='w01'] ::text").extract()[0].strip()
                item['country'] = row.css(
                    "td[class='w02'] a::text").extract()[0].strip()
                item['gold'] = row.css(
                    "td[class='w03'] a::text").extract()[0].strip()
                item['silver'] = row.css(
                    "td[class='w04'] a::text").extract()[0].strip()
                item['bronze'] = row.css(
                    "td[class='w05'] a::text").extract()[0].strip()
                item['total'] = row.css(
                    "td[class='w06'] a::text").extract()[0].strip()
                yield item
        except Exception as e:
            self.logger.error("parse url:%s err:%s", response.url, e)
            # FIX: bare return — `return []` is misleading inside a generator,
            # and the original trailing `return item` after it was unreachable
            # and referenced a possibly-unbound name. Also removed the unused
            # inner helper `do_item`.
            return
class FeneticWellbeing(CrawlSpider):
    """Crawl feneticwellbeing.com shop pages, expanding WooCommerce variants."""
    name = 'betterlife_healthcare-feneticwellbeing'
    allowed_domains = ['feneticwellbeing.com']
    start_urls = ['http://www.feneticwellbeing.com/']

    categories = LinkExtractor(allow='/product-category/')
    products = LinkExtractor(allow='/shop/')
    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        """Yield the base product, or one item per variation when present."""
        base_loader = ProductLoader(Product(), response=response)
        # The product id comes from the cart form; when absent, fall back to
        # the schema.org container id and mark the product out of stock.
        identifier = (
            response.xpath('//input[@name="product_id"]/@value').extract_first()
            or response.xpath('//input[@name="add-to-cart"]/@value').extract_first())
        if not identifier:
            base_loader.add_value('stock', 0)
            identifier = response.xpath(
                '//div[@itemtype="http://schema.org/Product"]/@id').re_first(
                    r'product-(\d+)')
        base_loader.add_value('identifier', identifier)
        base_loader.add_css('sku', 'span.sku::text')
        base_loader.add_value('url', response.url)
        base_loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        base_loader.add_css('price', '.product-price-exvat span.amount::text')
        base_loader.add_css('price', '.product-price span.amount::text')
        base_loader.add_value('category', response.xpath(
            '//span[@class="posted_in"][contains(., "Categories:")]/a/text()'
        ).extract_first())
        base_loader.add_css('image_url',
                            'div.single-product-main-image a::attr(href)')
        base_loader.add_value('brand', response.xpath(
            '//span[@class="posted_in"][contains(., "Brands:")]/a/text()'
        ).extract_first())
        base_item = base_loader.load_item()

        raw_variations = response.xpath(
            '//@data-product_variations').extract_first()
        if not raw_variations:
            yield base_item
            return
        for variant in json.loads(raw_variations):
            variant_loader = ProductLoader(Product(), response=response)
            variant_loader.add_value(None, base_item)
            variant_loader.replace_value('identifier', variant['variation_id'])
            variant_loader.replace_value('sku', variant['sku'])
            variant_loader.replace_value('price', variant['display_price'])
            if variant['image_link']:
                variant_loader.replace_value('image_url', variant['image_link'])
            variant_loader.add_value('name', variant['attributes'].values())
            yield variant_loader.load_item()
class Ocado(CrawlSpider):
    """Crawl ocado.com bedding categories and scrape product pages."""
    name = 'e-bedding-ocado'
    allowed_domains = ['ocado.com']
    start_urls = [
        'https://www.ocado.com/webshop/getCategories.do?tags=|30931|126580'
    ]

    categories = LinkExtractor(restrict_css='#navigationSidebar .superNav')
    products = LinkExtractor(restrict_css='.productTitle', allow='/product/',
                             process_value=url_query_cleaner)
    rules = (Rule(categories, callback='parse_category', follow=True),
             Rule(products, callback='parse_product'))

    def parse_category(self, response):
        """Request every listing index position so all products are exposed."""
        count = response.css('#productCount em::text').re(r'\d+')[0]
        # FIX: xrange does not exist on Python 3; range is equivalent here.
        for idx in range(int(count)):
            url = add_or_replace_parameter(response.url, 'index', idx)
            yield Request(url)

    def parse_product(self, response):
        """Build a Product item; fans out over pack-size options first."""
        options = response.css('.pg_select')
        if options:
            selected_option = options.xpath('option[@selected]')
            if not selected_option:
                for url in options.xpath('.//@data-href').extract():
                    yield Request(response.urljoin(url_query_cleaner(url)),
                                  self.parse_product)
                return
        loader = ProductLoader(Product(), response=response)
        sku = response.xpath(
            '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
        loader.add_value('identifier', sku)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//strong[@itemprop="name"]/text()')
        loader.add_css('price', 'div.show h5 ::text')
        loader.add_css('price', '.nowPrice ::text')
        loader.add_css('price', '.typicalPrice h5 ::text')
        category = response.xpath('//input[@name="productDetailsDTO"]/@value'
                                  ).re('"category":"(.+?)"')
        if category:
            loader.add_value('category', category[0].split('/'))
        image_url = response.css(
            'ul#galleryImages a::attr(href)').extract_first()
        if image_url:
            loader.add_value('image_url', response.urljoin(image_url))
        loader.add_xpath(
            'brand',
            '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
        if response.css('div#content p.oos'):
            loader.add_value('stock', 0)
        yield loader.load_item()
class RebelSport(CrawlSpider):
    """Crawl rebelsport.com.au soccer fan gear and scrape product variants."""
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = [
        'http://www.rebelsport.com.au/store/fangear/soccer-football/604'
    ]
    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(
            url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))
    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        """Scrape the displayed variant and schedule its sibling variants."""
        # The page embeds variant data as escaped JSON inside a script tag.
        data = response.xpath('//script/text()').re(r'{\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)
        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath(
            '//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        # range() instead of xrange(): xrange is a NameError on Python 3.
        for i in range(3):
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        if price:  # guard: pop() on an empty list would raise IndexError
            loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])  # drop the "Home" crumb
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(
            r'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item
class SinaOlySpider(CrawlSpider):
    """Crawl 2016.sina.com.cn Olympics news and extract article metadata."""
    name = "sina_oly"
    #allowed_domains = ["www.51job.com"]
    start_urls = ('http://2016.sina.com.cn/', )
    rules = (
        # China-desk articles.
        Rule(LinkExtractor(
            allow=(r'2016.sina.com.cn/china/[0-9\-]*/doc-if[a-z0-9]*.shtml', )),
            callback='parse_one_news', follow=True),
        # Brazil-desk and sidebar articles.
        Rule(LinkExtractor(
            allow=(r'2016.sina.com.cn/brazil/[0-9\-]*/doc-if[a-z0-9]*.shtml',
                   r'2016.sina.com.cn/side/[0-9\-]*/doc-if[a-z0-9]*.shtml')),
            callback='parse_one_news', follow=True),
        # Follow everything else on the site except PHP endpoints and video.
        Rule(LinkExtractor(allow=('2016.sina.com.cn', ),
                           deny=('php$', 'php?', 'video.sina.com.cn', )),
             follow=True),
    )

    def parse_one_news(self, response):
        """Parse one article page into a NewsItem (url, title, publish
        date, picture caption and keywords); skip the page on any error."""
        def first_or_value(value):
            # css().extract() returns a list; keep only the first element.
            if value and isinstance(value, list):
                return value[0]
            return value

        item = NewsItem()
        try:
            item['url'] = response.url
            item['title'] = first_or_value(
                response.css(
                    "div[class='blkContainerSblk'] h1::text").extract())
            art_info = response.css("div[class='artInfo']")
            item['publish'] = first_or_value(
                art_info.css("span[id='pub_date']::text").extract())
            item['pic_title'] = first_or_value(
                response.css("span[class='img_descr'] ::text").extract())
            item['keywords'] = first_or_value(
                response.css("p[class='art_keywords'] a::text").extract())
        except Exception as e:
            # Boundary handler: log and skip the page rather than abort crawl.
            self.logger.error("parse url:%s err:%s", response.url, e)
            return []
        return item
class LifeSpider(CrawlSpider):
    """Crawl lifeinformatica.com laptop listings and scrape product data."""
    name = 'life'
    # Optional cap on the number of scraped items (currently disabled).
    #item_count = 0
    # FIX: the attribute Scrapy reads is `allowed_domains` (plural) and it
    # takes bare domain names, not URLs; `allowed_domain = ['https://...']`
    # was silently ignored, disabling offsite filtering.
    allowed_domains = ['lifeinformatica.com']
    # for laptops
    start_urls = [
        'https://lifeinformatica.com/categoria-producto/family-ordenadores-y-portatiles/family-portatiles-y-accesorios/family-portatiles/'
    ]
    # for smartphones
    #start_urls = ['https://lifeinformatica.com/categoria-producto/family-tablets-y-moviles/family-smartphones-y-accesorios/family-smartphones/']
    # FIX: rules must be an ordered sequence; the old set literal `{...}`
    # iterates in arbitrary order, making rule priority non-deterministic.
    rules = (
        # go through every page with the next button
        Rule(LinkExtractor(
            allow=(),
            restrict_xpaths=('//nav[@class="electro-advanced-pagination"]/a'))),
        # go inside every product on the page
        Rule(LinkExtractor(
            allow=(),
            restrict_xpaths=('//div[@class="product-loop-header"]')),
            callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Extract one LifeItem from a product detail page."""
        item = LifeItem()
        # product information
        item['brand'] = response.xpath(
            '//span[@itemprop="brand"]//text()').extract()
        item['name'] = response.xpath(
            '//h1[@class="product_title entry-title"]//text()').extract()
        item['price'] = response.xpath(
            '//span[@class="entero"]//text()').extract()
        # The same span holds the decimal part first, then the currency symbol.
        item['decimals'] = response.xpath(
            '//span[@class="decimales_precio"]//text()')[0].extract()
        item['currency'] = response.xpath(
            '//span[@class="decimales_precio"]//text()')[1].extract()
        item['price_without_vat'] = response.xpath(
            '//p[@class="sinIva"]//text()').extract()
        item['availability'] = response.xpath(
            '//span[@class="no_stock"]//text()').extract()
        item['description'] = response.xpath(
            '//div[@class="electro-description clearfix"]/p/text()').extract()
        yield item
class ShoucaiSpider(CrawlSpider):
    """Crawl shoucainu8.com finished-investment listings into ShoucainuItem."""
    name = 'shoucai'
    start_urls = ['https://www.shoucainu8.com/Invest/llist/status/3']
    rules = (
        # Pagination links. FIX: escape the dot before "html" so the regex
        # no longer matches an arbitrary character there.
        Rule(LinkExtractor(
            allow=(r'https://www.shoucainu8.com/Invest/llist/status/3/p/\d+\.html'))),
        # Product detail pages.
        Rule(LinkExtractor(
            allow=(r'https://www.shoucainu8.com/invest/detail/sn/\d+')),
            callback='parse_item'),
    )

    # Item fields mapped, in order, to rows 1..9 of the details table.
    _TABLE_FIELDS = (
        'wealth_starting_amount_or_username',
        'wealth_interest_bearing_method_or_id',
        'wealth_phone_number_or_product_manual',
        'wealth_excepted_return_or_type_of_loan',
        'wealth_redemption_exit_or_use_of_the_loan',
        'wealth_asset_type',
        'wealth_market_value',
        'wealth_payback',
        'wealth_risk_control',
    )

    def parse_item(self, response):
        """Extract one investment product's details into a ShoucainuItem."""
        item = ShoucainuItem()
        item['wealth_title'] = response.xpath(
            '//div[@class="invest-title"]/h2/text()').extract()
        item['wealth_interest_rate'] = response.xpath(
            '//p[@class="rate"]/text()').extract()
        item['wealth_sum'] = response.xpath(
            '//p[@class="total"]/text()').extract()
        item['wealth_deadline'] = response.xpath(
            '//p[@class="duration"]/text()').extract()
        # The details table carries one field per row; map rows by position
        # instead of repeating the same xpath nine times.
        for row, field in enumerate(self._TABLE_FIELDS, start=1):
            item[field] = response.xpath(
                '//table[@class="table-details"]/tr[%d]/td/text()' % row
            ).extract()
        yield item
class WatchO(CrawlSpider):
    """Crawl watcho.co.uk watches/clocks and scrape products, inferring a
    manufacturer SKU from the product name and the meta keywords tag."""
    name = 'bablas-watcho'
    allowed_domains = ['watcho.co.uk']
    start_urls = ('http://www.watcho.co.uk/watches.html',
                  'http://www.watcho.co.uk/Clocks.html')
    # Only follow subcategory links that appear after the two start categories.
    categories = LinkExtractor(
        restrict_css='div.SubCategoryListGrid',
        restrict_xpaths='//a[@href="%s" or @href="%s"]/following-sibling::*'
        % start_urls)
    pages = LinkExtractor(restrict_css='div.CategoryPagination')
    products = LinkExtractor(restrict_css='div.ProductDetails')
    rules = (Rule(categories), Rule(pages),
             Rule(products, callback='parse_product'))

    def parse_product(self, response):
        """Scrape one product page into a Product item."""
        loader = ProductLoader(Product(), response=response)
        identifier = response.xpath(
            '//input[@name="product_id"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_xpath('price', '//meta[@itemprop="price"]/@content')
        category = response.xpath(
            '//div[@id="ProductBreadcrumb"]//a/text()').extract()[1:]
        loader.add_value('category', category)
        loader.add_xpath('image_url', '//img[@itemprop="image"]/@src')
        loader.add_xpath(
            'brand',
            '//div[@itemtype="http://schema.org/Organization"]/meta[@itemprop="name"]/@content')
        if not response.xpath(
                '//link[@itemprop="availability"]/@href[contains(., "InStock")]'):
            loader.add_value('stock', 0)

        # Heuristic SKU detection: prefer the trailing token of the name or
        # the shortest meta keyword when one of them looks like a model number.
        sku = identifier
        name = loader.get_output_value('name') or ''
        # FIX: re.search() returns None on an empty/missing name; guard it
        # instead of crashing on .group().
        name_match = re.search(r'\S+$', name)
        name_end = name_match.group(0).strip(' ()') if name_match else ''
        # FIX: extract_first() is None when the keywords meta tag is missing;
        # the old .split(',') on it raised AttributeError.
        keywords = (response.xpath(
            '//meta[@name="keywords"]/@content').extract_first() or '').split(',')
        keywords = [word.strip() for word in keywords if word]
        shortest_keyword = min(keywords, key=len) if keywords else 'none'
        from_name = re.findall(r'\S*\d+\S*', name)
        if shortest_keyword.lower() == name_end.lower():
            sku = name_end
        elif shortest_keyword.upper() == shortest_keyword:
            sku = shortest_keyword
        elif name_end and name_end.upper() == name_end:
            sku = name_end
        elif from_name:
            sku = max(from_name, key=len)
        if '(' in sku:
            # Parenthesised candidates are descriptions, not model numbers.
            sku = identifier
        loader.replace_value('sku', sku)
        yield loader.load_item()