def get_app(self, response):
    """Parse a Google Play Store app detail page into a PlayStoreItems item.

    Populates the item loader field-by-field from CSS/XPath selectors and
    yields the single loaded item.  Relative URLs (category, author link)
    are made absolute against response.url via a Compose input processor.
    """
    il = ItemLoader(item=PlayStoreItems(), response=response)
    il.add_css('app_id', '.details-wrapper::attr(data-docid)')
    il.add_css('name', '.document-title div::text')
    il.add_css('category', '.category span::text')
    il.add_css(
        'category_url', '.category::attr(href)',
        Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    il.add_css('price', '.details-actions .price span::text')
    il.add_css('offers_in_app_purchases', '.inapp-msg::text')
    il.add_css('stars_count', '.stars-count::text')
    il.add_css('video', '.details-trailer > span::attr(data-video-url)')
    il.add_css('screenshots', '.screenshot::attr(src)')
    il.add_xpath(
        'description',
        '//div[contains(@class, "show-more-content")]/div//text()')
    il.add_css('update_date', '[itemprop="datePublished"]::text')
    il.add_css('file_size', '[itemprop="fileSize"]::text')
    il.add_css('installs', '[itemprop="numDownloads"]::text')
    il.add_css('current_version', '[itemprop="softwareVersion"]::text')
    il.add_css('requires_android', '[itemprop="operatingSystems"]::text')
    il.add_css('offered_by', '[itemprop="author"] > a span::text')
    il.add_css(
        'offered_by_url', '[itemprop="author"] > a::attr(href)',
        Compose(lambda urls: [urljoin(response.url, url) for url in urls]))
    yield il.load_item()
def test_compose(self):
    """Compose chains callables left-to-right; by default a None result
    short-circuits the chain instead of being passed along."""
    first_then_upper = Compose(lambda v: v[0], str.upper)
    self.assertEqual(first_then_upper(['hello', 'world']), 'HELLO')
    # Default stop_on_none=True: a None input is returned untouched.
    upper_only = Compose(str.upper)
    self.assertEqual(upper_only(None), None)
    # With stop_on_none disabled, None reaches str.upper and raises.
    strict_upper = Compose(str.upper, stop_on_none=False)
    self.assertRaises(TypeError, strict_upper, None)
class Cogis_spillItemLoader(ItemLoader):
    """Loader for COGIS spill records.

    Defaults to "first matched node, then extract its text"; link and date
    fields swap in their own extractors; county fields arrive as plain
    strings already.
    """
    default_input_processor = Compose(TakeFirst(), extract_text)
    # Document link wants the href, not the node text.
    doc_href_in = Compose(TakeFirst(), extract_link)
    date_in = Compose(TakeFirst(), extract_date)
    county_code_in = TakeFirst()
    county_name_in = TakeFirst()
    default_output_processor = TakeFirst()
class ArticleItemLoader(ItemLoader):
    """Loader for ArticleItem: single-valued fields by default, with
    site-specific title/content post-processors."""
    default_item_class = ArticleItem
    default_output_processor = TakeFirst()
    # Title: first match, then run through the Net39 title cleaner.
    title_out = Compose(TakeFirst(), Net39ArticleTitle())
    # Content: concatenate all fragments before cleaning.
    content_out = Compose(Join(''), Net39ArticleContent())
class StackOverflowUser(Item):
    """Item describing a StackOverflow question owner."""
    user_id = Field(output_processor=TakeFirst())
    # stackoverflow user name; get_the_last_name keeps the last value.
    user_name = Field(output_processor=Compose(
        get_the_last_name)  # get the last name
    )
    # link of user in stackoverflow.
    # NOTE(review): reuses get_the_last_name for the link as well —
    # presumably it just returns the last collected value; confirm.
    user_link = Field(output_processor=Compose(
        get_the_last_name)  # get the owner link
    )
class CommentItemLoader(ItemLoader):
    """Loader for CommentItem: strips every input value, and by default
    emits the first collected value (stripped again on output)."""
    default_item_class = CommentItem
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = Compose(TakeFirst(), lambda x: x.strip())
    default_selector_class = Selector
    # Post body / comment body: join fragments with spaces, then trim.
    textpost_out = Compose(Join(" "), lambda x: x.strip())
    comments_out = Compose(TakeFirst(), get_comments_count,
                           lambda x: x.strip())
    upvoted_out = Compose(TakeFirst(), get_upvoted, lambda x: x.strip())
    comment_out = Compose(Join(" "), lambda x: x.strip())
def parse_brand_list(self, response):
    """Parse a brand listing page and yield one Product per grid entry.

    Products missing a brand are re-requested via parse_brand with the
    partially-built item in meta.
    """
    hxs = HtmlXPathSelector(response)
    # products
    product_items = hxs.select('//div[@class="productGrid"]/ul/li/div[@class="item"]')
    category_items = hxs.select('//h1[@class="categoryLandingPageTitle_heading"]/a/text()').extract()
    category = category_items[0] if category_items else ''
    brand_name = get_brand_from_url(response.url)

    # Local closure so it can be passed as a Compose step below.
    def get_full_image_url(url):
        return get_full_url(response, url)

    for product_item in product_items:
        image_url = product_item.select(u'div[@class="prodimg"]/a/img/@src').extract()
        if image_url:
            image_url = get_full_url(response, image_url[0])
        ploadr = ProductLoader(item=Product(), selector=product_item, response=response)
        ploadr.add_xpath('name', 'div[@class="prodname"]/a/text()',
                         TakeFirst(), Compose(unicode.strip))
        # NOTE(review): get_full_image_url is reused here to absolutize
        # the product URL, despite the "image" in its name.
        ploadr.add_xpath('url', 'div[@class="prodname"]/a/@href',
                         TakeFirst(), Compose(unicode.strip),
                         Compose(get_full_image_url))
        ploadr.add_value('category', category)
        ploadr.add_value('image_url', image_url)
        price = ploadr.get_xpath('div[@class="proddetails"]//div[@class="prodnowprice"]/span/text()',
                                 TakeFirst(), Compose(extract_price))
        price_excl_vat = Decimal(price)
        ploadr.add_value('price', price_excl_vat)
        # Flat 5.00 delivery below a 50 threshold, free otherwise.
        ploadr.add_value('shipping_cost',
                         Decimal('5.00') if price_excl_vat < 50 else Decimal('0.0'))
        ploadr.add_xpath('sku', 'div[@class="proddetails"]//div[@class="proditemcode"]/a/span/following-sibling::text()',
                         TakeFirst(), Compose(unicode.strip))
        ploadr.add_value('identifier', ploadr.get_output_value('sku'))
        stock_info = product_item.select(u'div[@class="proddetails"]/div/div/span[contains(@class, "instock")]/@class').extract()
        buy_button = product_item.select(u'div[@class="proddetails"]/div[@class="prodquickbuy"]/a[@class="primaryBtn"]').extract()
        ploadr.add_value('brand', brand_name)
        # In stock when either the stock badge or a buy button is present.
        ploadr.add_value('stock', 1 if stock_info or buy_button else 0)
        item = ploadr.load_item()
        tmp = ''.join(product_item.select("//div[@class='proditemcode']//text()").extract())
        item['metadata'] = {'product_code': tmp.split(':')[-1].strip()}
        if not ploadr.get_output_value('brand'):
            yield Request(item['url'], meta={'item': item}, callback=self.parse_brand)
        else:
            yield item
class DealLoader(DaywatchLoader):
    """Loader for daily-deal items: joined/trimmed text fields plus two
    helpers that parse price+currency and report failures centrally."""
    offer_out = Compose(Join(), whitespace_trimmer)
    discount_out = Compose(Join(), parse_discount)
    sold_count_out = Compose(Join(), parse_sold_count)
    description_out = Compose(Join(), whitespace_trimmer)
    is_main_deal_out = Compose(TakeFirst(), int)
    city_out = Compose(Join(), whitespace_trimmer)
    merchant_name_out = Compose(Join(), whitespace_trimmer)
    merchant_city_out = Join()
    merchant_lat_out = Compose(TakeFirst(), parse_en_float_str)
    merchant_lon_out = Compose(TakeFirst(), parse_en_float_str)
    merchant_address_out = Compose(Join(), whitespace_trimmer)
    merchant_postcode_out = Join()
    merchant_phone_out = Join(";")
    merchant_website_out = TakeFirst()
    merchant_email_out = Join()

    def load_price_currency(self, xpath, spider_name=''):
        """Extract text at *xpath*, parse price+currency from it and add
        both to the item.  On any failure, record MISSING_VALUE for the
        price, fall back to the site's country currency, and report a
        critical parsing error.
        """
        try:
            text = ' '.join(self.selector.xpath(xpath).extract())
            (price, currency) = parse_price_currency(text, self.context)
            self.add_value(F_PRICE, price)
            self.add_value(F_CURRENCY, currency)
        except Exception as e:
            self.add_value(F_PRICE, MISSING_VALUE)
            site = SITE_MODEL.objects.get(spider_name=spider_name)
            self.add_value(F_CURRENCY, site.country.currency)
            raise_missing_value(spider_name=self.context['spider_name'],
                                field_name=F_PRICE,
                                url=self.context[F_URL],
                                exception=e,
                                level=STATUS_CRITICAL,
                                message="Could not load price and currency.",
                                category=PARSING_ERROR)

    def load_price_currency_from_str(self, text):
        """Like load_price_currency but parses an already-extracted string.

        NOTE(review): `currency` is unpacked but never added to the item
        here, unlike load_price_currency — confirm whether intentional.
        """
        try:
            (price, currency) = parse_price_currency(text, self.context)
            self.add_value(F_PRICE, price)
        except Exception as e:
            self.add_value(F_PRICE, MISSING_VALUE)
            raise_missing_value(spider_name=self.context['spider_name'],
                                field_name=F_PRICE,
                                url=self.context[F_URL],
                                exception=e,
                                level=STATUS_CRITICAL,
                                message="Could not load price and currency.",
                                category=PARSING_ERROR)
class ArticleLoader(XPathItemLoader):
    """Loader that builds an ArticleItem from a dict of field->xpath rules
    (`search`), optionally skipping articles that already exist."""
    category_loader = CategoryLoader
    tagline_loader = TaglineLoader
    url_out = Join()
    body_text_out = Compose(Join(), TextTool(normalize=True))
    # patch_table maps the Russian month name to English for parsing.
    date_of_out = Compose(TakeFirst(), ParseDate(patch_table={u"мая": "May"}))
    headline_out = Compose(Join(), TextTool(normalize=True))
    #image_urls_out = FullUrl()

    def __init__(self, search, response, check_exists=False):
        self.search = search
        self.response = response
        self.check_exists = check_exists
        super(ArticleLoader, self).__init__(item=ArticleItem(),
                                            response=self.response)

    def load_item(self):
        """Apply the xpath rules, then load; returns None when the article
        already exists (check_exists) or has no headline."""
        self._configure_main_rules()
        if self.check_exists and ArticleChecker(
                url=self.response.url,
                headline=self.get_output_value('headline')).exists():
            return
        result = super(ArticleLoader, self).load_item()
        return result if result['headline'] else None

    def _configure_main_rules(self):
        # URL always comes from the response, not from a rule.
        self.add_value('url', self.response.url)
        for field, xpath in self.search.iteritems():
            try:
                self.add_xpath(field, xpath)
            except KeyError:
                # Rule targets a field the item does not declare — skip.
                continue

    def _load_category_item(self):
        try:
            category_loader = self.category_loader(self.search, self.response)
            category = category_loader.load_item()
            if not (category and category['name']):
                raise ValueError()
        except (KeyError, ValueError):
            pass  # category rule missing or empty — silently skip
        else:
            self._setup_text_processing(category)
            self.item["categories"].append(category)

    def _setup_text_processing(self, category):
        # Remove the category name from the body text during output.
        self.body_text_out.cut_substring = category['name']
class TimetableLoader(XPathItemLoader):
    """Loader for airport timetable rows: strips inputs, joins outputs by
    default, and title-cases city/airport/airline names."""
    default_output_processor = Join()
    default_input_processor = MapCompose(unicode.strip)
    flight_in = Compose(flight_handler)
    flight_type_in = Identity()
    flight_type_out = Compose(return_first)
    flight_status_in = Compose(flight_status_handler)
    flight_status_out = Compose(return_first)
    city_of_departure_out = Compose(title)
    city_of_arrival_out = Compose(title)
    airport_of_departure_out = Compose(title)
    airport_of_arrival_out = Compose(title)
    airline_in = Compose(airline_handler)
    airline_out = Compose(title)
class Product(Item):
    """Amazon-style product item: input processors normalize scraped
    strings (strip, drop thousands commas) and coerce to numbers."""
    id = Field()
    name = Field(input_processor=Compose(TakeFirst(), unicode.strip))
    price = Field(input_processor=Compose(TakeFirst(), unicode.strip,
                                          remove_comma, float))
    cat = Field()
    avgStars = Field(input_processor=Compose(only_elem_or_default, float))
    nReviews = Field(
        input_processor=Compose(only_elem, unicode.strip, remove_comma, int))
    salesRank = Field(
        input_processor=Compose(unicode.strip, remove_comma, int))
    subCatRank = Field(input_processor=Compose(
        only_elem_or_default, unicode.strip, remove_comma, int))
    subCat = Field(
        input_processor=Compose(only_elem_or_default, unicode.strip))
    manufact = Field(
        input_processor=Compose(only_elem_or_default, unicode.strip))
    referrer = Field()

    @property
    def export_filename(self):
        """Base name used when exporting items of this type."""
        return 'product'

    @property
    def key(self):
        """Deduplication key: the raw stored 'id' field value."""
        return self._values['id']
def convert_type(infs):
    """Build a Compose chain of converters from one or more config dicts.

    Each dict in *infs* names a conversion via its 'type' key ('join',
    'list', 'text', 'clean', 'unesc', 'base64', 'sub', 'jpath', 'map',
    'int', 'float', 'date', 'cst'; anything else is a pass-through) plus
    type-specific options ('sep', 'from'/'to', 'query', 'map'/'default',
    'fmt'/'tz').  Returns a Compose applying them in order.
    """
    def _wrapper(inf, t):
        def _convert(data):
            # Join/list handle lists themselves; everything else operates
            # on the first element only.
            if t not in ['join', 'list'] and isinstance(data, list):
                data = TakeFirst()(data)
            # Normalize: strip strings, stringify numbers/dates; any other
            # type is returned unconverted.
            if type(data) in [str, unicode]:
                data = data.strip()
            elif type(data) in [int, float, datetime]:
                data = str(data)
            else:
                return data
            if t == 'join':
                sep = inf.get('sep', u' ')
                return Join(sep)(data)
            elif t == 'list':
                sep = inf.get('sep', u' ')
                return remove_tags(Join(sep)(data)).strip()
            elif t == 'text':
                return remove_tags(data).strip()
            elif t == 'clean':
                # Strip styles/scripts/links/meta from embedded HTML.
                cleaner = Cleaner(style=True, scripts=True, javascript=True,
                                  links=True, meta=True)
                return cleaner.clean_html(data)
            elif t == 'unesc':
                return HTMLParser().unescape(data)
            elif t == 'base64':
                return base64.decodestring(data)
            elif t == 'sub':
                frm = inf.get('from')
                to = inf.get('to')
                return re.sub(frm, to, data)
            elif t == 'jpath':
                qs = inf.get('query')
                return jsonpath.jsonpath(json.loads(data), qs)
            elif t == 'map':
                m = inf.get('map')
                d = inf.get('default')
                return m.get(data, d)
            elif t == 'int':
                # via float so "3.0" parses too.
                return int(float(data))
            elif t == 'float':
                return float(data)
            elif t == 'date':
                fmt = inf.get('fmt', 'auto')
                tz = inf.get('tz', '+00:00')
                return parse_date(data, fmt, tz)
            elif t == 'cst':
                # 'cst' = China Standard Time, fixed +08:00 offset.
                fmt = inf.get('fmt', 'auto')
                return parse_date(data, fmt, '+08:00')
            else:
                return data
        return _convert
    # Accept a single dict or a list of dicts.
    infs = infs if type(infs) == list else [infs]
    return Compose(*[_wrapper(inf, inf.get('type', 'str')) for inf in infs])
class LegisladorItemLoader(XPathItemLoader):
    """Loader for Argentine legislator records: whitespace-fixed inputs,
    normalized chamber/district/bloc names, ISO-formatted mandate dates."""
    default_item_class = LegisladorItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()
    apellido_in = MapCompose(fix_space, format_personal_name)
    nombre_in = MapCompose(fix_space, format_personal_name)
    camara_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    distrito_nombre_in = MapCompose(fix_space, unicode.strip,
                                    normalize_distrito_name)
    bloque_nombre_in = MapCompose(fix_space, unicode.strip,
                                  normalize_bloque_name)
    mandato_inicio_in = MapCompose(fix_space, unicode.strip, spanish_date)
    # Dates come out as date objects; emit the ISO string of the first one.
    mandato_inicio_out = Compose(lambda v: v[0].isoformat())
    mandato_fin_in = MapCompose(fix_space, unicode.strip, spanish_date)
    mandato_fin_out = Compose(lambda v: v[0].isoformat())
class StoreLoader(XPathItemLoader):
    """Loader for store records: keeps the first stripped value per field
    and guarantees branch names carry the u'店' (shop) suffix."""

    # First collected value, then trim surrounding whitespace.
    default_output_processor = Compose(lambda values: values[0],
                                       unicode.strip)

    def branch_in(self, values):
        """Strip each branch name and append u'店' when it is missing."""
        suffix = u'店'
        for raw in values:
            name = raw.strip()
            if name.endswith(suffix):
                yield name
            else:
                yield name + suffix
def parse(self, response):
    """Follow pagination links and yield one Product per grid entry on
    this LEGO listing page."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    #pagination
    urls = hxs.select('//div[@class="pagination"]//a/@href').extract()
    for url in urls:
        yield Request(urljoin_rfc(base_url, url), callback=self.parse)
    #products
    products = hxs.select('//div[@class="product-grid"]/div')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/div[@class="image"]/a/img/@alt').extract()[0].strip()
        url = product.select('.//div[@class="image"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', './/div[@class="image"]/a/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        price = product.select(
            './/div[@class="price"]/span[@class="price-tax"]/text()'
        ).extract()
        price = extract_price(price[0].strip())
        loader.add_value('price', price)
        # LEGO set number embedded in the product name is used as SKU.
        results = re.search(r"\b([\d]+)\b", name)
        if results:
            loader.add_value('sku', results.group(1))
        # Numeric product id lives in the add-to-cart onclick handler.
        identifier = product.select(
            './/div[@class="cart"]/input/@onclick').re(r"([\d]+)")[0]
        loader.add_value('identifier', identifier)
        loader.add_value('brand', 'LEGO')
        loader.add_value('shipping_cost', 4.89)
        yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Parse a Magento product detail page into one Product item.

    Price, identifier, sku, brand, name and image come from on-page
    selectors; category is carried in via response.meta.  Orders at or
    below 50 get a flat shipping cost of 5.

    Fix: removed the unused local ``hxs = HtmlXPathSelector(response)``
    (dead code — the loader works directly off the response).
    """
    base_url = get_base_url(response)
    loader = ProductLoader(response=response, item=Product())
    loader.add_xpath(
        'price', '(//span[@class="price-including-tax"])[1]/span/text()',
        TakeFirst(), Compose(onlyDecimal))
    loader.add_xpath(
        'identifier',
        '//form[@id="product_addtocart_form"]//input[@name="product"]/@value'
    )
    loader.add_xpath(
        'sku',
        '//table[@id="product-attribute-specs-table"]//tr[contains(th/text(), "SKU")]/td/text()'
    )
    loader.add_xpath(
        'brand',
        '//table[@id="product-attribute-specs-table"]//tr[contains(th/text(), "Manufacturer")]/td/text()'
    )
    loader.add_value('url', urljoin_rfc(base_url, response.url))
    loader.add_xpath('name', '//div[@class="product-name"]/h1/text()')
    loader.add_xpath('image_url', '//p[@class="product-image"]//img/@src')
    loader.add_value('category', response.meta['category'])
    # Flat shipping on low-value orders.
    if int(loader.get_output_value('price')) <= 50:
        loader.add_value('shipping_cost', 5)
    yield loader.load_item()
class ExclusionLoader(scrapy_utils.MultiLoader):
    """Multi-field loader for exclusion listings keyed by table columns."""
    default_item_class = ExclusionItem
    keys = {
        'main_listing': [('name', 'id'), 'ansprechpartner', 'strasse',
                         'ort', 'tel', 'fax', 'email',],
    }
    default_keys = keys['main_listing']
    # Inputs are selector cells: flatten to text, then strip.
    default_input_processor = MapCompose(lambda x: x.xpath('string()'),
                                         lambda x: x.strip())
    default_output_processor = TakeFirst()

    # NOTE: these are deliberately plain functions (no self/staticmethod):
    # they are captured by MapCompose/Compose during class-body evaluation,
    # before they would become methods.
    def extract_id(url):
        """Pull the 'id' query parameter out of a listing URL."""
        uid = scrapy_utils.url_query(url)['id'][0]
        return arg_to_iter(uid)

    def extract_url(cell):
        """Return the href of the first link in a table cell, or ''."""
        links = cell.xpath('.//a')
        if links:
            url = links[0].attrib['href']
        else:
            url = ''
        return arg_to_iter(url)

    id_in = MapCompose(extract_url, extract_id)
    website_in = Compose(extract_url)
def parse_product(self, response):
    """Parse an osCommerce LEGO product page into one Product item.

    Skips (and records an error for) pages without a parseable price.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name = hxs.select('//td[@class="smallText"]/big/big/strong/text()').extract().pop().strip()
    # Breadcrumb: second-to-last entry is the category, last is the SKU.
    category = hxs.select('//td[@class="headerNavigationOP"]/a[@class="headerNavigation"]/text()')[-2].extract().strip()
    sku = hxs.select('//td[@class="headerNavigationOP"]/a[@class="headerNavigation"]/text()').pop().extract().strip()
    pid = self.get_pid_from_url(response.url)
    price = self.parse_price(hxs.select('//td[@class="pageHeading"]/big[1]/strong/text()').pop().extract())
    # "ANO" (= yes) marker indicates availability.
    stock = hxs.select('//td[@class="smallText"][@align="right"]/strong/big[contains(text(), "ANO")]/text()')
    if price:
        loader = ProductLoader(response=response, item=Product())
        # Drop the osCommerce session id from the URL.
        url = response.url.split('?osCsid')[0]
        loader.add_value('url', url)
        loader.add_value('name', name)
        loader.add_xpath('image_url', '//meta[@property="og:image"]/@content',
                         Compose(lambda v: urljoin(base_url, v[0])))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', 'LEGO')
        if not stock:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())
    else:
        self.errors.append("No price set for url: '%s'" % urljoin(base_url, response.url))
def parse_product(self, response):
    """Parse a Czech LEGO shop product page into one Product item."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(
        '//*[@id="detail_product"]//h1/text()').extract()[0].strip()
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//*[@id="image_content"]/a/img/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    price = hxs.select('//*[@id="real_price"]/text()').extract()
    # Strip the Kč currency suffix and CZ number formatting.
    price = extract_price(price[0].strip().replace(
        u' K\u010d', '').replace(',', '.').replace(' ', ''))
    loader.add_value('price', price)
    category = hxs.select(
        '//*[@id="breadcrumbs"]/a[@class="item"]/text()').extract()
    if category:
        loader.add_value('category', category[-1])
    # LEGO set number embedded in the product name is used as SKU.
    results = re.search(r"([\d]+)", name)
    if results:
        loader.add_value('sku', results.group(1))
    identifier = hxs.select('//p[@class="warehouse"]/a/@onclick').re(
        r"([\d]+)")[0]
    loader.add_value('identifier', identifier)
    availability = hxs.select(
        '//p[@class="warehouse"]/a/span/text()').extract()[0].strip()
    # "Není skladem" = out of stock, "U dodavatele" = at the supplier.
    if availability == u'Nen\xed skladem' or availability == u'U dodavatele':
        loader.add_value('stock', 0)
    else:
        results = re.search(r"([\d]+)", availability)
        if results:
            loader.add_value('stock', results.group(1))
    loader.add_value('brand', 'LEGO')
    yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Parse a product page, yield the Product, then crawl every product
    option variant (excluding the page itself)."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    category = hxs.select(
        u'//div[@id="breadcrumbs_sub"]/ol/li/a[@class="category"]/text()'
    ).extract()
    category = category[0] if category else ''
    image_url = hxs.select(
        u'//form[@id="projector_form"]//img[@itemprop="image"]/@src'
    ).extract()
    if image_url:
        image_url = urljoin_rfc(get_base_url(response), image_url[0])
    name = hxs.select(
        u'//div[@id="breadcrumbs"]//li[last()]/span/text()').extract()[0]
    # Append the active option title (e.g. colour/size) to the base name.
    name_option = hxs.select(
        u'//div[@class="product_section_sub"]/a[@title and contains(@class, "active")]/@title'
    )
    if name_option:
        name = "%s - %s" % (name.rstrip(), name_option.extract()[0].lstrip())
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_value('name', name.strip())
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('image_url', image_url)
    identifier = hxs.select(
        u'//form[@id="projector_form"]/input[@name="product"]/@value'
    ).extract()
    product_loader.add_value('identifier', identifier[0])
    price_xpath = '//div[@id="projector_price_value_wrapper"]/strong/span[@class="price"]/text()'
    product_loader.add_xpath('price', price_xpath)
    # Shipping: regex pulls the number, comma decimal separator fixed up.
    product_loader.add_xpath('shipping_cost',
                             '//div[@id="projector_shipping"]/span/text()',
                             TakeFirst(),
                             Compose(lambda v: v.replace(',', '.')),
                             re='([0-9.]+)')
    stock_option = hxs.select(
        u'//div[@id="projector_status_description"]/text()').extract()
    self.log("Stock option found %s" % stock_option, level=log.DEBUG)
    product_loader.add_value('stock', STOCK_MAP.get(stock_option[0], 0))
    yield product_loader.load_item()
    # parse product options
    more_products = hxs.select(
        u'//div[@class="product_section_sub"][1]/a[@title]/@href').extract(
        )
    # Exclude the current page's own URL from the option links.
    _, _, urlpath = response.url.partition('/product-pol')
    url_to_remove = "/product-pol%s" % urlpath
    final_more_products = list(set(more_products) - set([url_to_remove]))
    # parse product
    for product_url in final_more_products:
        product_url = urljoin_rfc(get_base_url(response), product_url)
        yield Request(product_url, callback=self.parse_product)
class SpeakerLoader(ItemLoader):
    """Loader for Speaker items: markup-stripped, joined text fields;
    image URLs kept as a list; speaker name run through a cleaner."""
    default_item_class = Speaker
    default_input_processor = MapCompose(remove_tags, unquote_markup,
                                         unicode.strip)
    default_output_processor = Join()
    # Keep every image URL rather than joining them into one string.
    image_urls_out = Identity()
    name_out = Compose(Join(), _cleanup_name)
def parse(self, response):
    """Crawl category/pagination links and yield one Product per listing
    entry, with price-banded shipping costs."""
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # categories
    urls = hxs.select('//*[@id="inleft_eshop"]//a/@href').extract()
    for url in urls:
        # Skip the spare-parts service category.
        if url != '/legacik/eshop/4-1-LEGO-SERVICE-suciastky':
            yield Request(urljoin_rfc(base_url, url), callback=self.parse)
    # pagination
    urls = hxs.select('//div[@class="pagination"]//a/@href').extract()
    for url in urls:
        yield Request(urljoin_rfc(base_url, url), callback=self.parse)
    # products
    # urls = hxs.select('//div[@class="productTitleContent"]/a/@href').extract()
    # for url in urls:
    #     yield Request(urljoin_rfc(base_url, url), callback=self.parse_product)
    products = hxs.select('//div[@class="productBody"]')
    category = hxs.select('//*[@id="wherei"]/p//a/text()').extract()
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/div[@class="productTitleContent"]/a/text()').extract(
            )[0].strip()
        url = product.select(
            './/div[@class="productTitleContent"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         './/div[@class="img_box"]/a/img[1]/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        price = product.select('.//*[@itemprop="price"]/text()').extract()
        try:
            price = extract_price_eu(price[0].strip())
        except:
            # Missing/unparseable price — record as zero-priced.
            price = Decimal('0.0')
        loader.add_value('price', price)
        if category:
            loader.add_value('category', category[-1])
        # LEGO set number embedded in the product name is used as SKU.
        results = re.search(r"\b([\d]+)\b", name)
        if results:
            loader.add_value('sku', results.group(1))
        identifier = product.select(
            './/div[@class="img_box"]/a/img[1]/@rel').extract()[0]
        loader.add_value('identifier', identifier)
        availability = product.select(
            './/div[@class="stock_no"]').extract()
        if availability or not price:
            loader.add_value('stock', 0)
        loader.add_value('brand', 'LEGO')
        # Shipping cost bands by price; free above the top band.
        if price <= 15:
            loader.add_value('shipping_cost', 2.80)
        elif price <= 29:
            loader.add_value('shipping_cost', 4.5)
        elif price <= 149:
            loader.add_value('shipping_cost', 4.99)
        else:
            loader.add_value('shipping_cost', 0)
        yield self.load_item_with_metadata(loader.load_item())
def parse_product(self, response):
    """Parse a product detail page into one Product item.

    Logs and returns early when no product title is present.  Stock is 0
    unless the status text contains "In Stock"; price defaults to 0 when
    no parseable price is found.

    Fixes: removed the unused local ``name_xpath`` (dead leftover from an
    earlier selector) and the redundant ``len(names) > 0`` check.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    names = hxs.select('//h1[@id="product_title"]/text()').extract()
    if names:
        name = names[0].strip()
    else:
        # product not found. Just continue
        self.log('WARNING: Product not found => %s' % response.url)
        return
    quantity = hxs.select('//p[@id="stock_status"]/text()').extract()
    # None = in stock (unknown quantity); 0 = out of stock.
    if quantity and "In Stock" in quantity.pop():
        quantity = None
    else:
        quantity = 0
    category = hxs.select(
        '//ul[@id="crumbs"]/li[@class="last"]/a/text()').extract()
    brand = hxs.select(
        '//div[@id="product_title_container"]/span[@class="secondary"]/text()'
    ).extract()
    loader = ProductLoader(response=response, item=Product())
    loader.add_value('url', urljoin(base_url, response.url))
    loader.add_value('name', name)
    loader.add_xpath('image_url', '//img[@id="main_image"]/@src',
                     TakeFirst(), Compose(lambda v: urljoin(base_url, v)))
    loader.add_xpath(
        'price',
        '//div[@class="product_price"]/span[@class="price"]/text()',
        TakeFirst(), re="([.0-9]+)")
    if not loader.get_output_value('price'):
        loader.add_value('price', 0)
    if category:
        loader.add_value('category', category[0].strip())
    # Trailing 3+ digit run in the name is used as the SKU.
    loader.add_value('sku', name, TakeFirst(), re='(\d\d\d+)\s*$')
    if brand:
        loader.add_value('brand', brand[0].strip())
    identifier = hxs.select('//input[@name="ProductID"]/@value').extract()
    if not identifier:
        identifier = hxs.select('//li[@itemprop="id"]/text()').extract()
    loader.add_value('identifier', identifier[0])
    if quantity == 0:
        loader.add_value('stock', 0)
    yield loader.load_item()
class PlayStoreItems(Item):
    """Item for a Google Play Store app detail page.

    Input processors strip whitespace from every scraped text line;
    single-valued fields take the first match on output.
    """
    app_id = Field(output_processor=TakeFirst())
    name = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())
    category_url = Field(output_processor=TakeFirst())
    # Normalise the buy-button label: drop " Buy", map "Install" -> "Free".
    price = Field(input_processor=Compose(lambda text: [
        line.strip().replace(" Buy", "").replace("Install", "Free")
        for line in text
    ]),
                  output_processor=TakeFirst())
    offers_in_app_purchases = Field(output_processor=TakeFirst())
    # Rating count is shown in parentheses — strip them off.
    stars_count = Field(input_processor=Compose(
        lambda text: [line.strip().strip("()") for line in text]),
                        output_processor=Join(''))
    video = Field(output_processor=TakeFirst())
    screenshots = Field()
    # Description keeps all lines (no output processor): stays a list.
    description = Field(input_processor=Compose(
        lambda text: [line.strip() for line in text]), )
    update_date = Field(output_processor=TakeFirst())
    file_size = Field(
        input_processor=Compose(lambda text: [line.strip() for line in text]),
        output_processor=TakeFirst())
    installs = Field(
        input_processor=Compose(lambda text: [line.strip() for line in text]),
        output_processor=TakeFirst())
    current_version = Field(
        input_processor=Compose(lambda text: [line.strip() for line in text]),
        output_processor=TakeFirst())
    requires_android = Field(
        input_processor=Compose(lambda text: [line.strip() for line in text]),
        output_processor=TakeFirst())
    offered_by = Field(output_processor=TakeFirst())
    offered_by_url = Field(output_processor=TakeFirst())
def parse_product(self, response):
    """Parse a PrestaShop LEGO product page into one Product item.

    LEGOLAND-branded products and pages without a price element are
    skipped (the latter via the IndexError on .pop()).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name = "".join(
        map(
            lambda x: x.strip(),
            hxs.select(
                '//div[@id="primary_block"]/h1/descendant-or-self::text()'
            ).extract()))
    if name.startswith("LEGOLAND"):
        return
    category = hxs.select('//div[@class="breadcrumb "]/a/text()').extract()
    if category:
        category = category.pop()
    else:
        category = ""
    pid = hxs.select('//input[@name="id_product"]/@value').extract()
    sku = hxs.select(
        '//label[@for="product_reference"]/following-sibling::span[1]/text()'
    ).extract()
    # Fall back to the product id when no reference; strip a "-lego"
    # suffix from the reference when present.
    if not sku:
        sku = pid
    elif sku[0].endswith("-lego"):
        sku = sku.pop()[0:-5]
    try:
        price = self.parse_price(
            hxs.select('//p[@class="our_price_display"]/strong/span/text()'
                       ).pop().extract())
    except IndexError:
        # No price element on the page — skip the product.
        return
    stock = hxs.select('//p[@id="pQuantityAvailable"]/span[@class="yes"]')
    if price:
        loader = ProductLoader(response=response, item=Product())
        loader.add_value('url', urljoin(base_url, response.url))
        loader.add_value('name', name)
        loader.add_xpath('image_url',
                         '//div[@id="image-block"]/span/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        loader.add_value('price', price)
        loader.add_value('category', category)
        loader.add_value('sku', sku)
        loader.add_value('identifier', pid)
        loader.add_value('brand', 'LEGO')
        if not stock:
            loader.add_value('stock', 0)
        yield self.load_item_with_metadata(loader.load_item())
    else:
        self.errors.append("No price set for url: '%s'" % urljoin(base_url, response.url))
class TimetableItem(Item):
    """Item for one airport timetable row (a single flight movement)."""
    airport = Field()
    flight_type = Field()
    flight = Field()
    airline = Field()
    airport_of_departure = Field()
    city_of_departure = Field()
    airport_of_arrival = Field()
    city_of_arrival = Field()
    flight_status = Field()
    # Timestamps: joined text parsed into datetime objects on output.
    datetime_scheduled = Field(output_processor=Compose(Join(), to_datetime))
    datetime_estimated = Field(output_processor=Compose(Join(), to_datetime),
                               default=None)
    datetime_actual = Field(output_processor=Compose(Join(), to_datetime),
                            default=None)
    terminal = Field()
    comment = Field()
    checkin_desk = Field(
        output_processor=Compose(Join(), checkin_desk_processor))
def parse_product(self, response):
    """Parse a Slovak LEGO shop product page into one Product item.

    Retries the request up to 10 times (via meta['retry']) when the name
    or price selector fails — the page occasionally renders incompletely.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    try:
        name = hxs.select(
            '//*[@id="product_title"]/text()').extract()[0].strip()
        price = hxs.select(
            '//div[contains(@class,"akciova-cena")]//div[@class="price-box"]/text()'
        ).extract()[0].strip()
    except:
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            new_meta = response.meta.copy()
            new_meta['retry'] = retry + 1
            yield Request(response.url,
                          meta=new_meta,
                          callback=self.parse_product,
                          dont_filter=True)
        return
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    loader.add_xpath('image_url',
                     '//div[@class="image"]//img[@class="product"]/@src',
                     Compose(lambda v: urljoin(base_url, v[0])))
    # Strip regular and non-breaking spaces used as thousands separators.
    price = extract_price(price.replace(' ', '').replace(u'\xa0', ''))
    loader.add_value('price', price)
    # "Značka" (= brand) row's sibling dt holds the category link.
    category = hxs.select(
        u'//*[@id="page-product-detail"]//div[@class="wrap_info"]//dl/dd/a[contains(@title, "Zna\u010dka")]/../preceding-sibling::dt/a/text()'
    ).extract()
    if category:
        loader.add_value('category', category[0])
    sku = hxs.select(
        '//*[@id="parametry"]/div/table/tbody/tr[2]/td/text()').extract()
    if not sku:
        # No SKU cell — fall back to the longest number in the name.
        sku = ''
        for match in re.finditer(r"([\d,\.]+)", name):
            if len(match.group()) > len(sku):
                sku = match.group()
    else:
        sku = sku[0]
    loader.add_value('sku', sku)
    # "kód produktu" (= product code) row supplies the identifier.
    identifier = hxs.select(
        u'//*[@id="page-product-detail"]//div[@class="wrap_info"]//dl/dd[contains(text(), "k\xf3d produktu")]/preceding-sibling::dt/text()'
    ).extract()[0]
    loader.add_value('identifier', identifier.strip())
    #availability = hxs.select(u'//*[@id="page-product-detail"]//div[@class="wrap_info"]//dl/dd[contains(text(), "dostupnos\u0165 eshop")]/preceding-sibling::dt/span/text()').extract()[0].strip()
    #if availability != u'na sklade v e-shope':
    #    loader.add_value('stock', 0)
    loader.add_value('brand', 'LEGO')
    if int(price) <= 100:
        loader.add_value('shipping_cost', 4)
    yield self.load_item_with_metadata(loader.load_item())
def parse_products(self, response):
    """Crawl category/pagination links and yield one Product per listing
    entry; out-of-stock products without an identifier are routed to
    parse_identifier.

    Fix: the page category was being normalised *inside* the product
    loop, overwriting the loop-invariant ``category`` list with a string
    on the first iteration — every subsequent product then got the first
    character of that string (e.g. 'L') as its category.  The
    normalisation is now hoisted out of the loop.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    #categories
    urls = hxs.select(
        '//div[@class="kategoriaNahlady"]//span[@class="nadpis"]/a/@href'
    ).extract()
    for url in urls:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_products)
    #pagination
    urls = hxs.select(
        '//div[@class="strankovanie-inner"]//a/@href').extract()
    for url in urls:
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_products)
    #products
    category = hxs.select('//*[@id="main"]/h1/text()').extract()
    # Drop the " - strana N" (page number) suffix from the page heading.
    category_name = category[0].partition(' - strana ')[0] if category else ''
    products = hxs.select(
        '//div[@class="produkty"][1]/div[@class="produkt"]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        name = product.select(
            './/div[@class="nazov"]/a/text()').extract()[0].strip()
        url = product.select('.//div[@class="nazov"]/a/@href').extract()[0]
        loader.add_value('url', urljoin_rfc(base_url, url))
        loader.add_value('name', name)
        loader.add_xpath('image_url', './/a[@class="obrazok"]/img/@src',
                         Compose(lambda v: urljoin(base_url, v[0])))
        price = product.select('.//div[@class="cena"]/text()').extract()
        price = extract_price_eu(price[0].strip())
        loader.add_value('price', price)
        # LEGO set number embedded in the product name is used as SKU.
        results = re.search(r"\b([\d]+)\b", name)
        if results:
            sku = results.group(1)
            loader.add_value('sku', sku)
        loader.add_value('brand', 'LEGO')
        availability = product.select(
            './/a[@class="kosik vypredane"]').extract()
        if availability:
            loader.add_value('stock', 0)
        if category_name:
            loader.add_value('category', category_name)
        identifier = product.select('.//a[@class="kosik"]/@href').extract()
        if identifier:
            identifier = identifier[0].partition('=')[2]
            loader.add_value('identifier', identifier)
            yield self.load_item_with_metadata(loader.load_item())
        else:
            #as we have no identifier for out of stock products we need to visit product page to extract it
            product = loader.load_item()
            yield Request(product['url'],
                          callback=self.parse_identifier,
                          meta={'product': product})
class RPostItemsLoader(ItemLoader):
    """Loader for Racing Post results: every field is coerced to unicode
    and stripped; several get a field-specific post-processor."""
    default_item_class = RpostResultsItem
    default_output_processor = Compose(TakeFirst(), unicode, unicode.strip)
    racename_out = Compose(Join(), unicode, unicode.strip)
    racetime_out = Compose(Join(), unicode, unicode.strip)
    # OR = official rating, TS = topspeed figure.
    rpOR_out = Compose(TakeFirst(), unicode, unicode.strip, processOR)
    rpTS_out = Compose(TakeFirst(), unicode, unicode.strip, processTS)
    # Prize money may contain non-ASCII currency symbols.
    prizemoney_out = Compose(TakeFirst(), unicode, unicode.strip, toascii)
    rphorseurl_out = Compose(TakeFirst(), unicode, unicode.strip)
class TramiteProyectoItemLoader(XPathItemLoader):
    """Loader for legislative bill proceedings (trámites): whitespace-fixed
    inputs, normalized chamber/expediente codes, ISO-formatted dates."""
    default_item_class = TramiteProyectoItem
    default_input_processor = MapCompose(fix_space, unicode.strip)
    default_output_processor = TakeFirst()
    proyecto_camara_origen_in = MapCompose(fix_space, unicode.strip,
                                           normalize_camara)
    proyecto_camara_origen_expediente_in = MapCompose(
        fix_space, unicode.strip, normalize_codigo_expediente)
    camara_in = MapCompose(fix_space, unicode.strip, normalize_camara)
    # Dates may legitimately be empty on some rows.
    fecha_in = MapCompose(fix_space, unicode.strip,
                          partial(spanish_date, allow_empty=True))
    fecha_out = Compose(lambda v: v[0].isoformat())
    index_in = MapCompose(digits_only)