def list_page(self, response):
    """Queue every house-detail link on this listing page, then follow
    the "next page" anchor (class ``aNxt``) if one is present."""
    detail_links = response.xpath('//div[@class="house-title"]/a/@href').extract()
    for link in detail_links:
        yield WebdriverRequest(link, callback=self.parse)
    # The "aNxt" anchor is absent on the final page, which ends pagination.
    pagination_href = response.xpath('//a[@class="aNxt"]/@href').extract_first()
    if pagination_href:
        yield WebdriverRequest(pagination_href, callback=self.list_page)
def parse_smzdm_list_page(self, response):
    """Parse a smzdm.com category list page.

    Queues a request for every item link not already in ``self.urls_seen``
    and follows the "page down" pagination link(s).  Expects
    ``response.meta['category']`` to be set by the requesting callback.
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        item_url_sel_list = sel.select(
            "/html/body/section//div[@class='listTitle']/h3[@class='itemName']/a/@href"
        )
        for item_url_sel in item_url_sel_list:
            item_url = item_url_sel.extract()
            # Deduplicate: only queue item pages we have not seen before.
            if item_url not in self.urls_seen:
                yield WebdriverRequest(item_url,
                                       meta={'category': category},
                                       callback=self.parse_smzdm_item_page)
        next_page_xpath = "//li[@class='pagedown']/a/@href"
        for next_page_url_sel in sel.select(next_page_xpath):
            yield WebdriverRequest(next_page_url_sel.extract(),
                                   meta={'category': category},
                                   callback=self.parse_smzdm_list_page)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        log.msg("Smzdm list page parse failed:\t[%s]" % (response.url),
                level=log.ERROR,
                spider=SmzdmSpider)
        # `return` ends the generator cleanly; `raise StopIteration`
        # inside a generator becomes RuntimeError under PEP 479.
        return
def list_page(self, response):
    """Follow each listing-title link, then advance pagination until the
    page number embedded in the next-page URL reaches 20."""
    for href in response.xpath('//ul//div[@class="title"]/a/@href').extract():
        yield WebdriverRequest(url=response.urljoin(href), callback=self.parse)
    # The currently-selected pager link has class "on"; its immediate
    # sibling is the next page.
    next_href = response.xpath(
        '//div[@class="page-box fr"]//a[@class="on"]/following-sibling::*[1]/@href'
    ).extract_first()
    # The page index lives at a fixed slice [14:-5] of the href; the
    # crawl is capped below page 20.
    if next_href and int(next_href[14:-5]) < 20:
        yield WebdriverRequest(url=response.urljoin(next_href),
                               callback=self.list_page)
def parse_feed(self, response):
    """
    Parses a json datafeed page on The Warehouse

    Designed for:
    http://www.thewarehouse.co.nz/red/catalog/gifting/gifts-for-him?JsonFlag=true

    Yields one ScraperProduct item per product, plus a follow-up request
    when the feed is paginated.
    """
    jsonresponse = json.loads(response.body_as_unicode())
    for product in jsonresponse['products']:
        l = ScraperProductLoader(item=ScraperProduct(), response=response)
        l.add_value('url', product['productUrl'])
        l.add_value('sku', product['productSku'])
        l.add_value('name', product['productName'])
        l.add_value('in_stock', product['derivedInStock'])
        l.add_value('description', product['productDescription'])
        # Bug fix: 'price' was previously added twice (before and after
        # image_urls), appending the value to the loader twice.
        l.add_value('price', product['price'])
        l.add_value("image_urls", [product['productImageUrl']])
        if product['productPriceInfo']:  # truthiness instead of len(...)
            l.add_value('attributes', product['productPriceInfo'])
        yield l.load_item()

    # If the feed continues on another page, follow it
    if jsonresponse.get('nextPageUrl'):
        yield WebdriverRequest(jsonresponse.get('nextPageUrl'),
                               callback=self.parse_feed)
def parse_start_url(self, response):
    """
    Handles any special parsing from start_urls. However, we mostly
    use it to handle pagination.

    This method is misleading as it actually cascades...
    """
    # Product page: disable crawl rules and parse the product directly.
    if self.is_product_page(response):
        self.rules = ()
        self._rules = []
        return self.parse_product(response)

    if response.url in self.visited:
        return []

    page_count = Selector(response).css('.pagePaginatorLabel').re_first(
        r'Page \d+ of (\d+)')
    if not page_count or int(page_count) <= 1:
        return []

    requests = []
    for page_id in xrange(int(page_count)):
        page_url = '{base}#pageId={page}'.format(base=response.url, page=page_id)
        self.visited.append(page_url)
        requests.append(WebdriverRequest(page_url))
    return requests
def parse_product(self, response):
    """
    Parses a product page on Gap.com.
    @url http://www.gap.com/browse/product.do?cid=64526&vid=1&pid=960079012
    @returns items 1 1
    @returns requests 0 0
    @scrapes url sku name price in_stock description image_urls attributes
    """
    sel = Selector(response)

    url_sel = sel.css('link[rel="canonical"]::attr(href)')
    # Product id embedded in the canonical URL, e.g. ".../P960079012.jsp".
    their_id = url_sel.re(r"/P(\d+).jsp")[0]  # raw string for the regex

    l = ScraperProductLoader(item=ScraperProduct(), response=response)
    l.add_value('url', url_sel.extract()[0])
    l.add_value('sku', their_id)
    l.add_css('name', '.productName::text')

    # Presence of product name determines product availability
    l.add_value('in_stock', bool(l.get_output_value('name')))

    l.add_css('description', '#tabWindow')

    attributes = {}
    sale_price = sel.css('#priceText .salePrice::text').extract_first()
    if not sale_price:
        l.add_css('price', '#priceText::text')
    else:
        # On sale: the struck-through value is the regular price.
        l.add_css('price', '#priceText strike::text')
        attributes['sales_price'] = sale_price

    # NOTE: a `category_name` lookup was removed here — its result was
    # never used, and its fallback CSS path could raise an uncaught
    # IndexError that killed the whole parse.
    l.add_value('attributes', attributes)

    # image urls are stored in the "productData" page
    url = self.product_data_url.format(their_id)
    request = WebdriverRequest(url, callback=self.images)

    # request contains other data in meta['item']
    request.meta['item'] = l.load_item()

    yield request  # crawl the other url
def parse_recommended_products(self, response):
    """Queue a product-page request for each 'similar product' link,
    tagging each request with the item carried in response.meta."""
    hrefs = Selector(response).css(
        'article.top-products .content>a::attr(href)').extract()
    for href in hrefs:
        req = WebdriverRequest(href, callback=self.parse_product)
        self.prep_product_tagging(req, response.meta.get('item'))
        yield req
def parse_sub2(self, response):
    """Queue a product request for every item-list link that is not
    already recorded in crawl history.

    Removed a dead `count` accumulator that was only read by
    commented-out debug code.
    """
    product_links = response.xpath(
        '//*[@class="item-list-block"]/a/@href').extract()
    for product_link in product_links:
        if not self.dbutils.check_history_exist(product_link):
            yield WebdriverRequest(product_link, callback=self.parse_product)
def parse_category_full_page(self, response):
    """Queue sub-category requests from the slick carousel and from the
    sale-banner list.

    Fix: the original re-ran an absolute XPath on every iteration and
    indexed it with a manual counter — O(n^2), and it misaligned (or
    raised IndexError) whenever a container lacked an <a>.  Each link is
    now extracted relative to its own container node.
    """
    for slide in response.xpath('//*[@class="slick-track"]/div'):
        sub2_link = slide.xpath('./a/@href').extract_first()
        if sub2_link:
            yield WebdriverRequest(sub2_link, callback=self.parse_sub2)

    for banner in response.xpath(
            '//*[@role="saleBannersList"]/div/*[@role="saleBannerContainer"]'):
        sub2_link = banner.xpath('./a/@href').extract_first()
        if sub2_link:
            yield WebdriverRequest(sub2_link, callback=self.parse_sub2)
def parse_recipe(self, response):
    """Parse a Sur La Table recipe page into a ScraperImage item.

    Yields the loaded item (or, unless ``self.skip_images``, a follow-up
    request for the recipe-image XML carrying the item in meta), then a
    product-page request for every product tagged on the recipe.
    """
    if not self.is_recipe_page(response):
        self.logger.warning(u"Unexpectedly not a recipe page: {}".format(response.request.url))
        return

    # Recipe URLs look like .../product/REC-<id>[/...]; scheme optional.
    recipe_id = re.match(
        r'(?:http://|https://)?www\.surlatable\.com/product/REC-(\d+)(/.*)?',
        response.url).group(1)

    sel = Selector(response)

    l = ScraperContentLoader(item=ScraperImage(), response=response)
    l.add_value('force_skip_tiles', self.skip_tiles)
    l.add_value('original_url', unicode(response.request.url))
    l.add_value('source', 'Sur La Table')
    l.add_css('name', 'h1.name::text')
    l.add_css('description', '#recipedetail .story')

    item = l.load_item()

    self.handle_product_tagging(response, item, content_id=recipe_id)

    if self.skip_images:
        yield item
    else:
        # Continue to XML data to get recipe image.  The ':'-separated id
        # of .fluid-display encodes the path components of the image XML.
        # NOTE(review): extract_first() may return None here, in which
        # case .split(':') raises AttributeError — confirm every recipe
        # page carries a .fluid-display element.
        magic_values = sel.css('.fluid-display::attr(id)').extract_first().split(':')
        xml_path = u'/images/customers/c{1}/{2}/{2}_{3}/pview_{2}_{3}.xml'.format(*magic_values)
        request = WebdriverRequest(self.root_url + xml_path,
                                   callback=self.parse_one_image)
        # Pass the partially-built item along to the image callback.
        request.meta['item'] = item
        yield request

    # Scrape tagged products
    url_paths = sel.css('.productinfo .itemwrapper>a::attr(href)').extract()
    for url_path in url_paths:
        request = WebdriverRequest(self.root_url + url_path,
                                   callback=self.parse_product)
        self.prep_product_tagging(request, item)
        yield request
def start_requests(self):
    """Queue the EWTN daily-readings pages for 2016-12-24 through
    2016-12-31.

    The original hard-coded eight URLs that differ only in the day
    number; they are generated from a template instead (same URLs, same
    order).
    """
    url_template = 'http://www.ewtn.com/daily-readings/?date=2016-12-%d'
    for day in range(24, 32):
        yield WebdriverRequest(url=url_template % day, callback=self.parse)
def parse(self, response):
    """Re-issue the current URL through webdriver so the page is fully
    rendered, then hand off to the real page handler."""
    yield WebdriverRequest(response.url, method="GET",
                           callback=self._spider_page)
def parse_smzdm_item_page(self, response):
    """Parse a smzdm.com item page into a SmzdmItem.

    Extracts the title (minus its <span> "attachments", which carry the
    price/deal text), description, description images and the
    vote/favourite/comment counters.  If the page links out to a shop, a
    follow-up request to parse_shopping_item_page is yielded first.

    Expects ``response.meta['category']`` set by the list-page callback.
    """
    try:
        category = response.meta["category"]
        sel = WebdriverXPathSelector(response)
        title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
        attachment_sel_list = sel.select(
            '/html/body/section/div[1]/article/h1/span')
        if len(title_sel_list):
            title = self.normalize_text(title_sel_list[0].extract())
            item_name = title
        else:
            # No <h1>: unexpected page layout — abort this page.
            log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                    level=log.ERROR,
                    spider=SmzdmSpider)
            raise StopIteration
        # The <span> children of the title hold price/deal annotations.
        # Strip them out of the item name and keep them for price parsing.
        all_attachment = ''
        for attachment_sel in attachment_sel_list:
            attachment = attachment_sel.extract()
            item_name = item_name.replace(attachment, '')
            all_attachment += attachment
        price, currency = self.parse_price(all_attachment)
        item_shopping_url_sel_list = sel.select(
            "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
        if len(item_shopping_url_sel_list):
            # Follow the outbound shop link so the store page is scraped
            # too; the smzdm URL travels along as the referer.
            item_shopping_url = item_shopping_url_sel_list[0].extract()
            yield WebdriverRequest(item_shopping_url,
                                   meta={'referer': response.url},
                                   callback=self.parse_shopping_item_page)
        description_sel_list = sel.select(
            '/html/body/section/div[1]/article/div[2]/p[@itemprop="description"]'
        )
        description = ''
        img_src_list = []
        for description_sel in description_sel_list:
            description += self.normalize_text(description_sel.extract())
            img_src_sel_list = description_sel.select(".//img/@src")
            for img_src_sel in img_src_sel_list:
                img_src_list.append(img_src_sel.extract())
        # Counter widgets are optional; default each to 0 when the node
        # is missing or its text is not an integer.
        try:
            worthy_vote = int(
                self.get_text_by_xpath(
                    sel, "//span[@id='rating_worthy_num']/text()"))
        except:
            worthy_vote = 0
        try:
            unworthy_vote = int(
                self.get_text_by_xpath(
                    sel, "//span[@id='rating_unworthy_num']/text()"))
        except:
            unworthy_vote = 0
        try:
            favorite_count = int(
                self.get_text_by_xpath(sel, "//a[@class='fav']/em/text()"))
        except:
            favorite_count = 0
        try:
            comment_count = int(
                self.get_text_by_xpath(sel,
                                       "//a[@class='comment']/em/text()"))
        except:
            comment_count = 0
        yield items.SmzdmItem(title=item_name, price=price, url=response.url, description=description, \
            image_urls=img_src_list, worthy_vote=worthy_vote, unworthy_vote=unworthy_vote, \
            favorite_count=favorite_count, comment_count=comment_count, category=category, currency=currency)
    except:
        # NOTE(review): bare except + `raise StopIteration` inside a
        # generator is a RuntimeError under PEP 479 (Python 3.7+); it
        # works on the Python 2 runtime this code targets.
        log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR,
                spider=SmzdmSpider)
        raise StopIteration
def parse_action(self, response):
    """Append the `wr=0` marker to the current URL and fetch it with a
    no-op callback."""
    marked_url = '%s&wr=%d' % (response.url, 0)
    yield WebdriverRequest(marked_url, callback=self.parse_nothing)
def start_requests(self):
    """Issue a webdriver request for each configured (title, url) pair.

    The title half of each pair is not used here; it is explicitly
    discarded instead of binding an unused name.
    """
    for _title, url in self.url:
        yield WebdriverRequest(url, callback=self.parse)
def login(self):
    """Open the Amazon associates landing page; the response is handled
    by submit_login_info, which performs the actual form submission."""
    login_url = 'http://associates.amazon.cn/gp/associates/network/main.html'
    return WebdriverRequest(login_url, callback=self.submit_login_info)
def start_requests(self):
    """Yield paired requests against the test domain: one through
    webdriver (?wr=) and one plain (?r=) for each index 0 and 1."""
    for idx in xrange(2):
        webdriver_url = 'http://testdomain/path?wr=%d' % idx
        plain_url = 'http://testdomain/path?r=%d' % idx
        yield WebdriverRequest(webdriver_url)
        yield Request(plain_url)
def parse(self, response):
    """Follow every company link found in the main stock table."""
    links = response.css('#MainContent_tStock h3 a::attr("href")')
    for link in links:
        company_url = urljoin(response.url, link.extract())
        yield WebdriverRequest(company_url, callback=self.parse_company)
def start_requests(self):
    """Seed the crawl: one webdriver list-page request per start URL."""
    for start_url in self.start_urls:
        yield WebdriverRequest(url=start_url, callback=self.list_page)
def parse_shopping_item_page(self, response):
    """Parse an external shop page reached from a smzdm item, compare the
    live price against a target, and — on amazon.cn — attempt a one-click
    order.

    Scraping XPaths are selected by matching the post-redirect URL
    against the per-site patterns in ``self.__url_pattern_xpath_dict``.
    JD interstitial "jump" pages are followed recursively.

    Expects ``response.meta['referer']`` (the smzdm item URL) and
    ``response.meta['target_price']``.

    NOTE(review): this method clicks order buttons as a side effect —
    it is not a pure scraper.
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        target_price = response.meta["target_price"]
        # JD sometimes serves an interstitial page whose real target sits
        # behind this link; follow it and re-enter this callback.
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            comment_list = []
            description = ""
            title = ""
            vote_count = ""
            vote_score = ""
            price = -1.0  # sentinel meaning "price not found"
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            # Use the webdriver URL: redirects may have changed it.
            url = response.webdriver.current_url
            hostname = urlparse(url).hostname
            # Ordering is only implemented for amazon.cn.
            if hostname != "www.amazon.cn":
                log.msg("Shopping robot does not support this site",
                        level=log.INFO,
                        spider=SmzdmSpider)
                return
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list, comment_xpath,
                              vote_count_xpath, vote_score_xpath
                              ) in self.__url_pattern_xpath_dict.items():
                if url_pattern.match(url):
                    log.msg("Shopping url pattern is found",
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    title_sel_list = sel.select(title_xpath)
                    if len(title_sel_list):
                        title = self.normalize_text(
                            title_sel_list[0].extract())
                    else:
                        log.msg("Shopping page error:\ttitle is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        raise StopIteration
                        # NOTE(review): unreachable — the raise above
                        # always exits first.
                        continue
                    price_sel_list = sel.select(price_xpath)
                    if len(price_sel_list):
                        price_text = price_sel_list[0].extract()
                        # Strip currency symbols / junk around the number.
                        price_text = price_redudant_pattern.sub(
                            '', price_text)
                        try:
                            price = float(price_text)
                            # kiddies24.de reports prices in cents.
                            if url.startswith("http://www.kiddies24.de"):
                                price /= 100
                            # Abort if more than 5% above the target price.
                            if (price - target_price) / target_price > 0.05:
                                log.msg(
                                    "Price is not ideal. (current price: %f, target price: %f)"
                                    % (price, target_price),
                                    level=log.INFO,
                                    spider=SmzdmSpider)
                                return
                        except:
                            traceback.print_exc()
                            log.msg(
                                "Shopping page error:\tThis item is sold out, the price is %s"
                                % (price),
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    else:
                        log.msg("Shopping page error:\tprice is not found",
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    # First usable image among the configured title-image
                    # XPaths becomes the title image.
                    title_img_sel_list = []
                    for title_img_xpath in title_img_xpath_list:
                        title_img_sel_list += sel.select(title_img_xpath)
                    title_img_src = ""
                    for title_img_sel in title_img_sel_list:
                        title_img_src = title_img_sel.extract()
                        if title_img_src:
                            img_src_list.append(title_img_src)
                            break
                    if hostname == "item.jd.com":
                        # JD renders comments lazily: click the comments
                        # tab and give the page time to load them.
                        # sel.select_script("arguments[0].scrollIntoView(true);", sel.webdriver.find_element_by_xpath("//div[@id='comment-0']"))
                        # sel.select_script("arguments[0].scrollIntoView(true);", sel.webdriver.find_element_by_xpath("//div[@id='comment-2']"))
                        # sel.webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        sel.webdriver.find_element_by_xpath(
                            "//li[@id='detail-tab-comm']/a").click()
                        time.sleep(2)
                    for comment_sel in sel.select(comment_xpath):
                        comment_list.append(comment_sel.extract())
                    vote_count_sel_list = sel.select(vote_count_xpath)
                    if len(vote_count_sel_list):
                        vote_count = vote_count_sel_list[0].extract()
                    else:
                        log.msg(
                            "Shopping page error:\tvote count is not found",
                            level=log.ERROR,
                            spider=SmzdmSpider)
                    vote_score_sel_list = sel.select(vote_score_xpath)
                    if len(vote_score_sel_list):
                        vote_score = vote_score_sel_list[0].extract()
                    else:
                        log.msg(
                            "Shopping page error:\tvote score is not found",
                            level=log.ERROR,
                            spider=SmzdmSpider)
                    log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                            (title, description, price, url, referer),
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    yield items.ShoppingItem(title=title, price=price, url=url, referer=referer, image_urls=img_src_list, \
                        title_image_url=title_img_src, description=description, currency=currency, \
                        comment_list=comment_list, vote_count=vote_count, vote_score=vote_score)
                    # Price acceptable: try to place a one-click order.
                    log.msg("Place the order!",
                            level=log.INFO,
                            spider=SmzdmSpider)
                    sel = WebdriverXPathSelector(response)
                    one_click_button_list = sel.select(
                        '//*[@id="one-click-button"]')
                    if not one_click_button_list:
                        # One-click ordering not yet enabled for this
                        # account/session: click the enable control via an
                        # action request and re-enter this callback.
                        log.msg("Need to enable one click order!",
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        referer = response.meta["referer"]
                        enable_one_click_url_sel = response.webdriver.find_elements_by_xpath(
                            '//*[@id="oneClickSignIn"]')
                        if enable_one_click_url_sel:
                            # enable_one_click_url = enable_one_click_url_sel[0].extract()
                            log.msg("Enable one click order",
                                    level=log.DEBUG,
                                    spider=SmzdmSpider)
                            # enable_one_click_url_sel[0].click()
                            yield WebdriverActionRequest(response, \
                                actions=ActionChains(response.webdriver).click(enable_one_click_url_sel[0]), \
                                meta={'referer': referer}, \
                                callback=self.parse_shopping_item_page)
                    else:
                        log.msg("One click order!",
                                level=log.INFO,
                                spider=SmzdmSpider)
                        # Side effect: this actually places the order.
                        one_click_button_list[0].click()
                        # self.order_item(response)
                        # time.sleep(1)
    except:
        traceback.print_exc()
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR,
                spider=SmzdmSpider)
        raise StopIteration
def parse_product(self, response):
    """Parse a Sur La Table product page into a ScraperProduct item.

    Tries three strategies to find a SKU, extracts regular/sale prices,
    and — unless ``self.skip_images`` — follows up with a request for the
    product-image XML (the item travels in request.meta).
    """
    if not self.is_product_page(response):
        self.logger.warning(u"Unexpectedly not a product page: {}".format(response.request.url))
        return

    attributes = {}

    sel = Selector(response)

    l = ScraperProductLoader(item=ScraperProduct(), response=response)
    l.add_value('force_skip_tiles', self.skip_tiles)
    l.add_css('name', 'h1#product-title::text')
    l.add_css('description', '#product-description div::text')
    l.add_css('details', '#product-moreInfo-features li')

    # If the page doesn't have a sku, the product doesn't exist
    sku = ''
    try:
        # Try to find the SKU directly, does not work for products with multiple sizes
        sku = sel.css('#product-sku span[itemprop="sku"]::text').extract()[0].strip()
    except (IndexError, AttributeError):
        pass
    if not sku:
        try:
            # could be a color option
            sku = sel.css('#product #product-options a[data-sku]::attr(data-sku)').extract()[0]
        except (IndexError, AttributeError):
            pass
    if not sku:
        try:
            # Product ID usually of form: 'PRO-1220433'
            prod_id = sel.css('#productId::attr(value)').extract()[0]
            sku = re.search(r'\d+', prod_id).group()
        except (IndexError, AttributeError):
            # An item with a missing sku will not validate
            pass
    l.add_value('sku', unicode(sku))

    # prices are sometimes in the forms:
    #   $9.95
    #   $9.95 - $48.96
    #   $99.96 Sugg. $1,860.00 | You save 46%
    price_range = sel.css('meta[property="eb:pricerange"]::attr(content)').extract_first()
    if price_range:
        attributes['price_range'] = price_range
    try:
        # NOTE(review): extract_first() returning None raises
        # AttributeError here, which is NOT caught below (only
        # IndexError is) — confirm the selector always matches.
        price = sel.css('.product-priceMain span.hide::text').extract_first().split('-')[0]
        sugg_price = sel.css('.product-priceInfo #product-priceList span::text').extract_first()
        if sugg_price:
            reg_price = sugg_price.split('-')[0]  # Sometimes "$9.95 - $48.96"
            sale_price = price
        else:
            reg_price = price
            sale_price = None
    except IndexError:
        reg_price = u'$0.00'
        sale_price = None

    l.add_value('in_stock', bool(not self.is_sold_out(response)))
    l.add_value('price', unicode(reg_price))
    l.add_value('sale_price', unicode(sale_price) if sale_price else None)
    l.add_value('attributes', attributes)
    l.add_value('url', unicode(response.request.url))

    item = l.load_item()

    # If this is a similar_product and tagged_product, handle it
    self.handle_product_tagging(response, item)

    if self.skip_images:
        yield item
    else:
        # Full-sized Sur La Table image URLs found in a magical XML file.
        try:
            magic_values = sel.css('.fluid-display::attr(id)').extract_first().split(':')
            xml_path = u"/images/customers/c{1}/{2}/{2}_{3}/pview_{2}_{3}.xml".format(*magic_values)
            request = WebdriverRequest(self.root_url + xml_path,
                                       callback=self.parse_product_images)
            # Carry the finished item to the image callback.
            request.meta['item'] = item
            yield request
        except IndexError:
            # No image XML reference on the page: emit the item as-is.
            yield item
def start_requests(self):
    """Initialise logging and the config-driven XPath tables, log in to
    Amazon, then seed list-page requests for every configured category.

    Bug fix: the Amazon login WebdriverRequest used to be constructed
    and silently discarded, so the crawl never logged in even though
    "Login Amazon success" was logged.  The request is now yielded into
    the scheduler.
    """
    log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)

    # Price/title parsing patterns for smzdm item pages.
    smzdm_config = ConfigParser.RawConfigParser()
    smzdm_config.read("configure/smzdm.ini")
    self.price_pattern = re.compile(
        smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
    self.usd_price_pattern = re.compile(
        smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
    self.jpy_price_pattern = re.compile(
        smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
    self.eur_price_pattern = re.compile(
        smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
    self.head_separator = smzdm_config.get(
        "item_page", "head_separator_pattern").decode("utf-8")
    self.attachment_pattern = re.compile(
        smzdm_config.get("item_page", "attachment_pattern").decode("utf-8"))

    # Per-site scraping XPaths, keyed by compiled URL pattern.
    config_file_name = "configure/shopping_page.ini"
    shopping_config = ConfigParser.RawConfigParser()
    shopping_config.read(config_file_name)
    for section_name in shopping_config.sections():
        log.msg("Supported url pattern:\t%s" % (shopping_config.get(
            section_name, "url_pattern").decode('utf8')),
                level=log.DEBUG,
                spider=SmzdmSpider)
        url_pattern = re.compile(
            shopping_config.get(section_name, "url_pattern").decode('utf8'))
        title_xpath = shopping_config.get(section_name, "title_xpath")
        price_xpath = shopping_config.get(section_name, "price_xpath")
        price_redudant_pattern = re.compile(
            shopping_config.get(section_name,
                                "price_redudant_pattern").decode('utf8'))
        description_xpath = shopping_config.get(section_name,
                                                "description_xpath")
        description_img_xpath = shopping_config.get(
            section_name, "description_img_xpath")
        currency = shopping_config.get(section_name, "currency")
        title_img_xpath_list = shopping_config.get(
            section_name, "title_img_xpath").split(",")
        comment_xpath = shopping_config.get(section_name, "comment_xpath")
        vote_count_xpath = shopping_config.get(section_name,
                                               "vote_count_xpath")
        vote_score_xpath = shopping_config.get(section_name,
                                               "vote_score_xpath")
        self.__url_pattern_xpath_dict[url_pattern] = (
            title_xpath, price_xpath, price_redudant_pattern,
            description_xpath, description_img_xpath, currency,
            title_img_xpath_list, comment_xpath, vote_count_xpath,
            vote_score_xpath)

    log.msg("Start requests", level=log.INFO, spider=SmzdmSpider)
    log.msg("Login Amazon", level=log.INFO, spider=SmzdmSpider)
    # Bug fix: this request was previously created and dropped on the
    # floor; it must be yielded for the login flow to run at all.
    yield WebdriverRequest(
        'https://www.amazon.cn/ap/signin?_encoding=UTF8&openid.assoc_handle=cnflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.cn%2F%3Fie%3DUTF8%26ref_%3Dnav_ya_signin',
        callback=self.submit_login_info)
    log.msg("Login Amazon success", level=log.INFO, spider=SmzdmSpider)
    time.sleep(10)  # crude wait for the login session to settle

    for (category, urls) in smzdm_config.items("category"):
        if category == "all_post":
            # Historical post archive: walk a fixed page window.
            for url in urls.split(","):
                for page_num in range(1700, 1800):
                    list_url = "%s/p%d" % (url, page_num)
                    yield WebdriverRequest(
                        list_url,
                        meta={'category': category},
                        callback=self.parse_smzdm_post_list_page)
        else:
            for url in urls.split(","):
                yield WebdriverRequest(url,
                                       meta={'category': category},
                                       callback=self.parse_smzdm_list_page)
def parse_shopping_item_page(self, response):
    """Parse an external shop page reached from a smzdm item and yield a
    ShoppingItem.

    Scraping XPaths are selected by matching the (post-redirect) URL
    against the per-site patterns in ``self.__url_pattern_xpath_dict``.
    JD interstitial "jump" pages are followed recursively.

    Expects ``response.meta['referer']`` (the smzdm item URL).
    """
    try:
        sel = WebdriverXPathSelector(response)
        referer = response.meta["referer"]
        # JD sometimes serves an interstitial page whose real target sits
        # behind this link; follow it and re-enter this callback.
        jd_jump_url_sel = sel.select(
            "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
        if jd_jump_url_sel:
            log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                   meta={'referer': referer},
                                   callback=self.parse_shopping_item_page)
        else:
            img_src_list = []
            description = ""
            title = ""
            price = -1.0  # sentinel meaning "price not found"
            log.msg("Shopping url: %s" % (response.url),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            log.msg("Real shopping url: %s" % (response.webdriver.current_url),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            # Use the webdriver URL: redirects may have changed it.
            url = response.webdriver.current_url
            for url_pattern, (title_xpath, price_xpath,
                              price_redudant_pattern, description_xpath,
                              description_img_xpath, currency,
                              title_img_xpath_list
                              ) in self.__url_pattern_xpath_dict.items():
                if url_pattern.match(url):
                    log.msg("Shopping url pattern is found",
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    title_sel_list = sel.select(title_xpath)
                    if len(title_sel_list):
                        title = self.normalize_text(
                            title_sel_list[0].extract())
                    else:
                        log.msg("Shopping page error:\ttitle is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        raise StopIteration
                        # NOTE(review): unreachable — the raise above
                        # always exits first.
                        continue
                    price_sel_list = sel.select(price_xpath)
                    if len(price_sel_list):
                        price_text = price_sel_list[0].extract()
                        # Strip currency symbols / junk around the number.
                        price_text = price_redudant_pattern.sub(
                            '', price_text)
                        try:
                            price = float(price_text)
                            # kiddies24.de reports prices in cents.
                            if url.startswith("http://www.kiddies24.de"):
                                price /= 100
                        except:
                            log.msg(
                                "Shopping page error:\tThis item is sold out, the price is %s"
                                % (price),
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    else:
                        log.msg("Shopping page error:\tprice is not found",
                                level=log.WARNING,
                                spider=SmzdmSpider)
                    # First usable image among the configured title-image
                    # XPaths becomes the title image.
                    title_img_sel_list = []
                    for title_img_xpath in title_img_xpath_list:
                        title_img_sel_list += sel.select(title_img_xpath)
                    title_img_src = ""
                    for title_img_sel in title_img_sel_list:
                        title_img_src = title_img_sel.extract()
                        if title_img_src:
                            img_src_list.append(title_img_src)
                            break
                    # A large disabled block used to live here: it
                    # switched into Amazon's description iframe, scrolled
                    # each description <img> into view on a watchdog
                    # thread, and collected description text/image srcs.
                    # It was commented out (presumably for stability) and
                    # `description` is therefore always empty.
                    log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                            (title, description, price, url, referer),
                            level=log.DEBUG,
                            spider=SmzdmSpider)
                    yield items.ShoppingItem(title=title, price=price,
                                             url=url, referer=referer,
                                             image_urls=img_src_list,
                                             title_image_url=title_img_src,
                                             description=description,
                                             currency=currency)
    except:
        # NOTE(review): bare except + `raise StopIteration` inside a
        # generator is a RuntimeError under PEP 479 (Python 3.7+); it
        # works on the Python 2 runtime this code targets.
        log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                level=log.ERROR,
                spider=SmzdmSpider)
        raise StopIteration
def parse(self, response):
    """Hand the current URL to a webdriver-backed request so the
    JS-rendered search results can be parsed."""
    js_request = WebdriverRequest(response.url,
                                  callback=self.parse_search_with_js)
    yield js_request
def start_requests(self):
    """Initialise logging and the config-driven XPath tables, then seed
    eight fixed smzdm category list pages (youhui + haitao for stroller,
    car seat, lego and backpack).
    """
    log.start(logfile=self.__smzdm_log_file, loglevel='INFO', logstdout=False)

    # Price/title parsing patterns for smzdm item pages.
    smzdm_config = ConfigParser.RawConfigParser()
    smzdm_config.read("configure/smzdm.ini")
    self.price_pattern = re.compile(
        smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
    self.usd_price_pattern = re.compile(
        smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
    self.jpy_price_pattern = re.compile(
        smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
    self.eur_price_pattern = re.compile(
        smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
    self.head_separator = smzdm_config.get(
        "item_page", "head_separator_pattern").decode("utf-8")
    self.attachment_pattern = re.compile(
        smzdm_config.get("item_page", "attachment_pattern").decode("utf-8"))

    # Per-site scraping XPaths, keyed by compiled URL pattern.
    config_file_name = "configure/shopping_page.ini"
    shopping_config = ConfigParser.RawConfigParser()
    shopping_config.read(config_file_name)
    for section_name in shopping_config.sections():
        log.msg("Supported url pattern:\t%s" % shopping_config.get(
            section_name, "url_pattern").decode('utf8'),
                level=log.DEBUG,
                spider=SmzdmSpider)
        url_pattern = re.compile(
            shopping_config.get(section_name, "url_pattern").decode('utf8'))
        title_xpath = shopping_config.get(section_name, "title_xpath")
        price_xpath = shopping_config.get(section_name, "price_xpath")
        price_redudant_pattern = re.compile(
            shopping_config.get(section_name,
                                "price_redudant_pattern").decode('utf8'))
        description_xpath = shopping_config.get(section_name,
                                                "description_xpath")
        description_img_xpath = shopping_config.get(
            section_name, "description_img_xpath")
        currency = shopping_config.get(section_name, "currency")
        title_img_xpath_list = shopping_config.get(
            section_name, "title_img_xpath").split(",")
        self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
            price_xpath, price_redudant_pattern, description_xpath,
            description_img_xpath, currency, title_img_xpath_list)

    # NOTE(review): the return value (an iterable of requests) is
    # discarded here, so this call is effectively a no-op — confirm
    # whether start_urls requests were meant to be yielded too.
    CrawlSpider.start_requests(self)

    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/yingertuiche/youhui/p1',
        meta={'category': 'stroller'},
        callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1',
        meta={'category': 'car_seat'},
        callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1',
                           meta={'category': 'lego'},
                           callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1',
        meta={'category': 'backpack'},
        callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/yingertuiche/haitao/p1',
        meta={'category': 'stroller'},
        callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1',
        meta={'category': 'car_seat'},
        callback=self.parse_smzdm_list_page)
    yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1',
                           meta={'category': 'lego'},
                           callback=self.parse_smzdm_list_page)
    yield WebdriverRequest(
        'http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1',
        meta={'category': 'backpack'},
        callback=self.parse_smzdm_list_page)
def parse(self, response):
    """Forward the landing URL to the full category-page parser."""
    current_url = response.url
    yield WebdriverRequest(current_url, callback=self.parse_category_full_page)