def parse(self, response):
    hxs = HtmlXPathSelector(response)
    file_codehaus = open("./codehausJira.txt", 'w')
    for i in range(1, 100):
        # Check that the item exists, or else we are out of bounds
        div = '/html/body/div/section/div/div[2]/div[' + str(i) + ']'
        getdiv = hxs.select(div)
        if getdiv:
            for j in range(1, 300):
                # Project name lazy XPath
                name = '/html/body/div/section/div/div[2]/div[' + str(i) + ']/div[2]/div/table/tbody/tr[' + str(j) + ']/td/a'
                # Project key lazy XPath
                key = '/html/body/div/section/div/div[2]/div[' + str(i) + ']/div[2]/div/table/tbody/tr[' + str(j) + ']/td[2]'
                project_name = hxs.select(name).extract()
                project_key = hxs.select(key).extract()
                if project_name and project_key:
                    # Crappy regexps, but they work
                    m = re.match(r".*\">(.*)</a>(.*)\">(.*)</a>(.*)\">(.*)</a>.*", str(project_name))
                    n = re.match(r".*\\n\s*(\w+)\\n.*", str(project_key))
                    if m and n:
                        file_codehaus.write('"' + str(n.group(1)) + '", "' + str(m.group(1)) + '", ')
                else:
                    # no more rows in this table
                    break
    file_codehaus.close()
def detail(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_price = hxs.select("//div[@class='fleft catbox pricerate']//span/text()").extract()
    variants_seller = hxs.select("//div[@class='catbox fleft storeimage']/img/@alt").extract()
    quantitylist = []
    pricelist = []
    items = []
    # extract() always returns a list, so a plain truthiness check is enough
    if variants_price and variants_seller:
        for price, seller in zip(variants_price, variants_seller):
            item = BillionPricesIndiaItem()
            item['date'] = time.strftime("%d/%m/%Y")
            item['vendor'] = seller.split(" ")[-1]
            item['product'] = response.url.split('/')[-1].split(".")[0]
            itemprice = re.sub('[,]', '', price).split(" ")[-1]
            item['category'] = "mobiles"
            item['price'] = float(itemprice)
            item['quantity'] = '1'
            item['measure'] = 'pcs'
            item['unitprice'] = float(itemprice)
            items.append(item)
    return items
def browse_and_parse(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url not in self.navig_url_set:
            self.navig_url_set.add(subsubcat_url)
            yield Request(subsubcat_url, callback=self.browse_and_parse)
    next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href")
    if next_page:
        yield Request(next_page[0].extract(), callback=self.browse_and_parse)
    # parse product listing in this page, if any
    for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_value('url', tr.select(".//td[2]//a/@href").extract()[0])
        product_loader.add_value('name', tr.select(".//td[2]//a/text()").extract()[0])
        product_loader.add_value('price', tr.select(".//td[3]/text()").extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield product_loader.load_item()
    # edge case: product listing page with a single product
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        # this product listing page contains a single product
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value('price', product_price[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield product_loader.load_item()
def parse_game(self, response):
    self.log("Found game page %s" % response.url)
    hxs = HtmlXPathSelector(response)
    clues = hxs.select('//td[@class="clue"]')
    jitems = []
    game = first(hxs.select('//div[@id="game_title"]/h1/text()').extract())
    cats = hxs.select('//td[@class="category_name"]/text()').extract()
    self.log(game)
    for clue in clues:
        jitem = JarchiveItem()
        found = clue.select('table/tr/td/div/@onmouseover').extract()
        if len(found) > 0:
            clueinfo = first(clue.select('.//td[@class="clue_text"]/@id').extract()).split("_")
            round = clueinfo[1]
            cluecol = int(clueinfo[2]) - 1
            if round == "DJ":
                cluecol += 6
            togglebox = found[0].split("', '")
            cr = HtmlXPathSelector(text=togglebox[2]).select(".//em[@class='correct_response']/text()")
            cr = first(cr.extract())
            v = first(clue.select('.//td[@class="clue_value"]/text()').extract())
            if v:
                v = v[1:]  # strip the leading "$"
            c = first(clue.select('.//td[@class="clue_text"]/text()').extract())
            (jitem['correct_response'], jitem['value'], jitem['clue'],
             jitem['game'], jitem['category']) = cr, v, c, game, cats[cluecol]
            jitems.append(jitem)
    return jitems
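# `first` is not defined in this snippet; a minimal sketch of the helper this
# callback appears to assume: return the first element of a sequence, or None
# when the sequence is empty (so .extract() misses do not raise IndexError).
def first(seq):
    return seq[0] if seq else None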
def parse(self, response):
    connection = pymongo.MongoClient("localhost", 30000)
    db = connection.academic
    hps = db.homepages
    tmp = hps.find_one({"url": response.url})
    if not tmp:
        hxs = HtmlXPathSelector(response)
        urls = hxs.select('//a')
        contents = hxs.select('//p | //a | //b | //tr | //td | //li | //ul | //font | //span | //strong | //h1 | //h2 | //h3')
        link = []
        text = ""
        for url in urls:
            u = ''.join(url.select('@href').extract())
            if u[-4:] == ".pdf":
                link.append(u)
        for content in contents:
            s = ''.join(content.select('text()').extract())
            if len(s) > 3:
                text += s
        hp = {
            "url": response.url,
            "link": link,
            "text": text,
        }
        print "[insert]"
        hps.insert(hp)
    else:
        print "[redundant]"
def parse_info(self, response):
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("//table/tr")
    item = TexasItem()
    # Remove the last part from the identifying URL
    item['ident'] = response.url[0:-5]
    # Rip out the info, comparing against the lookup table
    for tr in rows:
        td = tr.select("td/text()")
        l = len(td)
        key = td[l - 2].extract()
        val = td[l - 1].extract()
        item[self.lookup[key]] = val
    values = hxs.select("//p/text()")
    keys = hxs.select("//p/span/text()")
    # Rip down the auxiliary data
    for i in range(len(keys) - 1):
        key = keys[i].extract()
        val = values[i + 1].extract()
        item[self.lookup[key]] = self.cleanString(val)
    # and lastly the mugshot
    hxs = hxs.select("//table/tr/td/img/@src")
    if len(hxs.extract()) >= 1:
        item['mugshot'] = "http://www.tdcj.state.tx.us/stat/dr_info/" + hxs.extract()[0]
    return item
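# Neither `self.lookup` nor `self.cleanString` appears in this snippet. A
# plausible sketch, assuming `lookup` maps the page's label text to item
# field names and `cleanString` just normalizes whitespace; the real table
# would have one entry per label on the detail page.
lookup = {
    'Name': 'name',
    'Date of Birth': 'dob',
}

def cleanString(self, s):
    # collapse runs of whitespace and strip the ends
    return ' '.join(s.split())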
def parse_list(self, response):
    hxs = HtmlXPathSelector(response)
    for href in hxs.select(r'//ul[@id="paper-listing"]//a/@href').extract():
        yield Request(urlparse.urljoin(response.url, href), callback=self.parse_paper)
    next = hxs.select(r'//div[@class="pagination"]/ul/li[@class="next"]/a/@href')
    if len(next):
        yield Request(urlparse.urljoin(response.url, next[0].extract()), callback=self.parse_list)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    #categories = hxs.select('//div[@class="sidebar_nav"]//li/a/@href').extract()
    categories = hxs.select('//div[@class="navigation"]/ul/li/a/@href').extract()
    categories += hxs.select('//ul[@class="cl_subs"]//a/@href').extract()
    loaded = False
    for category in categories:
        loaded = True
        yield Request(category)
    next_page = hxs.select('//a[@rel="next"]/@href').extract()
    if next_page:
        base_url = get_base_url(response)
        loaded = True
        yield Request(urljoin_rfc(base_url, next_page[0]))
    products = [product for product in self.parse_products(hxs)]
    for product in products:
        yield product
    # retry empty pages up to three times
    if (not products or not loaded) and response.meta.get('retries', 0) < 3:
        yield Request(response.url, dont_filter=True,
                      meta={'retries': response.meta.get('retries', 0) + 1})
def parse_start_url(self, response):
    x = HtmlXPathSelector(response)
    # get the list of posters
    posters = x.select("//b[@class='postauthor']/text()").extract()
    # set the first in the list of posters as the topic author
    op = posters[0]
    # get the topic url and title
    url = response.url
    title = x.select("//div[@id='pageheader']/h2/a/text()").extract()
    # scrape topic body
    # But this scrape is not quite working: it is grabbing the entire body of
    # the page and not just the specific post. I am not sure why at all.
    post_body = x.select("//div[@class='postbody']").extract()
    # go through the list of posters and remove any duplicates
    posters_export = [op]
    for p in posters:
        if p not in posters_export:
            posters_export.append(p)
    # create an item for each unique poster in the topic
    topics = []
    for i, pb in enumerate(post_body):
        topic = BodytestItem()
        topic['topic_url'] = url
        topic['topic_title'] = title
        topic['thread_author'] = op
        topic['post_author'] = posters[i]
        topic['post_body'] = pb
        topics.append(topic)
    return topics
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    comments = hxs.select('//script[contains(text(),"t_post")]')
    #filename = response.url.split("/")[-2]
    self.file.write('comments: ' + str(len(comments)) + '\n\n')
    #items = []
    for comment in comments:
        #pattern = re.compile(r"'?([^(,]+)'?,")
        pattern = re.compile(r"('(.*?)'|(\d+),)", re.S)
        results = pattern.findall(comment.extract())
        comment_items = list((x[2] if x[2] else x[1]) for x in results)
        item = IxbtItem()
        if len(comment_items) > 5:
            text = comment_items[5]
            item['grats'] = len(text.split(';'))
        else:
            item['grats'] = 0
        item['text'] = []
        text = ''
        if len(comment_items) > 4:
            text = comment_items[4]
            text = re.sub(r'<br>', '\n', text)
            text = re.sub(r'<p>.*<p>', '\n', text)
            text = re.sub(r'\\n', '\n', text)
            #text = re.sub(r'\<.*', '', text)
            #text = re.sub(r'\<[^>]*\>', '', text)
            text = re.sub(r'(\n|^).{1,20}(\n)+', '\n', text)
            #text = re.sub(r'(\n){3,}', '\n\n', text)
            #text = re.sub(r'\s+$', '', text)
            #text = re.sub(r'^\s+', '', text)
        pattern = re.compile(r'(.+?)(\n\n|$)', re.S)
        tuples = pattern.findall(text)
        item['text'] = list(x[0].strip() for x in tuples if len(x[0].strip()) > 12)
        item['author'] = comment_items[1]
        item['url'] = response.url + u'#' + comment_items[0]
        if item['grats'] > 2:
            # 'Author: <name>'
            self.file.write('Автор: ' + item['author'].encode('UTF-8') + '\n')
            # '<n> people said thanks'
            self.file.write(str(item['grats']) + ' человек сказали спасибо\n')
            self.file.write(item['url'] + '\n')
            s = '\n'.join(item['text'])
            # 'number of jokes: <n>'
            self.file.write('кол-во анекдотов: ' + str(len(item['text'])) + '\n')
            #self.file.write(comment_items[4].encode('UTF-8'))
            for joke in item['text']:
                self.file.write(joke.encode('UTF-8') + '\n\n')
        #items.append(item)
        yield item
    # "далее" is the "next" link text on the page
    next_url = hxs.select('//script[contains(text(),"t_assign")]').re(u'href=([^ ]*?)>далее')
    if len(next_url) > 0:
        next_url = next_url[0]
        parsed_url = urlparse(next_url)
        next_url = urljoin(response.url, next_url)
        yield Request(next_url, callback=self.parse)
        # 'Next page: <url>'
        self.file.write("Следующая страница: " + next_url.encode('UTF-8') + '\n')
def parse_product(self, response):
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    name = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    if name:
        name = name[0].strip()
    url = response.url
    url = urljoin_rfc(get_base_url(response), url)
    items = hxs.select('//div[@class="Item"]')
    for item in items:
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)
        #loader.add_value('name', name[0])
        sku = ''.join(item.select('./text()').extract())
        n = name
        if sku:
            n += ' ' + sku.strip()
        loader.add_value('name', n)
        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse_catalog_page(self, response):
    # self.log("This is the catalogue page parser: %s" % (response.url,))
    magic_set = response.request.meta["magic_set"]
    page_number = response.request.meta["page_allocation"]
    date_prefix = response.request.meta["date_prefix"]
    page_id = response.url.split("&")[-2:]
    filename = "./archive/HTML/" + date_prefix + "/" + page_id[0] + page_id[1] + ".htm"
    try:
        open(filename, "wb").write(response.body)
    except OSError:
        os.remove(filename)
        open(filename, "wb").write(response.body)
    hxs = HtmlXPathSelector(response)
    cards = hxs.select('//table[@class="table"]//td[@class="pgcol0"]/strong/a//text()').extract()
    links_to_buy_cards = hxs.select('//table[@class="table"]//td[@class="pgcol0"]/strong/a//@href').extract()
    low_price = hxs.select('//table[@class="table"]//td[@align="right"][@class="pgcol1"][1]/text()').extract()
    avg_price = hxs.select('//table[@class="table"]//td[@align="right"][@class="pgcol0"]/text()').extract()
    high_price = hxs.select('//table[@class="table"]//td[@class="pgcol1"][2]/text()').extract()
    # 3 stores, with prices and links to them, for each card
    stores_names = hxs.select('//table[@class="table"]//td[@class="pgcol0"][3]/a/text()').extract()
    links_to_stores = hxs.select('//table[@class="table"]//td[@class="pgcol0"][3]/a/@href').extract()
    price_in_stores = hxs.select('//table[@class="table"]//td[@class="pgcol0"][3]/strong/text()').extract()
    # let's zip it together and split it into chunks:
    stores_info_zip = tuple(zip(stores_names, links_to_stores, price_in_stores))
    our_store_price = hxs.select('//table[@class="table"]//td[@align="center"][@class="pgcol1"]//strong/text()').extract()
    # items have a trailing \n that we need to delete later!
    items_in_our_store = hxs.select('//table[@class="table"]//td[@align="center"][@class="pgcol1"]//option[last()]/text()').extract()
    cards_info_zip = tuple(zip(cards, links_to_buy_cards, low_price, avg_price, high_price, our_store_price, items_in_our_store))
    i = 0
    items = []
    for card, link, low_price, avg_price, high_price, our_store_price, items_in_our_store in cards_info_zip:
        # iterate over the store info for this card:
        stores_info = stores_info_zip[i:i + 3]
        i = i + 3
        for stores in stores_info:
            item = MtgpricescraperItem()
            item["card_name"] = card
            item["magic_set"] = magic_set
            item["page"] = page_number
            item["link"] = "http://www.blackborder.com" + link.split("&sid")[0]
            item["low_price"] = low_price.split("$")[-1]
            item["avg_price"] = avg_price.split("$")[-1]
            item["high_price"] = high_price.split("$")[-1]
            item["store_name"] = stores[0]
            item["link_to_store"] = "http://www.blackborder.com" + stores[1]
            item["price_in_store"] = stores[2].split("$")[-1]
            # transform into a decimal:
            if "$" in our_store_price:
                item["our_store_price"] = our_store_price.split("$")[-1]
            else:
                item["our_store_price"] = "0." + our_store_price
            item["items_in_our_store"] = items_in_our_store.split("\n")[0]
            items.append(item)
    return items
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="prod"]')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        #loader.add_xpath('name', 'div/form/fieldset/div/h5/a/span/text()')
        name = product.select('div/form/fieldset/div/h5/a/span/text()').extract()[0].strip()
        url = product.select('div/form/fieldset/div/h5/a/@href').extract()
        if url:
            url = urljoin_rfc(get_base_url(response), url[0])
        #loader.add_value('url', url)
        #loader.add_xpath('price', 'div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()')
        #yield loader.load_item()
        price = product.select('div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()').extract()[0].strip()
        yield Request(url, callback=self.parse_product, meta={'name': name, 'price': price})
    pages = hxs.select('//span[@class="pagingButton"]/a/@href').extract()
    if pages:
        if response.meta['do_pagination']:
            for page in pages:
                url = urljoin_rfc(get_base_url(response), page)
                yield Request(url, callback=self.parse_products, meta={'do_pagination': False})
    else:
        sub_categories = hxs.select('//div[@class="subcat"]/div/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, callback=self.parse_products, meta={'do_pagination': True})
def parse(self, response):
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    items = []
    variants_date = hxs.select("//span[@class='normal']//text()").extract()
    variants_price = hxs.select("//table[@id='objContPreviousPrices_grdPreviousPrices']//tr//td[@class='normal']//text()").extract()
    price_items = self.__group_iter(variants_price, 4)
    av_price = []
    for price_list in price_items:
        # the running sum of price/len yields the average of each chunk
        av_price.append(reduce(lambda x, y: float(x) + float(y) / float(len(price_list)), price_list, 0))
    for price, date in zip(variants_price, variants_date):
        item = BillionPricesIndiaItem()
        quantity = '1 lt'
        item['date'] = date
        item['vendor'] = "ioc"
        item['product'] = "gasoline"
        item['category'] = "oil and gas"
        value, measure, unitprice = self.__unit_price(price, quantity)
        item['price'] = price
        item['quantity'] = value
        item['measure'] = measure
        item['unitprice'] = unitprice
        items.append(item)
    return items
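# Neither helper above is defined in this snippet; minimal sketches of what
# the calls appear to assume: `__group_iter` yields fixed-size chunks, and
# `__unit_price` splits a quantity string like '1 lt' and derives a per-unit
# price (assuming the price string is purely numeric).
def __group_iter(self, iterable, n):
    # yield successive chunks of n elements (the last chunk may be shorter)
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]

def __unit_price(self, price, quantity):
    value, measure = quantity.split()  # e.g. '1 lt' -> ('1', 'lt')
    return value, measure, float(price) / float(value)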
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/text()').extract()
    links_to_magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/@href').extract()
    # let's take only the first category, for debugging purposes:
    magic_sets = magic_sets_full[0]
    links_to_magic_sets = links_to_magic_sets_full[0]
    # self.log("This is the first category and the link to it: %s, %s, %s" % (type(magic_sets), magic_sets, links_to_magic_sets))
    # Now all magic sets are together with the links to them.
    # Uncomment this after debugging:
    # magic_sets_zip = dict(zip(magic_sets, links_to_magic_sets))
    magic_sets_zip = dict([[magic_sets, links_to_magic_sets]])
    date_prefix = time.strftime("%Y%m%d", time.localtime())
    try:
        os.mkdir("./archive/HTML/" + date_prefix)
    except OSError:
        self.log("The folder exists!")
    filename = "./archive/HTML/" + date_prefix + "/" + response.url.split("/")[-1] + ".htm"
    self.log("This is the filename for the index: %s" % (filename,))
    try:
        open(filename, "wb").write(response.body)
    except OSError:
        os.remove(filename)
        open(filename, "wb").write(response.body)
    # Continue extracting data:
    for magic_set, url in magic_sets_zip.iteritems():
        abs_url = urljoin("http://www.blackborder.com", url)
        self.log("This is the magic set name and the url to it: %s ---> %s" % (magic_set, abs_url))
        request = Request(abs_url, callback=self.parse_set_page)
        request.meta["magic_set"] = magic_set
        request.meta["date_prefix"] = date_prefix
        yield request
def parse_item(self, response):
    x = HtmlXPathSelector(response)
    a = {}
    a['position'] = x.select("//div[@class='listItemDescriptonDiv']/span[@class='ListItemNumber']/text()").extract()[0]
    at = x.select("//div[@class='listItemDescriptonDiv']/h3/text()").extract()[0]
    at = at.split('-')
    at = [i.strip() for i in at]
    a['title'] = at[0]
    a['artist'] = at[1]
    a['description'] = x.select("//div[@class='listPageContentInfo']/text()").extract()[1].strip()
    a['cover'] = x.select("//div[@class='listPageContentImage assetContainer imageStandard']/img/@src").extract()[0]
    try:
        band = Band.objects.get(name=a['artist'])
    except Band.DoesNotExist:
        band = Band(name=a['artist'])
        band.save()
    try:
        album = Album.objects.get(name=a['title'], band=band)
    except Album.DoesNotExist:
        album = Album()
        album.name = a['title']
        album.band = band
        album.description = a['description']
        album.save()
    try:
        top = TopAlbum.objects.get(position=int(a['position']))
    except TopAlbum.DoesNotExist:
        top = TopAlbum()
        top.album = album
        top.position = int(a['position'])
        top.save()
def parse_youku_com(self, response):
    hxs = HtmlXPathSelector(response)
    video_id = hxs.re('var videoId.*?(\d+)')[0]
    url_t = "http://v.youku.com/v_vpactionInfo/id/%s"
    url = url_t % (video_id,)
    text = urllib.urlopen(url).read()
    hxs2 = HtmlXPathSelector(text=text)
    pv = hxs2.select('//ul[@class="row"]//span[@class="num"]/text()').extract()[0]
    pv = int(''.join(pv.split(',')))
    # other data
    d_tmp = hxs2.select('//ul[@class="half"]//span/text()').extract()
    # up and down votes
    up, down = d_tmp[0].split('/')
    up, down = int(''.join(up.split(','))), int(''.join(down.split(',')))
    # comment count
    comments = int(''.join(d_tmp[2].split(',')))
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_sub_info(self, html):
    html = self.ToUnicode(html, 'gb2312')
    html = self.ToString(html)
    hxs = HtmlXPathSelector(text=html)
    data_info = {}
    h1 = hxs.select(".//h1/text()").extract()
    data_info['title'] = self._format(h1)
    info_selector_list = hxs.select(".//table[@width='432px' and @bgcolor='cccccc']/tr/td")
    if len(info_selector_list) == 0:
        info_selector_list = hxs.select(".//table[@width='396px' and @bgcolor='cccccc']/tr/td")
    for info_selector in info_selector_list:
        key = info_selector.select(".//strong/text()").extract()
        key = self._format(key)
        v = info_selector.select("text()").extract()
        v = self._format(v, True)
        if v:
            v = v.strip(":")
        if key and v:
            if self.CONVERT_DICT.get(key):
                data_info[self.CONVERT_DICT[key]] = v
            else:
                pass  # data_info[key] = v
    f14_desc_list = hxs.select(".//div[@class='f14']/text()").extract()
    data_info['description'] = "\n".join(f14_desc_list)
    for k, v in data_info.iteritems():
        print k, v
    return data_info
def parse_item(self, response):
    url = response.url
    hxs = HtmlXPathSelector(response)
    name = hxs.select("//div[@class='product-shop']/div[@class='product-name']/h2/text()").extract()
    if not name:
        logging.error("NO NAME! %s" % url)
        return
    name = name[0]
    # adding product
    price = hxs.select("//div[@class='product-shop']/div[@class='price-box']//span[@class='price']/text()").extract()
    if not price:
        logging.error("NO PRICE! %s" % url)
        return
    price = price[0].replace(".", "").replace(",", ".")
    # price_delivery = hxs.select("//div[@class='product-shop']//table[@id='product-attribute-specs-table']/tr/td[(preceding::th[text()='Spese Spedizione'])]/text()").extract()
    # if not price_delivery:
    #     logging.error("NO PRICE DELIVERY! %s" % url)
    #     return
    # price_delivery = price_delivery[0]
    # price = Decimal(price) + Decimal(price_delivery)
    l = ProductLoader(item=Product(), response=response)
    l.add_value('identifier', str(name))
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    yield l.load_item()
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    nodes = hxs.select('//div[@id and @class="node node-paper node-teaser paper-type-0 clearfix"]')
    conference = hxs.select('/html/head/title/text()').extract()
    items = []
    for node in nodes:
        item = PapersItem()
        item['conference'] = conference
        item['title'] = node.select('h2/a/text()').extract()
        content = node.select('div[@class="content"]')
        item['author'] = content.select(
            'div[@class="field field-name-field-paper-people-text field-type-text-long field-label-hidden"]'
            '/div[@class="field-items"]/div[@class="field-item even"]/p/text()'
        ).extract()
        item['affiliation'] = content.select(
            'div[@class="field field-name-field-paper-people-text field-type-text-long field-label-hidden"]'
            '/div[@class="field-items"]/div[@class="field-item even"]/p/em/text()'
        ).extract()
        item['description'] = content.select(
            'div[@class="field field-name-field-paper-description-long field-type-text-long field-label-hidden"]'
            '/p'
        ).extract()
        item['fulltext'] = content.select(
            'div[@class="field field-name-field-presentation-pdf field-type-file field-label-hidden"]'
            '/div/div/span/a/attribute::href'
        ).extract()
        items.append(item)
    return items
def parse_item(self, response):
    url = response.url
    hxs = HtmlXPathSelector(response)
    name = hxs.select("//div[@id='primary_block']/div[@id='pb-left-column']/h2/text()").extract()
    if not name:
        logging.error("NO NAME! %s" % url)
        return
    name = name[0]
    price = hxs.select("//p[@class='price']/span[@class='our_price_display']/span/text()").extract()
    if not price:
        logging.error("NO PRICE! %s" % url)
        return
    price = price[0]
    price = Decimal(extract_price2uk(price))
    eco_tax = hxs.select("//p[@class='price-ecotax']/span/text()").extract()
    if eco_tax:
        eco_tax[0] = eco_tax[0].encode("ascii", "ignore")
        print "Found eco tax %s" % eco_tax[0]
        price -= Decimal(extract_price2uk(eco_tax[0]))
    l = ProductLoader(item=Product(), response=response)
    l.add_value("identifier", str(name))
    l.add_value("name", name)
    l.add_value("url", url)
    l.add_value("price", unicode(price))
    yield l.load_item()
def parse(self, response):
    if not isinstance(response, HtmlResponse):
        return
    base_url = get_base_url(response)
    #categories
    hxs = HtmlXPathSelector(response)
    categories_urls = hxs.select('//div[@id="navigation"]/div/h2/a/@href').extract()
    for url in categories_urls:
        yield Request(url)
    #subcats
    subcats_urls = hxs.select('//div[@id="navigation"]/div/div/a/@href').extract()
    for surl in subcats_urls:
        yield Request(surl)
    #pages
    pages_urls = hxs.select('//span[@id="Pagination"]/a/@href').extract()
    for page in pages_urls:
        yield Request(page)
    products = hxs.select('//div[@class="listitem"]')
    for p in products:
        url_product = p.select('.//div[@class="heading"]/a/@href')[0].extract()
        yield Request(urljoin_rfc(base_url, url_product), callback=self.parse_product)
def parse_page(self, response):
    task = response.meta['task']
    county_id = response.meta['county_id']
    hxs = HtmlXPathSelector(response)
    # inspect_response(response)
    # get the next page
    next = hxs.select("//a[contains(text(),'Next')]/@href")
    if len(next) > 0:
        request = Request(urljoin(response.url, next[0].extract()),
                          callback=self.parse_page,
                          errback=self.error_callback,
                          dont_filter=True)
        request.meta['task'] = task
        request.meta['county_id'] = county_id
        yield request
    else:
        yield self.form_request(task)
    rows = hxs.select('/html/body/table[4]/tr')
    if len(rows) == 0:
        self.send_alert('No permit data found in search response')
        self.log('No permit data table present in response', log.ERROR)
    elif len(rows) == 1:
        self.log('No incident reports found in response', log.WARNING)
    else:
        # Skip the first record because it is the header row
        rows.pop(0)
        self.log('Retrieved {0} permits'.format(len(rows)), log.INFO)
        for row in rows:
            r = dict(zip(self.field_names,
                         [f.strip() for f in row.select('td/text()').extract_unquoted()]))
            r['county'] = self.counties[county_id]
            for item in self.process_row(r, task):
                yield item
def parse_items(self, response):
    if not self.logged_in:
        login(response)
    print response.body
    hxs = HtmlXPathSelector(response)
    items = []
    # Get all of the links to the next pages
    links = hxs.select('//a[contains(@href, "/local/add/analytics?storeid=")]')
    for l in links:
        item = GoogleItem()
        item['link'] = l.select('@href').extract()
        item['value'] = l.select('text()').extract()
        item['next_link'] = ''
        items.append(item)
    next_links = hxs.select('//a[contains(text(), "Next")]')
    for n in next_links:
        item = GoogleItem()
        item['link'] = n.select('@href').extract()
        item['value'] = n.select('text()').extract()
        item['next_link'] = n.select('@href').extract()
        items.append(item)
    return items
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    # detect multiple product page
    tableHeader = hxs.select("//td[text()='Item#']")
    if tableHeader:
        subProducts = tableHeader.select("../../tr[@class='Multi-Child_Background']")
        for subProduct in subProducts:
            loader = ProductLoader(Product(), subProduct)
            theTDs = subProduct.select("td")
            loader.add_value('sku', theTDs[0].select("text()").extract())
            loader.add_value('name', theTDs[1].select("text()").extract())
            loader.add_value('price', theTDs.select("b/text()").extract())
            loader.add_value('url', response.url)
            yield loader.load_item()
    else:
        productNode = hxs.select('//table[@id="v65-product-parent"]')[0]
        priceNode = productNode.select(".//font[@class='pricecolor colors_productprice']/text()")
        # Unavailable products are still online but have no price
        if priceNode:
            loader = ProductLoader(selector=productNode, item=Product())
            loader.add_xpath('name', './/font[@class="productnamecolorLARGE colors_productname"]/text()')
            loader.add_value('url', response.url)
            loader.add_value('price', priceNode.extract())
            sku = ''.join(hxs.select('.//span[@class="product_code"]/text()').extract()).strip()
            loader.add_value('sku', sku)
            yield loader.load_item()
def parse_detail(self, response):
    item = TbsItem()
    headers = response.headers
    self.set_items_value(item, 'character', self.get_page_character(response.body))
    self.set_items_value(item, 'crawl_stats', self.default_crawl_stats)
    self.set_items_value(item, 'searchkeywords', self.keyword)
    self.set_items_value(item, 'spiderid', self.name)
    self.set_items_value(item, 'refer', response.meta['refer'])
    self.set_items_value(item, 'url_hash_no_fragment', self.get_url_hash_no_fragment(response.url))
    self.set_items_value(item, 'url', self.parseurl(response.url))
    self.set_items_value(item, 'root_domain', urlparse(response.url).hostname)
    self.set_items_value(item, 'Expires',
                         self.to_GMT_timestamp(headers['Expires']) if 'Expires' in headers.keys()
                         else self.to_GMT_timestamp(None))
    self.set_items_value(item, 'LastModified',
                         self.to_GMT_timestamp(headers['Last-Modified']) if 'Last-Modified' in headers.keys()
                         else self.to_GMT_timestamp(None))
    try:
        hxs = HtmlXPathSelector(response)
        self.set_items_value(item, 'title', ','.join(hxs.select('//title/text()').extract()))
        self.set_items_value(item, 'desc', ','.join(hxs.select('//meta[@name="description"]/@content').extract()))
        self.set_items_value(item, 'keyword', ','.join(hxs.select('//meta[@name="keywords"]/@content').extract()))
    except:
        self.set_items_value(item, 'title', ' ')
        self.set_items_value(item, 'desc', ' ')
        self.set_items_value(item, 'keyword', ' ')
    self.set_items_value(item, 'body', response.body)
    self.set_items_value(item, 'stripedbody', nltk.clean_html(self.strip_body(response.body)))
    return item
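# `set_items_value` is not shown in this snippet; a minimal sketch, assuming
# it is just a guarded item assignment:
def set_items_value(self, item, key, value):
    item[key] = value if value is not None else ''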
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # all_text will collect emails from the text of the page
    all_text = hxs.select('//html//text()')
    # Remove CDATA sections, since they show up in the body text for some reason
    all_text = XPathSelectorList([text for text in all_text
                                  if not (re.match(self._c_data, text.extract())
                                          or text.extract().strip() == '')])
    all_text = all_text.re(self._email_regex)
    # hrefs will collect emails from href attributes
    hrefs = hxs.select("//./a[contains(@href,'@')]/@href").re(self._email_regex)
    emails = hrefs + all_text
    # Drop the unicode, and substitute @ for [at] and . for [dot]
    for i in range(len(emails)):
        emails[i] = emails[i].encode('ascii', 'ignore')
        emails[i] = re.sub(r'(\[at]|\(at\)| at )([A-Za-z0-9.-]+)(\[dot]|\(dot\)| dot )', r'@\2.', emails[i])
    # Make it a set and back to a list to drop duplicates that appeared in both the body and the links
    emails = list(set(emails))
    return emails
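# `_email_regex` and `_c_data` are referenced but not defined in this snippet;
# plausible definitions, assuming a simple address pattern and a CDATA matcher.
# The real email pattern presumably also matches obfuscated forms such as
# "[at]" and "(dot)", since the loop above rewrites those.
_email_regex = r'[\w.+-]+@[\w-]+\.[\w.-]+'
_c_data = r'.*<!\[CDATA\[.*'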
def parse_page(self,response): hxs = HtmlXPathSelector(response) #For rtl //div[@id="article-section"]/p//text() #For lemonde //div[@id="articleBody"]/p//text() #For liberation //div[@class="article-body read-left-padding"]/p//text() #For ma ville //div[@class="elmt-detail article"]/p//text() #For sud ouest //div[@class="entry-content"]/p//text() title = ''.join(hxs.select('//h1[@class="title"]//text()').extract()).strip() date_article = ''.join(hxs.select('//span[@class="posted"]').extract()).strip() body = ''.join(hxs.select('//div[@id="mainentrycontent"]/p//text()').extract()).strip() item = NewsItem() if len(body)> 0 : item['site'] = start_links item['body'] = body item['url'] = response.url item['timeOfScrap'] = datetime.datetime.now() if len(date_article)>0 : item['date_article'] = date_article[59:84] if len(title)> 0 : item['title']= title return item else : pass
def parse_deck(self, response):
    hxs = HtmlXPathSelector(response)
    # Parse and return a Deck
    deck = {}
    deck['title'] = hxs.select('//span[@class="section_title"]/text()').extract()
    deck['main_deck'] = []
    deck['sideboard'] = []
    # Parse main deck
    raw_main_deck = hxs.select('//div[@id="set_cards_table"]//tr[@id]')
    for raw_card in raw_main_deck:
        card = self.parse_card(raw_card)
        # Add card to deck
        deck['main_deck'].append(card)
    # Parse sideboard
    raw_sideboard = hxs.select('//div[@id="sideboard"]//tr[@id]')
    for raw_card in raw_sideboard:
        card = self.parse_card(raw_card)
        # Add card to deck
        deck['sideboard'].append(card)
    return deck
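# `parse_card` is not shown in this snippet; a minimal sketch, assuming each
# <tr> carries a quantity cell and a card-name link (the actual markup on the
# site may differ):
def parse_card(self, raw_card):
    quantity = raw_card.select('./td[1]/text()').extract()
    name = raw_card.select('.//a/text()').extract()
    return {
        'quantity': quantity[0].strip() if quantity else '',
        'name': name[0].strip() if name else '',
    }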
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    #sites = hxs.select('//h1')
    links = hxs.select('//div[@class="browseProductContainer"]')
    #links2 = hxs.select('//div[@class="content-img"]')
    #scites = hxs.select('//div[@id="yt-lockup-content"]')
    items = []
    for site in links:
        item = DmozItem()
        item['title'] = site.select('table/tr/td/table/tr/td[@class="header3"]/a/center/text()').extract()
        item['category'] = hxs.select('//title/text()').extract()
        item['link'] = site.select('table/tr/td/table/tr/td[@class="header3"]/a/@href').extract()
        item['desc'] = ""
        item['dataorg'] = ""
        item['src'] = site.select('table/tr/td/img[@class="ponudaimage clearfix"]/@src').extract()
        item['cijena'] = site.select('table/tr/td/table/tr/td[@class="ponudacijena3"]/span[@class="productPrice"]/text()').extract()
        items.append(item)
    return items
def parse_reviews(self, response):
    hxs = HtmlXPathSelector(response)
    parsed_reviews = []
    # store the time retrieved
    date_retrieved = response.headers['Date']
    # get all review <td>s
    reviews = hxs.select("//table[@id='productReviews']/tr/td[1]/div[not(@class='CustomerPopover_load')]")
    _response_uses_more_tables = False
    # amazon provides two different types of HTML responses, one with
    # more tables instead of divs, so detect which response we got and
    # use the appropriate xpath expressions throughout
    if len(reviews) == 0:
        reviews = hxs.select("//table[@id='productReviews']/tr/td/table/tr/td[2]")
        _response_uses_more_tables = True
    ASIN = response.url.split('/')[4]
    if ASIN.find('-') > 0:
        ASIN = response.url.split('/')[5]
    log.msg("Response uses more tables: " + str(_response_uses_more_tables), level=log.DEBUG)
    # in each review <td>
    for review in reviews:
        parsed_review = AmznReviewItem()
        parsed_review['DateRetrieved'] = date_retrieved
        parsed_review['ASIN'] = ASIN
        # the review ID is in the permalink
        if _response_uses_more_tables:
            permalink_url = review.select('div[last()]/table/tr/td[3]/div/div/span[3]/a/@href').extract()[0]
        else:
            permalink_url = review.select('div[last()]/div/div/div/span[3]/a/@href').extract()[0]
        parsed_review['ReviewId'] = permalink_url.split('/')[4]
        # the rating is in the <span>'s title, of the form '1.0 out of 5 stars'
        rating_el = review.select('div/span/span/@title')
        parsed_review['Rating'] = float(rating_el.re('[^ ]+')[0])
        # the reviewer ID is in a link to their profile
        if _response_uses_more_tables:
            profile_url = review.select('div[3]/table/tr/td[2]/a[1]/@href')
        else:
            profile_url = review.select('div/div/div/a[1]/@href')
        # some reviews genuinely do not list the reviewer ID
        try:
            parsed_review['CustomerId'] = profile_url.extract()[0].split('/')[6]
        except IndexError:
            parsed_review['CustomerId'] = None
        # the date and summary are in the same <span>
        parsed_review['Date'] = review.select('div/span/nobr/text()').extract()[0]
        parsed_review['Summary'] = review.select('div/span/b/text()').extract()[0]
        # reformat the date into a datestamp
        try:
            parsed_review['Date'] = datetime.strptime(parsed_review['Date'], '%B %d, %Y').strftime('%Y-%m-%d')
        except ValueError:
            parsed_review['Date'] = ''
        # reformat the date retrieved into a timestamp
        try:
            parsed_review['DateRetrieved'] = datetime.strptime(parsed_review['DateRetrieved'], '%a, %d %b %Y %H:%M:%S GMT').strftime('%Y-%m-%dT%H:%M:%S')
        except ValueError:
            parsed_review['DateRetrieved'] = ''
        parsed_reviews.append(parsed_review)
    num_reviews_parsed = len(parsed_reviews)
    log.msg("Parsed " + str(num_reviews_parsed) + " reviews for " + ASIN, level=log.DEBUG)
    if num_reviews_parsed > 0:
        # if this is an initial review page, yield the product data
        if response.url.find('dp_top_cm_cr_acr_txt') > 0:
            amzn_product = AmznProductItem()
            amzn_product['ASIN'] = ASIN
            amzn_product['Depth'] = response.request.meta['amznDepth']
            # get the total number of reviews
            num_reviews = hxs.select("//span[@class='crAvgStars']/a/text()").extract()[0].split(' ')[0]
            amzn_product['TotalReviews'] = int(num_reviews.replace(",", ""))
            amzn_product['MediaType'] = response.request.meta['MediaType']
            yield amzn_product
        # yield the reviews
        for parsed_review in parsed_reviews:
            yield parsed_review
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    options = hxs.select(u'//table[@class="ropetable" or @class="dbitable"]//td/a/@href').extract()
    if not options:
        options = hxs.select('//div[@class="ProductDescriptionContainer"]/ul/li/span/a/@href').extract()
    if not options:
        options = hxs.select('//ul[@class="pricing-table orange"]/li[@class="sign-up"]/a/@href').extract()
    if not options:
        options = hxs.select(u'//table//a/@href').extract()
        options = [o for o in options if o.startswith(response.url.rstrip('/'))]
    if options:
        for url in options:
            yield Request(url, meta=response.meta, callback=self.parse_product)
        return
    sku = hxs.select(u'//div[@id="sku"]/text()').extract()
    if not sku:
        sku = hxs.select('//span[@class="VariationProductSKU"]/text()').extract()
    try:
        identifier = hxs.select('//input[@name="product_id"][1]/@value').extract()[0].strip()
    except IndexError:
        self.log('NO IDENTIFIER => %s' % response.url)
        return
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1/text()')
    product_loader.add_xpath('price', u'//em[contains(@class,"ProductPrice")]/text()')
    product_loader.add_value('sku', sku[0].strip())
    product_loader.add_value('identifier', identifier)
    product_loader.add_value('category', response.meta.get('category'))
    product_loader.add_xpath('image_url', u'//div[@class="ProductThumbImage"]//img/@src')
    in_stock = ''.join(hxs.select('//div[div[contains(text(), "Availability:")]]/div[@class="Value"]/text()').extract()).upper()
    in_stock = 'IN-STOCK' in in_stock or 'IN STOCK' in in_stock
    if not in_stock:
        product_loader.add_value('stock', 0)
    product_loader.add_xpath('brand', u'//div[@class="DetailRow"]/div/a/text()')
    product_loader.add_xpath('shipping_cost', u'//div[@class="DetailRow"]/div[contains(text(),"Shipping")]/../div[2]/text()')
    options = hxs.select(u'//div[@class="productAttributeList"]//ul/li/label/input/../../..')
    options2 = hxs.select(u'//div[@class="productAttributeList"]//select')
    product_id = hxs.select(u'//input[@name="product_id"]/@value').extract()[0]
    product_orig = product_loader.load_item()
    if options:
        for opt in options:
            # Product without mandatory options
            #if not opt.select(u'.//input/@checked'):
            #    yield product_orig
            names = opt.select(u'.//input/../span/text()').extract()
            values = opt.select(u'.//input/@value').extract()
            value_names = opt.select(u'.//input/@name').extract()
            names = [x for x in names if x.strip()]
            for i in xrange(len(names)):
                product = Product(product_orig)
                product['name'] = (product['name'] + ' ' + names[i].strip()).strip()
                yield Request('http://www.ropeandrescue.com/remote.php'
                              + '?w=getProductAttributeDetails&product_id=' + product_id
                              + '&' + urllib.quote(value_names[i]) + '=' + values[i],
                              meta={'product': product, 'value': values[i]},
                              callback=self.parse_price)
    elif options2:
        names = options2.select(u'./option[@value!=""]/text()').extract()
        values = options2.select(u'./option[@value!=""]/@value').extract()
        value_name = options2.select(u'./@name').extract()[0]
        for i in xrange(len(names)):
            product = Product(product_orig)
            product['name'] = (product['name'] + ' ' + names[i].strip()).strip()
            yield Request('http://www.ropeandrescue.com/remote.php'
                          + '?w=getProductAttributeDetails&product_id=' + product_id
                          + '&' + urllib.quote(value_name) + '=' + values[i],
                          meta={'product': product, 'value': values[i]},
                          callback=self.parse_price)
    else:
        yield product_orig
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    movie_name = hxs.select('//*[@id="content"]/h1/span[1]/text()').extract()
    movie_director = hxs.select('//*[@id="info"]/span[1]/span[2]/a/text()').extract()
    movie_writer = hxs.select('//*[@id="info"]/span[2]/span[2]/a/text()').extract()
    # The movie description has to be scraped from within an existing selector
    movie_description_paths = hxs.select('//*[@id="link-report"]')
    movie_description = []
    for movie_description_path in movie_description_paths:
        movie_description = movie_description_path.select('.//*[@property="v:summary"]/text()').extract()
    # Extracting the cast also means selecting within an existing XPath object
    movie_roles_paths = hxs.select('//*[@id="info"]/span[3]/span[2]')
    movie_roles = []
    for movie_roles_path in movie_roles_paths:
        movie_roles = movie_roles_path.select('.//*[@rel="v:starring"]/text()').extract()
    # Grab the full movie info block
    movie_detail = hxs.select('//*[@id="info"]').extract()
    item = DoubanspriderItem()
    item['movie_name'] = ''.join(movie_name).strip().replace(
        ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    # item['movie_link'] = movie_link[0]
    item['movie_director'] = movie_director[0].strip().replace(
        ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(
        ':', ';') if len(movie_director) > 0 else ''
    # Commas delimit the fields of a movie record, so they have to be replaced;
    # quotes as well, or the database insert will break
    item['movie_description'] = movie_description[0].strip().replace(
        ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(
        ':', ';') if len(movie_description) > 0 else ''
    item['movie_writer'] = ';'.join(movie_writer).strip().replace(
        ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    item['movie_roles'] = ';'.join(movie_roles).strip().replace(
        ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    # item['movie_language'] = movie_language[0].strip() if len(movie_language) > 0 else ''
    # item['movie_date'] = ''.join(movie_date).strip()
    # item['movie_long'] = ''.join(movie_long).strip()
    # Movie detail block as a single string
    movie_detail_str = ''.join(movie_detail).strip()
    # print movie_detail_str
    # "语言" = language, "上映日期" = release date, "片长" = runtime
    movie_language_str = ".*语言:</span> (.+?)<br><span.*".decode("utf8")
    movie_date_str = ".*上映日期:</span> <span property=\"v:initialReleaseDate\" content=\"(\S+?)\">(\S+?)</span>.*".decode("utf8")
    movie_long_str = ".*片长:</span> <span property=\"v:runtime\" content=\"(\d+).*".decode("utf8")
    pattern_language = re.compile(movie_language_str, re.S)
    pattern_date = re.compile(movie_date_str, re.S)
    pattern_long = re.compile(movie_long_str, re.S)
    movie_language = re.search(pattern_language, movie_detail_str)
    movie_date = re.search(pattern_date, movie_detail_str)
    movie_long = re.search(pattern_long, movie_detail_str)
    item['movie_language'] = ""
    if movie_language:
        item['movie_language'] = movie_language.group(1).strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    # item['movie_detail'] = ''.join(movie_detail).strip()
    item['movie_date'] = ""
    if movie_date:
        item['movie_date'] = movie_date.group(1).strip().replace(
            ',', ';').replace('\'', '\\\'').replace('\"', '\\\"').replace(':', ';')
    item['movie_long'] = ""
    if movie_long:
        item['movie_long'] = movie_long.group(1)
    yield item
def parse_product(self, response):
    # the page escapes ampersands, so unescape them before parsing
    html = response.body.replace('&amp;', '&')
    hxs = HtmlXPathSelector(text=html)
    identifier = hxs.select('//input[@id="pid"]/@value').extract()
    if not identifier:
        self.log('PRODUCT WITHOUT IDENTIFIER: ' + response.url)
        return
    loader = ProductLoader(item=Product(), response=response)
    name = hxs.select('//h1[@itemprop="name"]/text()').extract()[0]
    if name.startswith(':'):
        name = name[1:]
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))
    categories = hxs.select('//a[@class="breadcrumb-element"]/@href/../text()').extract()
    categories.remove('Home')
    loader.add_value('category', categories)
    loader.add_value('sku', identifier[0])
    loader.add_value('identifier', identifier[0])
    loader.add_xpath('image_url', '//div[@class="product-primary-image"]/a/@href')
    loader.add_xpath('price', '//div[@id="product-content"]//span[@class="price-sales"]/meta/@content')
    out_of_stock = hxs.select('//p[contains(@class, "not-available")]')
    if out_of_stock:
        loader.add_value('stock', 0)
    if loader.get_output_value('price') <= 59.99:
        loader.add_value('shipping_cost', 1.99)
    item = loader.load_item()
    if item.get('price', None) and item['price'] <= 59.99:
        item['shipping_cost'] = 1.99
    options = hxs.select('//select')
    for option in options:
        for variant in option.select('./option'):
            if variant.select('./@selected'):
                var_name = variant.select('./text()').extract()[0].strip().replace('&amp;', '&')
                item['name'] += ' ' + var_name
            else:
                option_url = variant.select('./@value').extract()[0].replace('&amp;', '&') + '&Quantity=1&uuid=&format=ajax'
                meta = response.meta
                meta['item'] = deepcopy(item)
                meta['base_name'] = name
                yield Request(option_url, callback=self.parse_option, meta=meta)
    if item.get('price', None):
        yield item
def parse_products2(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    for href in hxs.select('//table[@id="tblContent"]//td[@class="leftPane"]//a/@href').extract():
        url = urlparse.urljoin(base_url, href)
        if url not in self.visited_urls:
            yield Request(url, callback=self.parse_products2)
            self.visited_urls.add(url)
    for href in hxs.select('//ul[@id="pMenuSublevelsl1"]//a/@href').extract():
        url = urlparse.urljoin(base_url, href)
        if url not in self.visited_urls:
            yield Request(urlparse.urljoin(base_url, href), callback=self.parse_products2)
            self.visited_urls.add(url)
    for product_box in hxs.select('//div[@id="ShopContent"]//div[@class="plistAreaHeader"]/div'):
        tabular = product_box.select('.//table[@class="Tabular"]')
        if tabular:
            for pbox in tabular.select("./tbody/tr"):
                product_loader = ProductLoader(item=Product(), selector=pbox)
                product_loader.add_xpath('name', './td[2]/a/text()')
                product_loader.add_value('url', urlparse.urljoin(base_url, pbox.select('./td[2]/a/@href').extract()[0]))
                product_loader.add_value('price', pbox.select('./td[4]/a/text()').extract()[0].split(" ")[-1].replace(".", "").replace(",", "."))
                product = product_loader.load_item()
                if product['url']:
                    yield product
            continue
        elements = product_box.select('.//div[@class="prelement"]')
        if elements:
            for pbox in elements:
                product_loader = ProductLoader(item=Product(), selector=pbox)
                product_loader.add_xpath('name', './/div[@class="prmain"]/a[1]/text()')
                product_loader.add_value('url', urlparse.urljoin(base_url, pbox.select('.//div[@class="prmain"]/a[1]/@href').extract()[0]))
                product_loader.add_value('price', pbox.select('.//div[@class="prbasket"]/p[@class="prpri"]/text()').extract()[0].split(" ")[-1].replace(".", "").replace(",", "."))
                product = product_loader.load_item()
                if product['url']:
                    yield product
        elif product_box.select('.//div[@class="prbasket"]'):
            product_loader = ProductLoader(item=Product(), selector=product_box)
            product_loader.add_xpath('name', './a[1]/text()')
            product_loader.add_value('url', urlparse.urljoin(base_url, product_box.select('./a[1]/@href').extract()[0]))
            product_loader.add_value('price', product_box.select('.//div[@class="prbasket"]/p/text()').extract()[0].split(" ")[-1].replace(".", "").replace(",", "."))
            product = product_loader.load_item()
            if product['url']:
                yield product
def parse(self, response):
    if 'news.ycombinator.com' in response.url:
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//td[@class="title"]//a/text()')
        for title in titles:
            print title.extract()
def parse_product_list(self, response):
    hxs = HtmlXPathSelector(response)
    cats = hxs.select(u'//div[@id="RightColumn"]/table/tr/td/center/div[@class="contentsName"]/a/@href').extract()
    if cats:
        for url in cats:
            if url.split('.')[-1].lower() not in ('htm', 'html'):
                # Contains links to PDFs as well
                continue
            url = urljoin_rfc(get_base_url(response), url)
            yield Request(url, callback=self.parse_product_list)
    else:
        opt_groups = []

        def fix_options(what, o):
            try:
                return (what + ':' + o[0], o[1].replace(',', ''))
            except IndexError:
                return (what + ':' + o[0], '0')

        for option in hxs.select(u'//div[@class="eyOptions"]//select'):
            what = option.select(u'./@name').extract()[0]
            opt_list = option.select(u'./option[@value!="PleaseSelect" and @value!="Please Select"]/text()').extract()
            opt_list = [o.replace(')', '').split('(') for o in opt_list]
            opt_groups.append([fix_options(what, o) for o in opt_list])
        for opt_name, opt_price in multiply(opt_groups):
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('url', response.url)
            product_loader.add_xpath('name', u'//h1/text()')
            if hxs.select(u'//div[@class="bigSalePrice"]'):
                product_loader.add_xpath('price', u'//div[@class="bigSalePrice"]/span/font/text()')
            elif hxs.select(u'//span[@class="bigSalePrice"]'):
                product_loader.add_xpath('price', u'//span[@class="bigSalePrice"]/font/text()')
            else:
                product_loader.add_xpath('price', u'//div[@class="itemRegPrice"]/span/font/text()')
            product_loader.add_xpath('sku', u'normalize-space(substring-after(//div[@class="code"]/text(),":"))')
            product_loader.add_xpath('category', u'//div[@class="eyBreadcrumbs"]/a[2]/text()')
            product_loader.add_xpath('image_url', u'//img[@id="SwitchThisImage"]/@src')
            # product_loader.add_xpath('brand', u'substring-after(//div[@class="product-meta"]/span[contains(text(),"Manufacturer:")]/text(),":")')
            product_loader.add_value('shipping_cost', '')
            product = product_loader.load_item()
            product['name'] = (product['name'] + ' ' + opt_name).strip()
            product['price'] = product['price'] + Decimal(opt_price)
            yield product
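# `multiply` is not defined in this snippet; a sketch of what the loop above
# appears to assume: the cross-product of the option groups, joining the
# option names and summing the per-option price surcharges.
import itertools
from decimal import Decimal

def multiply(opt_groups):
    for combo in itertools.product(*opt_groups):
        name = ' '.join(n for n, p in combo)
        price = sum(Decimal(p) for n, p in combo)
        yield name, price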
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    categories_urls = hxs.select('//ul[@class="level1"]/li/a/@href').extract()
    for url in categories_urls:
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_product_list)
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    warning = ''.join(hxs.select('//div[@class="InfoBanner" and contains(text(), "has returned 0 results")]//text()').extract())
    if not warning:
        warning = ''.join(hxs.select('//div[@class="noSearchResultsFound"]/text()').extract())
    if warning:
        self.log(warning)
        return
    many = hxs.select('//div[@id="SearchResults"]//div[@class="categoryGridTitle"]/a/@href').extract()
    if many:
        for url in many:
            yield Request(urljoin(get_base_url(response), url), callback=self.parse_product)
        return
    if hxs.select('//div[@class="color:red" and contains(text(), "this item is no longer available")]'):
        self.log('Item not available [%s]' % (response.url,))
        return
    loader = ProductLoader(item=Product(), selector=hxs)
    comms_no = hxs.select('//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()').extract()[0].upper()
    loader.add_value('identifier', comms_no)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//div[@id="productTitle"]//text()')
    loader.add_xpath('price', '//div[@id="productMainPrice"]/span[@id="price"]/text()')
    loader.add_xpath('sku', '//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()')
    category = hxs.select('//div[@class="newbreadcrumbText"]//text()').extract()[1:-1]
    loader.add_value('category', category)
    img = hxs.select('//span[@id="mainImage"]/a/img/@src').extract()
    if len(img[0]) < 255:
        loader.add_value('image_url', urljoin(get_base_url(response), img[0]))
    else:
        loader.add_value('image_url', '')
    loader.add_xpath('brand', '//div[@id="supplierLogo"]/img/@title')
    if not loader.get_output_value('brand'):
        loader.add_value('brand', loader.get_output_value('name').split()[0])
    if loader.get_output_value('price') < 20:
        loader.add_value('shipping_cost', '2.95')
    else:
        loader.add_value('shipping_cost', '0')
    in_stock = 'IN STOCK' in ''.join(hxs.select('//div[@id="stockCheck"]/div/text()').extract()).upper()
    if in_stock:
        loader.add_value('stock', '1')
    else:
        loader.add_value('stock', '0')
    manufacturers_no = hxs.select('//span[@id="manufactNo"]/text()').extract()
    if not manufacturers_no:
        manufacturers_no = hxs.select('//tr[td[contains(text(), "Manufacturer No:")]]/td[not(@class)]/text()').extract()
    if not manufacturers_no:
        manufacturers_no = hxs.select('//tr[td[contains(text(), "Manufacturer No:")]]/td[2]//text()').extract()
    if not manufacturers_no:
        manufacturers_no = hxs.select('//tr[td[contains(text(), "Part No:")]]/td[not(@class)]/span/text()').extract()
    manufacturers_no = manufacturers_no[0].strip()
    m = sku_regex.search(manufacturers_no)
    if m:
        manufacturers_no = m.group(1)
    product = loader.load_item()
    product['metadata'] = {'manufacturers_no': manufacturers_no}
    self.yield_item_with_metadata(product)
    return
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    options = hxs.select('//select[@id="variant-select-size"]/option[text()!="-- Please select --"]/@value').extract()
    options += hxs.select('//select[@id="variant-select-colour"]/option[text()!="-- Please select --"]/@value').extract()
    for option in options:
        url = urljoin_rfc(base_url, option)
        yield Request(url, callback=self.parse_product)
    try:
        sku = hxs.select('//p[@id="brandAndPartNos"]/text()').extract()[-1].strip()
    except IndexError:
        # the page sometimes renders without part numbers; retry up to 10 times
        retry = int(response.meta.get('retry', 0))
        if retry < 10:
            retry += 1
            new_meta = response.meta.copy()
            new_meta['retry'] = retry
            yield Request(response.url, meta=new_meta, callback=self.parse_product, dont_filter=True)
        return
    if sku or not options:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_id = hxs.select('//input[@name="productId"]/@value').extract()[0]
        name = hxs.select('//h1[@class="skuHeading"]/strong/text()').extract()[0]
        ext_name = ' '.join(hxs.select('//h1[@class="skuHeading"]/text()').extract()).strip()
        category = hxs.select('//div[@class="breadcrumb"]/nav/p/a/text()').extract()[-1]
        image_url = hxs.select('//img[@class="productImageLarge"]/@src').extract()
        if image_url:
            image_url = urljoin_rfc(base_url, image_url[0])
        brand = hxs.select('//img[@class="brandImageMedium"]/@alt').extract()
        brand = brand[0].replace(' logo', '') if brand else ''
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('category', category)
        product_name = name + ext_name
        brand_in_name = False
        for w in re.findall('([a-zA-Z]+)', product_name):
            if w.upper() in brand.upper():
                brand_in_name = True
        if brand.upper() not in product_name.upper() and not brand_in_name:
            product_name = brand + ' ' + product_name
        product_loader.add_value('name', product_name)
        product_loader.add_value('url', response.url)
        product_loader.add_value('identifier', product_id)
        product_loader.add_value('brand', brand)
        product_loader.add_value('sku', sku)
        discontinued = hxs.select('//p[contains(@class, "stock")]/span[@class="discontinued"]')
        if discontinued:
            # Does not include discontinued items
            return
        stock = hxs.select('//span[@class="inStock"]/strong/text()').extract()
        add_button = hxs.select('//input[contains(@class, "ajaxBuyButton")]')
        if stock:
            product_loader.add_value('stock', extract_price(stock[0]))
        elif add_button:
            product_loader.add_value('stock', 1)
        else:
            product_loader.add_value('stock', 0)
        price = hxs.select('//strong[@id="price_"]/text()').extract()[0]
        price = extract_price(price)
        if price < 50:
            product_loader.add_value('shipping_cost', 4.50)
        else:
            product_loader.add_value('shipping_cost', 0)
        product_loader.add_value('price', price)
        product_loader.add_value('image_url', image_url)
        yield product_loader.load_item()
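# `extract_price` is imported from elsewhere in this project; a minimal
# sketch, assuming it pulls the first number out of a price string:
import re
from decimal import Decimal

def extract_price(price):
    match = re.search(r'[\d.]*\d', price.replace(',', ''))
    return Decimal(match.group()) if match else Decimal(0)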
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    name = hxs.select('normalize-space(//*[@itemprop="name"]/text())').extract()[0]
    brand = 'Lego'
    try:
        image_url = urljoin_rfc(base_url, hxs.select('//div[@id="prod-media-player"]//img/@src').extract()[0].strip())
    except IndexError:
        image_url = ''
    options = hxs.select('//div[@id="prod-multi-product-types"]')
    if options:
        products = options.select('.//div[@class="product-type"]')
        for product in products:
            opt_name = product.select('.//h3/text()').extract()[0].strip()
            try:
                stock = product.select('//div[contains(@class, "mod-stock-availability")]//p/strong/text()').re(r'\d+')[0]
            except IndexError:
                stock = 0
            loader = ProductLoader(item=Product(), selector=product)
            sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
            if not sku:
                sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()
            if sku:
                loader.add_value('sku', sku[0].strip())
            loader.add_xpath('identifier', './/div[contains(@class, "mod-product-code")]/p/text()')
            loader.add_value('name', '%s %s' % (name, opt_name))
            loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
            loader.add_value('image_url', image_url)
            loader.add_value('brand', brand)
            loader.add_value('url', response.url)
            loader.add_xpath('price', './/p[@class="price"]/strong/text()')
            loader.add_value('stock', stock)
            yield loader.load_item()
    else:
        price = ''.join(hxs.select('//ul/li/strong[@class="price"]/text()').extract()).strip()
        if not price:
            price = ''.join(hxs.select('//span[@class="now-price"]/text()').extract()).strip()
        if not price:
            price = ''.join(hxs.select('//div[@id="prod-price"]//strong/text()').extract()).strip()
        try:
            stock = hxs.select('//div[contains(@class, "mod-stock-availability")]//p/strong/text()').re(r'\d+')[0]
        except IndexError:
            stock = 0
        loader = ProductLoader(item=Product(), response=response)
        sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model name")]/following-sibling::dd/text()').extract()
        if not sku:
            sku = hxs.select(u'//div[@id="prod-info-tab"]//dl/dt[contains(text(),"Model Number")]/following-sibling::dd/text()').extract()
        if sku:
            loader.add_value('sku', sku[0].strip())
        loader.add_xpath('identifier', '//div[@id="prod-product-code"]/p/text()')
        loader.add_value('name', name)
        loader.add_xpath('category', '//div[@id="breadcrumbs"]//li[@class="last"]/a/text()')
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('url', response.url)
        loader.add_value('price', price)
        loader.add_value('stock', stock)
        item = loader.load_item()
        if item.get('identifier'):
            yield item
def parse_mattel_product(self, response):
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    product_name = hxs.select('//div[@class="product-details"]/h2/text()').extract()
    if not product_name:
        return
    product_name = product_name[0]
    product_price = hxs.select('//*[@id="product-information"]//span[@class="promotion-now"]/text()').extract()[0]
    product_identifier = response.url.partition('productId=')[2]
    brand = 'Mattel'
    image_url = hxs.select('//*[@id="mainProductImage"]/@src').extract()
    category = response.meta.get('category')
    sku = hxs.select('//span[@class="item-number"]/text()').extract()
    sku = sku[0].replace('Item #: ', '')
    # pull the inline javascript "skus" object out of the page body
    a = re.search(r'skus: {\s+(.*)},\s+availableSizes', response.body, re.DOTALL | re.IGNORECASE)
    a = '{' + a.groups()[0].strip() + '}'
    a = a.replace("'", '"')
    lines = a.split('\n')
    result = ''
    # quote the field names so the javascript object parses as JSON
    for line in lines:
        if ': "' in line:
            for field in mattel_fields:
                if field + ':' in line:
                    result += line.replace(field, '"' + field + '"')
                    break
        else:
            result += line
    options = json.loads(result)
    for option_id, option in options.iteritems():
        loader = ProductLoader(response=response, item=Product())
        identifier = product_identifier + '_' + option_id
        loader.add_value('identifier', identifier)
        price = option.get('price').strip()
        if price == '':
            price = product_price
        price = extract_price(price)
        loader.add_value('price', price)
        loader.add_value('brand', brand)
        loader.add_value('sku', sku)
        loader.add_value('url', response.url)
        name = product_name
        if option.get('color').lower().strip() != 'one color':
            name += ', ' + option.get('color')
        if option.get('size').lower().strip() not in ['one size', 'one style']:
            name += ', ' + option.get('size')
        loader.add_value('name', name)
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('category', category)
        if price > 35:
            loader.add_value('shipping_cost', 0)
        yield loader.load_item()
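# `mattel_fields` is not defined in this snippet; a plausible definition,
# assuming it lists the unquoted keys of the inline "skus" javascript object
# (at minimum, the option fields the loop above reads):
mattel_fields = ['price', 'color', 'size']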
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = DoubanItem()
    item['url'] = hxs.select('//dt/a/@href').extract()
    item['name'] = hxs.select('//dt/a/img/@alt').extract()
    return item
def member_parser(self, response):
    """
    Given a single member's page and a response whose meta carries an
    already half-created Alum item, fill in the rest of the Alum data
    and yield the item back.
    """
    alum = response.request.meta['item']
    x = HtmlXPathSelector(response)

    def cell_pairs(panel_id):
        """Yield (field, value) pairs from the label/value rows of a panel."""
        for tr in x.select("//div[@id='%s']//tr" % panel_id):
            field = tr.select("td/text()")[0].extract()
            try:
                value = tr.select("td/text()")[1].extract()
            except IndexError:
                # probably a link in one of the fields
                value = tr.select("td")[1].select("a/text()").extract()
            yield field, value

    # note the stray space before the colon in the first label; it is
    # present in the page markup
    educ_fields = {
        "Wesleyan Degree School 1 :": 'wesleyan_degree_school_1',
        "Wesleyan Degree Year 1:": 'wesleyan_degree_year_1',
        "Wesleyan Degree 1:": 'wesleyan_degree_1',
        "Wesleyan Degree 1 Major 1:": 'wesleyan_degree_1_major_1',
        "Wesleyan Degree 1 Major 2:": 'wesleyan_degree_1_major_2',
        "Wesleyan Degree 1 Major 3:": 'wesleyan_degree_1_major_3',
    }
    member_fields = {
        "First Name:": 'first_name',
        "Nickname:": 'nickname',
        "Last Name at Graduation:": 'last_name_at_grad',
        "Last Name:": 'last_name',
        "Preferred Class Year:": 'preferred_class_year',
        "Preferred E-mail:": 'preferred_email',
    }
    employment_fields = {
        "Company Name:": 'company_name',
        "Position/Title:": 'position_title',
        "Position Status:": 'position_status',
        "Business Address 1:": 'business_address_1',
        "Business Address 2:": 'business_address_2',
        "Business Address City:": 'business_address_city',
        "Business Address State:": 'business_address_state',
        "Business Address Zip:": 'business_address_zip',
        "Business Address Country:": 'business_address_country',
        "Occupation:": 'occupation',
        "Industry:": 'industry',
    }
    panels = [
        ("cid_256$ctl00$ctl01$RichPanel_949_ContentDiv", educ_fields),       # education info
        ("cid_256$ctl00$ctl01$RichPanel_1670_ContentDiv", member_fields),    # member info
        ("cid_256$ctl00$ctl01$RichPanel_950_ContentDiv", employment_fields), # employment info
    ]
    for panel_id, field_map in panels:
        for field, value in cell_pairs(panel_id):
            if field in field_map:
                alum[field_map[field]] = value
    yield alum
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    item = AmazonItem()
    item['title'] = hxs.select('//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()').extract()
    item['brand'] = hxs.select('//a[@id="brand"]/text()').extract()
    item['specs'] = hxs.select('//div[@class="pdTab"][1]//node()').extract()
    item['offerprice'] = hxs.select('//span[@id="priceblock_ourprice"]/text()').extract()
    item['saleprice'] = hxs.select('//span[@id="priceblock_saleprice"]/text()').extract()
    item['description'] = hxs.select('//div[@id="productDescription"]//text()').extract()
    item['feature'] = hxs.select('//ul[@class="a-vertical a-spacing-none"]/li/span/text()').extract()
    item['image'] = hxs.select('//span[@class="a-button-text"]/img/@src').extract()
    item['link'] = response.meta["url"]
    item['seller'] = hxs.select('//div[@id="merchant-info"]/a[1]/text()').extract()
    item['sellrating'] = hxs.select('//div[@id="merchant-info"]/text()').extract()
    item['starating'] = hxs.select('//a[@class="a-link-normal"]/i/span/text()').extract()[0]
    item['COD'] = "Available"
    item['category'] = "Clothing & Accessories"
    item['subcategory'] = "Men's Innerwear"
    items.append(item)
    return items
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    for url in hxs.select('//ul[@class="dropdown"]//a/@href').extract():
        yield Request(urljoin(base_url, url), callback=self.parse_category)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for url in hxs.select(u'//ul[@class="mainnav"]/li/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse2)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # skip age categories
    for cat in hxs.select('//div[@class="fagbutik-kategoribilledeblok-container"]//a/@href').extract():
        yield Request(urljoin_rfc(get_base_url(response), cat), callback=self.parse_list)
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    if hxs.select('//a[@id="anchorUnavailable"]'):
        return
    sellers_url = 'http://www.rakuten.com/PR/SellerListingsAjax.aspx?sku=%s'
    name = hxs.select('//div[@id="product-title"]/h1/text()').extract()[0]
    # Take the largest number in the title as a candidate SKU; fall back to
    # the "Mfg Part#" table cell when it is missing or implausibly small.
    sku = 0
    for item in re.findall("\d+", name):
        if int(item) > sku:
            sku = int(item)
    if sku < 100:
        sku = ''.join(hxs.select('//th[contains(text(), "Mfg Part#")]/../td/text()').extract()).strip()
    brand = hxs.select('//th[contains(text(), "Manufacturer")]/../td/a/text()').extract()[0]
    category = hxs.select('//div[@class="product-breadcrumbs"]//a/text()').extract()[-1]
    image_url = hxs.select('//img[@id="productmain"]/@src').extract()
    identifier = hxs.select('//th[contains(text(), "SKU")]/../td/text()').extract()[0]
    price = hxs.select('//div[@class="main-price"]/span[@itemprop="price"]/text()').extract()
    price = price[0] if price else 0
    shipping = hxs.select('//div[@class="main-price"]/span[not(@itemprop="price")]/text()').extract()
    shipping = shipping[0] if shipping else 0
    sellers = hxs.select('//div[@id="seller-contact"]//a[@itemprop="seller"]')
    if sellers:
        # Marketplace listing: fetch the seller list via the AJAX endpoint,
        # carrying the product fields along in the request meta.
        yield Request(sellers_url % identifier, callback=self.parse_sellers,
                      meta={'name': name, 'brand': brand, 'category': category,
                            'identifier': identifier, 'sku': sku,
                            'image_url': image_url, 'url': response.url})
    else:
        l = ProductLoader(item=Product(), response=response)
        seller_name = hxs.select('//a[@id="anchorMarketplaceShipsFrom"]/text()').extract()
        seller_name = seller_name[0] if seller_name else ''
        if seller_name:
            l.add_value('identifier', identifier + '-' + seller_name)
        else:
            l.add_value('identifier', identifier)
        l.add_value('name', name)
        l.add_value('category', category)
        l.add_value('brand', brand)
        l.add_value('sku', sku)
        l.add_value('url', response.url)
        l.add_value('price', price)
        l.add_value('shipping_cost', shipping)
        l.add_value('image_url', image_url)
        l.add_value('dealer', 'Rak - ' + seller_name if seller_name else '')
        yield l.load_item()
def parse_item(self, response):
    print '> there is a new apk: ', response.url
    hxs = HtmlXPathSelector(response)
    i = BaidumarketItem()
    i['app_market'] = 'Baidu_Market'
    i['market_site'] = 'shouji.baidu.com'
    i['app_name'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/h1/span/text()').extract())
    i['app_keywords'] = "".join(hxs.select('//meta[@name="keywords"]/@content').extract())
    i['app_url'] = response.url
    i['app_icon_url'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[1]/div/img/@src').extract())
    # the [3:] slices off the three-character label prefix in these spans
    i['app_size'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[1]/text()').extract())[3:]
    i['app_version'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[2]/text()').extract())[3:]
    i['download_times'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/span[3]/text()').extract())
    i['download_url'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[4]/a/@href').extract())
    # the original XPath began with an invalid triple slash ('///div...')
    i['app_author'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[2]/div/span[2]/span[2]/text()').extract())
    i['os_version'] = "".join(hxs.select('//div[@class="com"]/div[@class="info-top"]/dl/dd[@class="info-params"]/table/tbody/tr/td/span[@class="params-platform"]/text()').extract())
    i['app_description'] = "".join(hxs.select('//div[@id="doc"]/div[2]/div/div[2]/div[3]/div[2]/div[2]/p/text()').extract())
    i['last_update_date'] = "".join(hxs.select('//div[@class="com"]/div[@class="info-top"]/dl/dd[@class="info-params"]/table/tbody/tr/td/span[@class="params-updatetime"]/text()').extract())
    i['app_class'] = "".join(hxs.select('//div[@class="content-main-border content-intro"]/div[@class="data-tabcon params-con"]/table/tbody/tr/td/span[@class="params-catename"]/text()').extract())
    # the star rating is encoded as a CSS width percentage in the style
    # attribute; chars [6:8] are the two percentage digits
    i['user_rate'] = str(int(round(float("".join(response.xpath('//div[@id="doc"]/div[2]/div/div[1]/div/div[2]/div[1]/span[1]/span/@style').extract())[6:8]))/10))
    i['comments_num'] = "0"
    return i
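Every field above repeats the same join-and-extract pattern. A tiny helper (illustrative, not in the original spider) makes the callback much less noisy:

# Hypothetical helper for the repeated "".join(select(...).extract()) pattern.
def joined_text(hxs, xpath):
    """Return the concatenated text of every node matching the XPath."""
    return "".join(hxs.select(xpath).extract())

# e.g. i['app_keywords'] = joined_text(hxs, '//meta[@name="keywords"]/@content')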
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    # The variant data is embedded in inline JavaScript as JSON literals.
    childMap = json.loads(re.search('\'childMap\': (.*),', response.body).group(1))
    prices = json.loads(re.search('\'prices\': (.*),', response.body).group(1))
    skus = json.loads(re.search('\'skus\': (.*),', response.body).group(1))
    stockStatuses = json.loads(re.search('\'stockStatuses\': (.*),', response.body).group(1))
    # Collect the (value, label) pairs of every option dropdown.
    selects = []
    for sel in hxs.select('//div[@class="product-options"]//select'):
        s = []
        for opt in sel.select('.//option'):
            if opt.select('./@value').extract()[0]:
                s.append((opt.select('./@value').extract()[0],
                          opt.select('./text()').extract()[0]))
        if s:
            selects.append(s)
    if not selects:
        # No dropdowns: treat the page as a single implicit option.
        selects = [[('', ''), ('%', '')]]
    for k, v in list(childMap.items()):
        if '_%' in k:
            childMap[k.replace('_%', '')] = v
    found = False
    # Every combination of option values maps to a child product code.
    for c in itertools.product(*selects):
        key = [x[0] for x in c]
        name = [x[1] for x in c]
        code = childMap.get('_'.join(key))
        if not code:
            continue
        code = str(code)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@itemprop="name"]/text()')
        loader.add_value('name', name)
        loader.add_value('sku', skus[code])
        loader.add_value('identifier', skus[code])
        loader.add_value('price', prices[code][0]['purchase'])
        loader.add_value('url', response.url)
        loader.add_value('brand', 'Le Creuset')
        if 'In stock' in stockStatuses.get(code, ''):
            loader.add_value('stock', '1')
        else:
            loader.add_value('stock', '0')
        if loader.get_output_value('price') < 45:
            loader.add_value('shipping_cost', '4.95')
        else:
            loader.add_value('shipping_cost', '0')
        loader.add_xpath('category', '//div[@class="crumbs"]/a[position()>2]/text()')
        image_url = hxs.select('//div[@id="product-image"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(get_base_url(response), image_url[0]))
        item = loader.load_item()
        item['metadata'] = LeCreusetMeta()
        found = True
        yield item
    if not found:
        self.log('No products on %s' % response.url)
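The loop above leans on itertools.product to enumerate every dropdown combination and build the childMap key. A standalone illustration of just that key-building step, with made-up option data:

import itertools

# Illustration of the key construction used above (option data is made up).
colours = [('3', 'Volcanic'), ('7', 'Marseille Blue')]
sizes = [('20', '20cm'), ('24', '24cm')]
for combo in itertools.product(colours, sizes):
    key = '_'.join(value for value, label in combo)    # e.g. '3_20'
    label = ' '.join(label for value, label in combo)  # e.g. 'Volcanic 20cm'
    # childMap.get(key) would map this combination to a child product code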
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    category = hxs.select('//div[@id="bCrumb"]/span/a/text()').extract()
    category = category[-1] if category else response.meta.get('category', '')

    def form_data():
        """Collect every input of the main form into a formdata dict
        (the original repeated this loop in both dropdown branches)."""
        data = {}
        for input in hxs.select('//form[@id="frmMain"]//input'):
            name = ''.join(input.select('@name').extract())
            value = ''.join(input.select('@value').extract())
            data[name] = value
        return data

    # If the colour dropdown has options but none is selected yet,
    # re-post the form once per colour and re-enter this callback.
    colours = hxs.select('//select[@id="cphMain_ddlColour"]/option[@value!="0"]/@value').extract()
    no_option_selected = hxs.select('//select[@id="cphMain_ddlColour"]/option[@value="0" and @selected]/@value')
    if colours and no_option_selected:
        form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
        for colour in colours:
            formdata = form_data()
            formdata['ctl00$cphMain$ddlColour'] = colour
            yield FormRequest(form_url, dont_filter=True, method='POST',
                              formdata=formdata, callback=self.parse_product,
                              meta={'category': category, 'colour': colour})
        return

    # Same dance for the size dropdown, carrying the chosen colour along.
    sizes = hxs.select('//select[@id="cphMain_ddlSize"]/option[@value!="0"]/@value').extract()
    no_option_selected = hxs.select('//select[@id="cphMain_ddlSize"]/option[@value="0" and @selected]')
    if sizes and no_option_selected:
        form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
        for size in sizes:
            formdata = form_data()
            formdata['ctl00$cphMain$ddlSize'] = size
            colour = response.meta.get('colour', None)
            if colour:
                formdata['ctl00$cphMain$ddlColour'] = colour
            yield FormRequest(form_url, dont_filter=True, method='POST',
                              formdata=formdata, callback=self.parse_product,
                              meta={'category': category, 'formdata': formdata})
        return

    loader = ProductLoader(item=Product(), selector=hxs)
    identifier = hxs.select('//div[@class="code"]/text()').extract()[0]
    loader.add_xpath('sku', '//div[@class="code"]/text()')
    loader.add_value('url', response.url)
    product_name = hxs.select('//div[@class="title"]//h1/text()').extract()[0]
    colour = hxs.select('//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
    if colour:
        product_name = product_name + ' - ' + colour[0].strip()
    loader.add_value('category', category)
    img = hxs.select('//img[@id="cphMain_imgThumb"]/@src').extract()
    if img:
        loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
    loader.add_xpath('brand', '//span[@class="brand"]/text()')
    loader.add_value('stock', '1')
    # NB: the shipping threshold is evaluated before 'price' is loaded, so
    # get_output_value('price') is still empty here; kept as in the original.
    if loader.get_output_value('price') < 50.00:
        loader.add_value('shipping_cost', '4.95')
    else:
        loader.add_value('shipping_cost', '0')
    price = hxs.select('//span[@class="price"]/text()').extract()
    if colours or sizes:
        colour = hxs.select('//select[@id="cphMain_ddlColour"]/option[@selected and @value!="0"]')
        option_price = None
        if colour:
            colour_id = colour.select('@value').extract()[0]
            colour_desc = colour.select('text()').extract()[0]
            identifier = identifier + '-' + colour_id
            product_name = product_name + ' - ' + colour_desc.split(u' - \xa3')[0].strip()
            option_price = re.search(r"\xa3(\d+.\d+)", colour_desc)
        size = hxs.select('//select[@id="cphMain_ddlSize"]/option[@selected and @value!="0"]')
        if size:
            size_id = size.select('@value').extract()[0]
            size_desc = size.select('text()').extract()[0].strip()
            identifier = identifier + '-' + size_id
            colour = hxs.select('//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
            product_name = product_name + ' - ' + size_desc
        loader.add_value('identifier', identifier)
        loader.add_value('name', product_name.replace(' - Collect Only', ''))
        if option_price:
            loader.add_value('price', option_price.group(1))
        else:
            loader.add_value('price', price)
    else:
        loader.add_value('identifier', identifier)
        loader.add_value('name', product_name.replace(' - Collect Only', ''))
        loader.add_value('price', price)
    yield loader.load_item()
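The re-entry pattern above (post the form once per dropdown value, with parse_product as its own callback) is the usual way to enumerate ASP.NET-style option postbacks. A stripped-down sketch of just that loop, with the select id and field name as parameters; the helper name is made up:

# Hypothetical generic version of the per-option postback loop above.
def repost_per_option(self, response, select_id, field_name):
    hxs = HtmlXPathSelector(response)
    form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
    for value in hxs.select('//select[@id="%s"]/option[@value!="0"]/@value' % select_id).extract():
        # carry every form input (incl. __VIEWSTATE) into the postback
        formdata = dict(
            (''.join(inp.select('@name').extract()), ''.join(inp.select('@value').extract()))
            for inp in hxs.select('//form[@id="frmMain"]//input'))
        formdata[field_name] = value
        yield FormRequest(form_url, formdata=formdata, dont_filter=True,
                          callback=self.parse_product, meta={'option': value})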
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    for url in hxs.select(u'//nav[@id="main-nav"]/ul/li/a/@href').extract():
        yield Request(url, callback=self.parse_cats)
def parse(self, response):
    printable = {}
    year = response.url.split("/")[-1]
    printable['year'] = year
    fPath = open(self.outDir + year + "/path.txt", "w")
    numFile = 0
    ptr = HtmlXPathSelector(response)
    # Find the first h2 with a span child (the first real section header).
    for root in ptr.select('//h2'):
        if len(root.select('span')) > 0:
            break
    # Walk the sibling headers and lists, keeping only the "Events" section.
    topics = root.select('../h3|../h4|../h2|../ul')
    flag = 0
    topic_name = ''
    for topic in topics:
        node_name = str(topic.select('name()').extract()[0])
        if node_name == 'h2':
            for tt in topic.select('span/text()').extract():
                topic_name = str(tt)
                if topic_name == 'Events' or topic_name == 'Events and trends':
                    break
            if (topic_name != 'Events' and topic_name != 'Events and trends') and flag == 0:
                continue
            if flag:
                # a second h2 means we have left the Events section
                break
            flag = 1
            topic_name = ''
            continue
        if flag == 0:
            continue
        if node_name == 'h3' or node_name == 'h4':
            for tt in topic.select('span/text()').extract():
                topic_name = str(tt.encode('utf8', 'ignore'))
                if len(topic_name) > 2:
                    break
            continue
        printable['header'] = topic_name
        lis = topic.select('li')
        for ind, li in enumerate(lis):
            # Flatten nested lists by splicing the child items in after the
            # parent (note: inserting each at ind + 1 leaves the children in
            # reverse order; kept as in the original).
            if len(li.select('ul/li')) > 0:
                for tmp in li.select('ul/li'):
                    lis.insert(ind + 1, tmp)
                continue
            htmlText = li.extract().encode('utf8', 'ignore')
            matchRe = re.compile("<div class=.*>.*</div>", re.DOTALL)
            clnhtmlText = matchRe.sub("", htmlText)
            hxs = HtmlXPathSelector(text=clnhtmlText)
            desc = hxs.select('descendant::text()').extract()
            text = ''
            for tt in desc:
                content = tt.encode('utf8', 'ignore')
                if str(content) != "\n":
                    text += str(content).rstrip("\n")
            text = text.rstrip('\n')
            text = re.sub(r'\[[0-9]+\]', r' ', text)
            text = re.sub(r'\[?citation needed\]?\.?', r'', text)
            printable['description'] = text
            # collect (href, anchor text) pairs for links whose text
            # actually appears in the description
            linked_entity = []
            for link in li.select('./a'):
                try:
                    t = link.select('text()').extract()[0].encode('utf8', 'ignore')
                    if t in text:
                        key = link.select('@href').extract()[0].encode('utf8', 'ignore')
                        linked_entity.append((key, t))
                except:
                    pass
            printable['urls'] = linked_entity
            clnhtmlText = clnhtmlText.strip()
            if clnhtmlText.startswith("<li>"):
                clnhtmlText = clnhtmlText[4:]
            if clnhtmlText.endswith("</li>"):
                clnhtmlText = clnhtmlText[:-5]
            clnhtmlText = re.sub(r'<sup id=".*" class="reference">.*</sup>', r' ', clnhtmlText)
            clnhtmlText = re.sub(r'<sup class="Template-Fact" style=".*">.*</sup>', r' ', clnhtmlText)
            clnhtmlText = re.sub(r'\[?citation needed\]?\.?', r'', clnhtmlText)
            printable['html_fragment'] = clnhtmlText.strip()
            fout = open(self.outDir + year + "/" + str(numFile) + ".txt", "w")
            fJson = open(self.outDir + year + "/" + str(numFile) + ".json", "w")
            print >> fJson, json.dumps(printable)
            print >> fout, printable['description']
            print >> fPath, self.outDir + year + "/" + str(numFile) + ".txt"
            fout.close()
            fJson.close()
            numFile += 1
    fPath.close()  # the original said "fPath.close" without the call parentheses
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_xpath('name', u'//h1/text()')
    # inc. vat
    product_loader.add_xpath('price', u'//span[@class="price"]/text()')
    product_loader.add_xpath('category', u'//div[@id="crumb"]/a[2]/text()')
    img = hxs.select(u'//img[@id="productImage"]/@src').extract()[0]
    product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), img))
    product_loader.add_xpath('sku', u'//span[@id="divskucode1"]/text()')
    product_loader.add_xpath('identifier', u'//span[@id="divskucode1"]/text()')
    product_loader.add_xpath('sku', u'//input[@name="pf_id"]/@value')
    product_loader.add_xpath('identifier', u'//input[@name="pf_id"]/@value')
    product_loader.add_xpath('brand', 'substring-after(//div[@class="brandlogo"]/a/img/@alt, "View All Products From ")')
    # Options are defined in inline JavaScript as "new seldata(...)" calls;
    # munge each call's arguments into a Python tuple and evaluate it.
    options = []
    for line in response.body.split('\n'):
        if 'new seldata(' in line:
            for part in line.split('new seldata(')[1:]:
                part = part.split(');')[0].replace('new Array', '').replace(')', ',)')
                try:
                    data = eval('(' + part + ')')
                except:
                    # no options
                    break
                self.log(part)
                options.append((data[0], data[2], data[-2]))
    product = product_loader.load_item()
    if 'price' not in product:
        product['price'] = None
    if options:
        for opt in options:
            prod = Product(product)
            prod['name'] = prod['name'] + ' ' + ' '.join(opt[0])
            prod['sku'] = opt[1]
            prod['price'] = Decimal(opt[2].split('price=')[1].split('&')[0].replace(',', ''))
            prod['identifier'] = opt[1]
            yield prod
    elif hxs.select('//div[@class="prodOPTIONS"]'):
        # e.g. http://www.axminster.co.uk/axminster-veritas-additional-mounting-plates-with-rod-for-veritas-carvers-vice-prod821439/
        for opt in hxs.select('//div[@class="prodOPTIONS"]'):
            prod = Product(product)
            prod['name'] = prod['name'] + ' ' + opt.select(u'normalize-space(.//div[@class="option"]/text())').extract()[0]
            prod['sku'] = opt.select(u'.//input[@name="sku"]/@value').extract()[0]
            prod['identifier'] = opt.select(u'.//input[@name="sku"]/@value').extract()[0]
            prod['price'] = extract_price(opt.select(u'.//span[@class="price"]/text()').extract()[0])
            yield prod
    else:
        yield product
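The eval() on the munged seldata(...) fragments works, but it executes arbitrary text from the page. ast.literal_eval accepts the same tuple-of-literals syntax and refuses anything else; this is a sketch under the assumption that the fragments really do reduce to plain literals after the replacements above:

import ast

# Safer stand-in for the eval('(' + part + ')') call above, assuming the
# cleaned fragment contains only literals (strings, numbers, nested tuples).
def parse_seldata(part):
    cleaned = part.split(');')[0].replace('new Array', '').replace(')', ',)')
    try:
        return ast.literal_eval('(' + cleaned + ')')
    except (ValueError, SyntaxError):
        return None  # no options on this line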
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    tmp = hxs.select('//div[@id="col2"]//input[contains(@id,"product_minidetail_")]/@value').extract()
    if tmp:
        loader.add_value('identifier', tmp[0])
        loader.add_value('sku', tmp[0])
    else:
        log.msg('### No product ID at ' + response.url, level=log.INFO)
        return
    # name
    name = ''
    tmp = hxs.select('//div[@id="col2"]//h1[@class="titre"]/text()').extract()
    if tmp:
        name = tmp[0].strip()
        loader.add_value('name', name)
    else:
        log.msg('### No name at ' + response.url, level=log.INFO)
    # price
    price = 0
    stock = 0
    tmp = hxs.select('//div[@id="col2"]//span[@class="prix"]/text()').extract()
    if tmp:
        price = extract_price(tmp[0].strip())
        loader.add_value('price', price)
    # stock: in stock iff an "Add to basket" button is present
    tmp = hxs.select('//div[@id="col2"]//span[text()="Add to basket"]')
    if tmp:
        stock = 1
    loader.add_value('stock', stock)
    # image_url
    tmp = hxs.select('//div[@id="col1"]//div[contains(@class,"product")]/img/@src').extract()
    if tmp:
        loader.add_value('image_url', urljoin(response.url, tmp[0].strip()))
    # brand
    tmp = hxs.select('//div[@id="col2"]//td[@class="catName"]/a/text()').extract()
    if tmp:
        loader.add_value('brand', tmp[0].upper())
    # category
    tmp = hxs.select('//div[@id="breadcrumb"]/h2/a/text()').extract()
    if tmp:
        for s in tmp:
            loader.add_value('category', s)
    # shipping_cost
    if price <= 26:
        loader.add_value('shipping_cost', 3.6)
    product = loader.load_item()
    # promotions metadata comes from the "picto-*" badge images
    metadata = YMeta()
    tmp = hxs.select('//div[@id="col2"]//div[@class="promo"]/img/@alt').extract()
    if tmp:
        metadata['promotions'] = ','.join(s.replace('picto-', '') for s in tmp)
    if metadata:
        product['metadata'] = metadata
    return product
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    for url in hxs.select('//ul[@id="category-navigation"]//a/@href').extract():
        yield Request(urljoin_rfc(base_url, url), callback=self.parse_category)
def parse_form_result(self, response):
    qs = parse_qs(response.request.body)
    legislature, chamber = qs['Legislatura'][0], qs['Cuerpo'][0]
    hxs = HtmlXPathSelector(response)
    # The page lists one <a> (diary link), one <table> (assistance text) and
    # one <script> (session metadata) per session; zip them up positionally.
    a_selector = '//center/a'
    table_selector = '//center/a/following-sibling::table'
    script_selector = '//script[contains(text(), "innerHTML = \'Procesando Sesiones")]'
    z = (hxs.select(selector) for selector in (a_selector, table_selector, script_selector))
    for a, table, script in zip(*z):
        session, session_date = script.select('text()').re('(\d+) del (\d{2}/\d{2}/\d{4})')
        session_diary = a.select('@href').extract()
        session_diary = urljoin(response.url, session_diary[0])
        if session_diary.startswith('javascript:'):
            session_diary = None
        textnodes = table.select('tr/td/div').extract()[0].replace('<br>', '\n').splitlines()
        # First pass: collect the numbered footnotes.
        notes_dict, notes_re = {}, re.compile('<b>\((\d+)\)</b> (.*)')
        for textnode in textnodes:
            match = notes_re.match(textnode)
            if match:
                notes_dict[match.group(1)] = match.group(2)
        # Second pass: classify each legislator by attendance status.
        for textnode in textnodes:
            search = {
                'present': u'Asisten los se\xf1ores (?:Senadores|Representantes): (.*)\.',
                'absent_w_warn': u'Faltan? con aviso: (.*)\.',
                'absent_wo_warn': u'Faltan? sin aviso: (.*)\.',
                'on_vacation': u'Con licencia: (.*)\.',
            }
            for status, pattern in search.items():
                match = re.search(pattern, textnode)
                if match:
                    asistees = parse_nlp_list(match.group(1))
                    for asistee in (asistee.strip() for asistee in asistees):
                        # Strip footnote markers like " <b>(1)</b>" and keep
                        # the matching notes. (Renamed from the original's
                        # notes_re, which shadowed the footnote regex above.)
                        notes, marker_re = None, re.compile(' <b>\((\d+)\)</b>')
                        if marker_re.search(asistee):
                            notes = [notes_dict[note_n] for note_n in marker_re.findall(asistee)]
                            asistee = marker_re.sub('', asistee)
                        assistance_item = AssistanceItem(
                            legislature=legislature,
                            chamber=chamber,
                            session=session,
                            session_date=session_date,
                            session_diary=session_diary,
                            asistee=asistee,
                            status=status,
                        )
                        if notes:
                            # item-style assignment: attribute assignment
                            # raises AttributeError on Scrapy Items
                            assistance_item['notes'] = [AssistanceNote(note=note) for note in notes]
                        yield assistance_item
def parse(self, response):
    items = []
    # extract site domain
    site = Utils.extract_domain(response.url)
    if not site:
        return items

    # handle staples televisions
    if site == 'staples':
        # A selenium-based version (filling the zipcode form and paging with
        # driver.find_element_by_xpath) used to live here; setting the
        # zipcode cookie directly makes it unnecessary.
        zipcode = "12345"
        return Request(response.url, callback=self.parsePage_staples,
                       cookies={"zipcode": zipcode},
                       headers={"Cookie": "zipcode=" + zipcode},
                       meta={"dont_redirect": True, "dont_merge_cookies": True})

    # handle bloomingdales sneakers
    if site == 'bloomingdales':
        driver = webdriver.Firefox()
        driver.get(response.url)
        # use selenium to select USD currency
        link = driver.find_element_by_xpath("//li[@id='bl_nav_account_flag']//a")
        link.click()
        time.sleep(5)
        button = driver.find_element_by_id("iShip_shipToUS")
        button.click()
        time.sleep(10)
        # wrap the selenium page source in a TextResponse (a hack borrowed
        # from the Scrapy module) so the usual parsing code can run on it
        text_html = driver.page_source.encode('utf-8')
        html_str = str(text_html)
        resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)
        # parse first page with parsePage_bloomingdales function
        items += self.parsePage_bloomingdales(resp_for_scrapy)
        hxs = HtmlXPathSelector(resp_for_scrapy)
        # while there is a next page, fetch it and pass it to parsePage_bloomingdales
        next_page_url = hxs.select("//li[@class='nextArrow']//a")
        while next_page_url:
            # use selenium to click the next-page arrow and grab the result
            next = driver.find_element_by_xpath("//li[@class='nextArrow']//a")
            next.click()
            time.sleep(5)
            text_html = driver.page_source.encode('utf-8')
            html_str = str(text_html)
            resp_for_scrapy = TextResponse('none', 200, {}, html_str, [], None)
            items += self.parsePage_bloomingdales(resp_for_scrapy)
            hxs = HtmlXPathSelector(resp_for_scrapy)
            next_page_url = hxs.select("//li[@class='nextArrow']//a")
        driver.close()
        return items

    # works for both product list pages and higher-level pages whose left
    # side menu links to the product list page
    if site == 'walmart':
        hxs = HtmlXPathSelector(response)
        # if this page branches into further subcategories, follow the
        # "See all..." link (note the trailing space in the div class;
        # other pages may not have it)
        seeall = hxs.select("//div[@class='CustomSecondaryNav ']//li[last()]/a/@href").extract()
        if seeall:
            page_url = "http://www.walmart.com" + seeall[0]
            # send the page to parsePage and extract product URLs
            return Request(page_url, callback=self.parsePage_walmart)
        else:
            # no link to a product list page; try to parse this page as one
            return Request(response.url, callback=self.parsePage_walmart)

    if site == 'amazon':
        hxs = HtmlXPathSelector(response)
        # select first "see more" list ("All Televisions")
        seeall = hxs.select("//p[@class='seeMore'][1]/a/@href").extract()
        root_url = "http://www.amazon.com"
        # if the "see all" link exists, follow it; otherwise try to parse
        # the current page as a product list page
        if seeall:
            return Request(root_url + seeall[0], callback=self.parsePage_amazon)
        else:
            return Request(response.url, callback=self.parsePage_amazon)

    if site == 'bestbuy':
        hxs = HtmlXPathSelector(response)
        # try to find the "See all..." page URL for subcategory pages
        seeall_list = hxs.select("//ul[@class='search']")
        if seeall_list:
            seeall = seeall_list[0].select("li[1]/a/@href").extract()
            if seeall:
                page_url = "http://www.bestbuy.com" + seeall[0]
                return Request(page_url, callback=self.parsePage_bestbuy)
            else:
                return Request(response.url, callback=self.parsePage_bestbuy)
        else:
            return Request(response.url, callback=self.parsePage_bestbuy)

    if site == 'nordstrom':
        return Request(response.url, callback=self.parsePage_nordstrom)

    if site == 'macys':
        m = re.match("http://www1.macys.com/shop(.*)\?id=([0-9]+).*", self.cat_page)
        cat_id = 0
        if m:
            cat_id = int(m.group(2))
        productids_request = ("http://www1.macys.com/catalog/category/facetedmeta"
                              "?edge=hybrid&categoryId=%d&pageIndex=1&sortBy=ORIGINAL&productsPerPage=40&" % cat_id)
        return Request(productids_request, callback=self.parse_macys,
                       headers={"Cookie": "shippingCountry=US"},
                       meta={'dont_merge_cookies': True, "cat_id": cat_id, "page_nr": 1})

    if site == 'williams-sonoma':
        return Request(url=self.cat_page, callback=self.parsePage_sonoma)

    # TODO: is the hardcoded product-number list ok for all pages? It came
    # from the laptops category request but seems to work for the others too.
    if site == 'overstock':
        return Request(url=self.cat_page + "&index=1&count=25&products=7516115,6519070,7516111,7646312,7382330,7626684,8086492,8233094,7646360,8135172,6691004,8022278&infinite=true",
                       callback=self.parsePage_overstock,
                       headers={"Referer": self.cat_page + "&page=2", "X-Requested-With": "XMLHttpRequest"},
                       meta={"index": 1})

    if site == 'newegg':
        return Request(url=self.cat_page, callback=self.parsePage_newegg, meta={'page': 1})

    if site == 'tigerdirect':
        # meta carries the page number and the base URL to append it to
        return Request(url=self.cat_page, callback=self.parsePage_tigerdirect,
                       meta={'page': 1, 'base_url': self.cat_page})
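The bloomingdales branch rebuilds a Scrapy response from driver.page_source by hand, twice. A helper along these lines removes the duplication; it uses the same positional TextResponse constructor the code above already relies on, and the helper name is illustrative:

from scrapy.http import TextResponse

# Hypothetical helper wrapping the "TextResponse hack" used twice above.
def response_from_driver(driver):
    """Wrap the current selenium page in a TextResponse so XPath helpers work."""
    html_str = driver.page_source.encode('utf-8')
    return TextResponse(driver.current_url, 200, {}, html_str, [], None)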
def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    # the full name of the tyre (name variable) is used to extract metadata
    # (i.e. run flat, XL); the pattern should be set as the product's name
    name = hxs.select('//td[@class="tread"]/text()').extract()
    if not name:
        self.log("[ERROR] No name found on page: %s" % response.url)
        return
    loader.add_value('name', name[0])
    brand = hxs.select('//table[@class="single searchresults"]//td[@class="tyreinfo"]/b/text()').extract()[0].strip()
    loader.add_value('brand', unify_brand(brand))
    loader.add_value('category', find_brand_segment(brand))
    fitting_method = 'Delivered'
    loader.add_value('url', response.url)
    out_of_stock = hxs.select('//table[@class="single searchresults"]//span[@class="outofstock"]')
    if out_of_stock:
        loader.add_value('stock', 0)
    image_url = hxs.select('//table[@class="single searchresults"]//td[@class="logo-pic"]/img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))
    identifier = hxs.select('//table[@class="single searchresults"]//form/input[@name="pid"]/@value')[0].extract()
    loader.add_value('identifier', identifier)
    price = hxs.select('//table[@class="single searchresults"]//td[@class="netprice"]/text()')[0].extract()
    loader.add_value('price', price)
    name = hxs.select('//table[@class="single searchresults"]//td[@class="tyreinfo"]/span/text()')[0].extract()
    data = parse_pattern(name)
    if not data:
        log.msg('ERROR parsing "{}" [{}]'.format(name, response.url))
        return
    metadata = MicheldeverMeta()
    metadata['aspect_ratio'] = data['Aspect_Ratio']
    metadata['rim'] = data['Rim']
    metadata['speed_rating'] = data['Speed_Rating']
    metadata['width'] = data['Width']
    metadata['fitting_method'] = fitting_method
    metadata['load_rating'] = data['Load_Rating']
    metadata['alternative_speed_rating'] = ''
    metadata['xl'] = 'Yes' if 'XL' in name else 'No'
    run_flat = 'rflat' in name.lower() or is_run_flat(loader.get_output_value('name') + ' ' + name)
    metadata['run_flat'] = 'Yes' if run_flat else 'No'
    if '*' in name:
        manufacturer_mark = '*'
    else:
        manufacturer_mark = [mark for mark in self.all_man_marks.keys() if mark in name.split(' ')]
        manufacturer_mark = manufacturer_mark[0].strip() if manufacturer_mark else ''
    metadata['manufacturer_mark'] = self.all_man_marks.get(manufacturer_mark, '') if manufacturer_mark else ''
    metadata['mts_stock_code'] = ''
    metadata['full_tyre_size'] = '/'.join((metadata['width'], metadata['aspect_ratio'],
                                           metadata['rim'], metadata['load_rating'],
                                           metadata['speed_rating']))
    # EU tyre label values are encoded in the label image names and text
    fuel = hxs.select('//div[@class="eulabels"]/div/img/@src').re(r'fuel-(\w)')
    grip = hxs.select('//div[@class="eulabels"]/div/img/@src').re(r'grip-(\w)')
    noise = hxs.select('//div[@class="eulabels"]/div[contains(@class, "noise")]/strong/text()').extract()
    metadata['fuel'] = fuel[0].upper() if fuel else ''
    metadata['grip'] = grip[0].upper() if grip else ''
    metadata['noise'] = noise[0].upper() if noise else ''
    product = loader.load_item()
    product['metadata'] = metadata
    if not is_product_correct(product):
        return
    product['metadata']['mts_stock_code'] = find_mts_stock_code(product, spider_name=self.name, log=self.log)
    yield product