def parse_ann_family(self, response):
    """Scrape one Ann Taylor product page and store the product.

    Reads the title, product URL, price data, image URL and description
    from ``response``, passes them to ``self._create_product_item`` and,
    when that call reports a newly created product, saves the categories
    returned by ``simple_product_categorization`` on it.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when no title is found or the item id is already
        in ``self.items_scraped``; otherwise ``(True, product)``.
    """
    hxs = HtmlXPathSelector(response)

    # find name of item
    item_name_path = hxs.select('//div[@class="hd-info"]//h1/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        print("Invalid link: " + str(response.url))
        return (False, None)
    item_name = item_name_path.extract()[0]
    # Encode explicitly: str() on a unicode title containing non-ASCII
    # characters raises UnicodeEncodeError on Python 2.
    name_for_log = item_name.encode('utf-8')
    logging.critical("Name: " + name_for_log)
    self.count_scraped += 1

    # Prefer the og:url meta tag; fall back to the fetched URL when the tag
    # is missing instead of failing on an empty extract().
    meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
    if len(meta_tag_url) > 0:
        prod_url = meta_tag_url.extract()[0]
    else:
        prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " ITEM_NAME " + name_for_log +
        " TOTAL SO FAR " + str(self.count_scraped))

    # Ann Taylor is for women only
    gender = 'F'

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.items_scraped:
        logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
        return (False, None)
    self.items_scraped.append(item_id_)

    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))
    if price_ > sale_price_:
        logging.critical(
            "SALE on ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
            " SALE PRICE " + str(sale_price_))

    # extract image URL; an absent image yields "" rather than an IndexError
    prod_img_path = hxs.select('//img[@id="productImage"]/@src')
    if len(prod_img_path) > 0:
        prod_img_url = str(prod_img_path.extract()[0])
        logging.critical("Image URL: " + str(prod_img_url))
    else:
        prod_img_url = ""

    # find description and keywords: these will be useful in categorization
    desc = hxs.select(
        '//div[@class="gu gu-first description"]/p/text()').extract()
    prod_desc = ''.join(desc)
    logging.critical("Description: " + prod_desc)

    # promo text: none is extracted for this page type
    promo_str = ""

    product, created_new = self._create_product_item(
        item_name, item_id_, str(prod_url), price_, sale_price_, gender,
        str(prod_img_url), promo_str, prod_desc)

    if created_new:
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_pinterest(self, response):
    """Scrape a product page using its ``<title>`` as the item name.

    The title is cut at the first ``|``. Gender defaults to ``'F'``; for
    bananarepublic.com and gap.com URLs it is derived from the ``alt`` text
    of the image whose class contains ``_selected``.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when the page has no title or the item id is
        already in ``self.items_scraped``; otherwise ``(True, product)``,
        where ``product`` is whatever ``self._create_product_item`` returned.
    """
    hxs = HtmlXPathSelector(response)

    # find name of item
    item_name_path = hxs.select('//title/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        print("Invalid link: " + str(response.url))
        return (False, None)
    # keep only the part of the title before the first '|'
    item_name = item_name_path.extract()[0].partition('|')[0]
    logging.critical("Name: " + item_name.encode('utf-8'))
    self.count_scraped += 1

    prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
        item_name.encode('utf-8') + " TOTAL SO FAR " +
        str(self.count_scraped))

    gender = 'F'
    if "www.bananarepublic.com" in prod_url or 'www.gap.com' in prod_url:
        gender_path = hxs.select(
            '//a/img[contains (@class, "_selected")]/@alt')
        if len(gender_path) > 0:
            selected_alt = gender_path.extract()[0].lower()
            # "women" contains "men", so rule out the female labels first;
            # a plain substring test for 'men' would tag women's items 'M'.
            is_female = 'women' in selected_alt or 'girl' in selected_alt
            if not is_female and (
                    'men' in selected_alt or 'boy' in selected_alt):
                gender = 'M'
    logging.critical("GENDER: " + gender)

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.items_scraped:
        logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
        return (False, None)
    self.items_scraped.append(item_id_)

    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))
    if price_ > sale_price_:
        logging.critical(
            "SALE on ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
            " SALE PRICE " + str(sale_price_))

    # extract image URL
    prod_img_path = hxs.select('//img[@id="productImage"]/@src')
    if len(prod_img_path) > 0:
        prod_img_url = str(prod_img_path.extract()[0])
        logging.critical("Image URL: " + str(prod_img_url))
    else:
        prod_img_url = ""

    # no description or promo text is extracted for these pages
    prod_desc = ''
    logging.critical("Description: " + prod_desc)
    promo_str = ""

    product, created_new = self._create_product_item(
        item_name, item_id_, str(prod_url), price_, sale_price_, gender,
        str(prod_img_url), promo_str, prod_desc)

    if product is None:
        logging.critical(
            "Product is None----SHOULDN'T HAPPEN!!!!!******************")

    # Categorize only when there is an object to write to; a None product
    # has already been reported above.
    if created_new and product is not None:
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_product(self, response):
    """Scrape a product page whose title lives in ``div#product_info/h2``.

    Gender is taken from the ``h3.s_here.open`` headings ('boy' -> 'M',
    'girl' -> 'F', otherwise 'Nil'). The image comes from the ``og:image``
    meta tag and the description from the first paragraph of the tab
    content.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when no title is found, ``(False, product)`` when
        ``self._create_product_item`` reports that nothing new was created,
        otherwise ``(True, product)`` after categorization.
    """
    print("parse_product %s" % response)
    hxs = HtmlXPathSelector(response)

    # find name of item
    item_name_path = hxs.select('//div[@id="product_info"]/h2/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        return (False, None)
    item_name = item_name_path.extract()
    logging.critical("Name: " + str(item_name))
    self.count_scraped += 1

    prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " TITLE " + str(item_name) +
        " TOTAL SO FAR " + str(self.count_scraped))

    # find gender (best effort: a failure here must not abort the page, but
    # it is logged instead of being swallowed silently)
    gender = 'Nil'
    try:
        gen_strs = hxs.select('//h3[@class="s_here open"]').extract()
        for gen_str in gen_strs:
            heading = gen_str.lower()
            if 'boy' in heading:
                gender = 'M'
            if 'girl' in heading:
                gender = 'F'
    except Exception:
        logging.exception("Gender lookup failed for " + str(prod_url))
    logging.critical("Gender: " + gender)

    # TODO: if same page has multiple items, our logic will not work.
    # So, leaving it for future.

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.all_items_scraped:
        # Only reported here; there is no return at this point. The
        # `not created_new` check further down is what ends processing.
        print("RETURNING since we have already scraped " + str(item_id_))
    self.all_items_scraped.add(item_id_)
    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))

    # extract image URL from <meta property="og:image" content="...">;
    # a missing tag yields "" rather than an IndexError
    prod_img_path = hxs.select('//meta[@property="og:image"]/@content')
    if len(prod_img_path) > 0:
        prod_img_url = str(prod_img_path.extract()[0])
    else:
        prod_img_url = ""
    logging.critical("Image URL: " + str(prod_img_url))

    # find description and keywords: these will be useful in categorization
    # Expected markup:
    #   <div id="tab-content"><dl class="tabs"> ... <dd><p>text</p>
    desc = hxs.select(
        '//div[@id="tab-content"]/dl[@class="tabs"]/dd/p/text()').extract()
    logging.critical("Description: ")
    logging.critical(desc)
    # pages without a description paragraph get an empty description
    prod_desc = desc[0] if desc else ''

    # promo text
    promo_str = 'Nil'

    product, created_new = self._create_product_item(
        item_name[0], item_id_, str(prod_url), price_, sale_price_, gender,
        str(prod_img_url), promo_str, prod_desc)

    if not created_new:
        return (False, product)

    new_cat = simple_product_categorization(product)
    product.cat1 = new_cat["cat1"]
    product.cat2 = new_cat["cat2"]
    product.cat3 = new_cat["cat3"]
    product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_aeaglefamily(self, response):
    """Scrape a product page whose title is in ``h1.pName``.

    The product URL and image URL come from the ``og:url`` / ``og:image``
    meta tags, falling back to ``response.url`` and ``""``. Gender is
    always ``'F'``; description and promo text are left empty.

    Returns:
        ``(False, None)`` when no title is found or the item id is already
        in ``self.items_scraped``; otherwise ``(True, product)``.
    """
    selector = HtmlXPathSelector(response)

    # --- item name -------------------------------------------------------
    title_nodes = selector.select('//h1[@class="pName"]/text()')
    if len(title_nodes) == 0:
        self.invalid_links += 1
        print("Invalid link: " + str(response.url))
        return (False, None)

    item_name = title_nodes.extract()[0]
    logging.critical("Name: " + item_name.encode('utf-8'))
    self.count_scraped += 1

    # --- product URL: og:url when present, else the fetched URL -----------
    og_url_nodes = selector.select('//meta[@property="og:url"]/@content')
    prod_url = (og_url_nodes.extract()[0]
                if len(og_url_nodes) > 0 else response.url)
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
        item_name.encode('utf-8') + " TOTAL SO FAR " +
        str(self.count_scraped))

    gender = 'F'
    logging.critical("GENDER: " + gender)

    # --- price and sale price ----------------------------------------------
    item_id_, price_, sale_price_ = self._find_price(selector, prod_url)
    if item_id_ in self.items_scraped:
        logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
        return (False, None)
    self.items_scraped.append(item_id_)

    price_summary = (" PRICE " + str(price_) +
                     " SALE PRICE " + str(sale_price_))
    logging.critical("ITEM_ID " + str(item_id_) + price_summary)
    if price_ > sale_price_:
        logging.critical("SALE on ITEM_ID " + str(item_id_) + price_summary)

    # --- image URL: og:image when present, else empty ------------------------
    og_image_nodes = selector.select('//meta[@property="og:image"]/@content')
    prod_img_url = (og_image_nodes.extract()[0]
                    if len(og_image_nodes) > 0 else "")

    # description and promo text are not extracted for this page type
    prod_desc = ''
    logging.critical("Description: " + prod_desc)
    promo_str = ""

    product, created_new = self._create_product_item(
        item_name, item_id_, str(prod_url), price_, sale_price_, gender,
        str(prod_img_url), promo_str, prod_desc)

    if product == None:
        logging.critical(
            "Product is None----SHOULDN'T HAPPEN!!!!!******************")

    # --- categorization of newly created products --------------------------
    if created_new:
        categories = simple_product_categorization(product)
        product.cat1 = categories["cat1"]
        product.cat2 = categories["cat2"]
        product.cat3 = categories["cat3"]
        product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_express(self, response):
    """Scrape a product page whose title is under ``div#cat-pro-con-detail``.

    Gender is 'F' when the URL contains 'women' or 'girl', otherwise 'M'.
    The image URL is the ``href`` of ``<link rel="image_src">``; the
    description and promo text are read from the ``cat-pro-desc`` and
    ``cat-pro-promo-text`` elements.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when no title is found, ``(False, product)`` when
        ``self._create_product_item`` reports that nothing new was created,
        otherwise ``(True, product)`` after categorization.

    Raises:
        ValueError: from ``int(item_id_)`` when the id is not numeric.
    """
    hxs = HtmlXPathSelector(response)

    # find name of item
    item_name_path = hxs.select(
        '//div[@id="cat-pro-con-detail"]//h1/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        return (False, None)
    # extract() yields one string per matched node, so item_name is
    # non-empty here and needs no second emptiness check.
    item_name = item_name_path.extract()
    logging.critical("Name: " + str(item_name))
    self.count_scraped += 1

    prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " TITLE " + str(item_name) +
        " TOTAL SO FAR " + str(self.count_scraped))

    # find gender
    lowered_url = prod_url.lower()
    gender = 'F' if ('women' in lowered_url or 'girl' in lowered_url) else 'M'
    logging.critical("Gender: " + gender)

    # TODO: if same page has multiple items, our logic will not work.
    # So, leaving it for future.

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.all_items_scraped:
        # Only reported here; there is no return at this point. The
        # `not created_new` check further down is what ends processing.
        print("RETURNING since we have already scraped " + str(item_id_))
    self.all_items_scraped.add(item_id_)
    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))

    # extract image URL: read the href attribute directly instead of slicing
    # the serialized <link> tag at fixed offsets, and tolerate a missing tag
    prod_img_path = hxs.select('//link[@rel="image_src"]/@href')
    if len(prod_img_path) > 0:
        prod_img_url = str(prod_img_path.extract()[0])
    else:
        prod_img_url = ""
    logging.critical("Image URL: " + str(prod_img_url))

    # find description and keywords: these will be useful in categorization
    desc = hxs.select(
        '//div[@id="cat-pro-con-detail"]//li[@class="cat-pro-desc"]/text()'
    ).extract()
    logging.critical("Description: ")
    logging.critical(desc)
    # NOTE(review): the description is passed on as the list of text nodes,
    # not a joined string - confirm _create_product_item expects that.
    prod_desc = desc

    # promo text
    promo_path = hxs.select(
        '//span[@class="cat-pro-promo-text"]//font/text()').extract()
    promo_str = str(promo_path)
    logging.critical("Promotion: ")
    logging.critical(promo_str)

    product, created_new = self._create_product_item(
        item_name[0], int(item_id_), str(prod_url), price_, sale_price_,
        gender, str(prod_img_url), promo_str, prod_desc)

    if not created_new:
        return (False, product)

    new_cat = simple_product_categorization(product)
    product.cat1 = new_cat["cat1"]
    product.cat2 = new_cat["cat2"]
    product.cat3 = new_cat["cat3"]
    product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_dsw(self, response):
    """Scrape a product page described by Open Graph meta tags.

    Name, URL, image and description come from the ``og:title``,
    ``og:url``, ``og:image`` and ``og:description`` meta tags, with
    ``<title>`` and ``response.url`` as fallbacks for name and URL.
    Gender is 'F' when the URL contains 'women' or 'girl', otherwise 'M'.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when neither ``og:title`` nor ``<title>`` exists,
        ``(True, None)`` when the item id is already in
        ``self.items_scraped``, otherwise ``(True, product)``.
    """
    hxs = HtmlXPathSelector(response)

    meta_tag_item_name = hxs.select('//meta[@property="og:title"]/@content')
    if len(meta_tag_item_name) > 0:
        item_name = meta_tag_item_name.extract()[0]
    else:
        item_name_path = hxs.select('//title/text()')
        if len(item_name_path) == 0:
            logging.error("Not a product page: " + response.url)
            return (False, None)
        item_name = item_name_path.extract()[0]

    logging.critical(smart_str(item_name))
    self.count_scraped += 1

    meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
    if len(meta_tag_url) > 0:
        prod_url = meta_tag_url.extract()[0]
    else:
        prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + smart_str(prod_url) + " TITLE " +
        smart_str(item_name) + " TOTAL SO FAR " + str(self.count_scraped))

    # find gender
    lowered_url = prod_url.lower()
    gender = 'F' if ('women' in lowered_url or 'girl' in lowered_url) else 'M'
    logging.critical("Gender: " + gender)

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs)
    if item_id_ in self.items_scraped:
        logging.critical(
            "ITEM ALREADY SCRAPED " + smart_str(item_id_) + ". RETURNING.")
        return (True, None)
    self.items_scraped.append(item_id_)

    # smart_str() on the id as well: concatenating a non-string id directly
    # would raise TypeError.
    logging.critical(
        "ITEM_ID " + smart_str(item_id_) + " PRICE " + smart_str(price_) +
        " SALE PRICE " + smart_str(sale_price_))
    if price_ > sale_price_:
        logging.critical(
            "SALE on ITEM_ID " + smart_str(item_id_) + " PRICE " +
            smart_str(price_) + " SALE PRICE " + smart_str(sale_price_))

    meta_img_url = hxs.select('//meta[@property="og:image"]/@content')
    if len(meta_img_url) > 0:
        prod_img_url = meta_img_url.extract()[0]
    else:
        prod_img_url = ""
    logging.critical("Image URL: " + smart_str(prod_img_url))

    # find description and keywords: these will be useful in categorization
    desc = hxs.select('//meta[@property="og:description"]/@content')
    if len(desc) > 0:
        desc_content = desc.extract()[0]
    else:
        desc_content = ''
    logging.critical("Description: " + smart_str(desc_content))
    prod_desc = desc_content

    promo_str = ''

    # smart_str() rather than str(): str() on a unicode URL containing
    # non-ASCII characters raises UnicodeEncodeError on Python 2.
    product, created_new = self._create_product_item(
        item_name, item_id_, smart_str(prod_url), price_, sale_price_,
        gender, prod_img_url, promo_str, prod_desc)

    if created_new:
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_nyc(self, response):
    """Scrape a product page whose title is the first ``<h1>``.

    The product URL is the canonical link when present, else
    ``response.url``. The image URL is pulled out of the raw page body
    with a regular expression looking for a ``strLarge = "..."``
    assignment. Gender is always 'F'.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``; its
            ``body`` is also searched directly.

    Returns:
        ``(False, None)`` when the page has no ``<h1>`` text, otherwise
        ``(True, product)`` where ``product`` is whatever
        ``self._create_product_item`` returned (possibly ``None``).
    """
    hxs = HtmlXPathSelector(response)

    # find name of item
    item_name_path = hxs.select('//h1/text()')
    if len(item_name_path) == 0:
        self.invalid_links += 1
        return (False, None)
    item_name = item_name_path.extract()
    logging.critical("Name: " + str(item_name))
    self.count_scraped += 1

    # Throttling hook hit every 100 pages. The delay is currently 0, so this
    # does not actually pause the crawl.
    if self.count_scraped % 100 == 0:
        sleep(0)

    can_url_path = hxs.select('//link[@rel="canonical"]/@href')
    if len(can_url_path) > 0:
        prod_url = can_url_path.extract()[0]
    else:
        prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " TITLE " + str(item_name) +
        " TOTAL SO FAR " + str(self.count_scraped))

    # find gender
    gender = 'F'
    logging.critical("Gender: " + gender)

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)
    if item_id_ in self.all_items_scraped:
        # Only reported here; there is no return at this point.
        print("RETURNING since we have already scraped " + str(item_id_))
    self.all_items_scraped.add(item_id_)
    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))

    # extract image URL; raw string so the backslash escapes reach the regex
    # engine untouched (the pattern itself is unchanged)
    img_str = re.findall(r'strLarge = ["\w\d\/:_\$.?]+', str(response.body))
    prod_img_url = ""
    if len(img_str) > 0:
        # the match splits into 'strLarge', '=' and the quoted URL
        img_str_parts = img_str[0].split()
        if len(img_str_parts) > 2:
            prod_img_url = img_str_parts[2].strip('"')
    if prod_img_url == "":
        logging.critical("PROBLEM with Image URL for " + str(response.url))
    logging.critical("Image URL: " + str(prod_img_url))

    # find description and keywords: these will be useful in categorization
    desc = hxs.select(
        '//p[@class="itemstyle_pdp"]/span[@class="details"]/text()'
    ).extract()
    logging.critical("Description: ")
    logging.critical(desc)
    prod_desc = desc

    # promo text
    promo_str = ""

    # NOTE(review): this call passes response.url as an extra leading
    # argument, unlike the other parse_* methods which start with the item
    # name - confirm against the signature of _create_product_item.
    product, created_new = self._create_product_item(
        response.url, item_name[0], item_id_, str(prod_url), price_,
        sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

    if product is None:
        logging.critical(
            "PROBLEM: product is None for URL " + str(response.url))

    # Categorize only when there is an object to write to; a None product
    # has already been reported above.
    if created_new and product is not None:
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)
def parse_jcrew(self, response):
    """Scrape a product page described by Open Graph and keyword meta tags.

    Name and URL come from ``og:title`` / ``og:url`` (falling back to
    ``<title>`` and ``response.url``). The image is the first ``http``
    image inside a ``prod_main_img`` container. The description is the
    ``og:description`` content plus the ``keywords`` meta content,
    separated by a newline. Gender is 'F' when the URL contains 'women' or
    'girl', otherwise 'M'.

    Args:
        response: the page response wrapped in ``HtmlXPathSelector``.

    Returns:
        ``(False, None)`` when neither ``og:title`` nor ``<title>`` exists,
        ``(True, None)`` when the item id is already in
        ``self.items_scraped``, otherwise ``(True, product)``.
    """
    hxs = HtmlXPathSelector(response)

    meta_tag_item_name = hxs.select(
        '//meta[@property="og:title"]/@content')
    if len(meta_tag_item_name) > 0:
        item_name = meta_tag_item_name.extract()[0]
    else:
        item_name_path = hxs.select('//title/text()')
        if len(item_name_path) == 0:
            logging.error("Not a product page: " + response.url)
            return (False, None)
        item_name = item_name_path.extract()[0]

    logging.critical(item_name.encode('utf-8'))
    self.count_scraped += 1

    meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
    if len(meta_tag_url) > 0:
        prod_url = meta_tag_url.extract()[0]
    else:
        prod_url = response.url
    logging.critical(
        "PRODUCT URL:" + str(prod_url) + " TITLE " +
        str(item_name.encode('utf-8')) + " TOTAL SO FAR " +
        str(self.count_scraped))

    # find gender
    lowered_url = prod_url.lower()
    gender = 'F' if ('women' in lowered_url or 'girl' in lowered_url) else 'M'
    logging.critical("Gender: " + gender)

    # find price and sale price
    item_id_, price_, sale_price_ = self._find_price(hxs)
    if item_id_ in self.items_scraped:
        logging.critical(
            "ITEM ALREADY SCRAPED " + str(item_id_) + ". RETURNING.")
        return (True, None)
    self.items_scraped.append(item_id_)

    logging.critical(
        "ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
        " SALE PRICE " + str(sale_price_))
    if price_ > sale_price_:
        logging.critical(
            "SALE on ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
            " SALE PRICE " + str(sale_price_))

    # extract image URL
    prod_img_path = hxs.select(
        '//div[contains (@class, "prod_main_img")]/a/img[contains (@src, "http")]/@src'
    )
    prod_img_url = prod_img_path.extract()
    logging.critical("Image URL: " + str(prod_img_url))
    # pages without a matching image get "" instead of an IndexError
    first_img_url = str(prod_img_url[0]) if prod_img_url else ""

    # find description and keywords: these will be useful in categorization
    desc = hxs.select('//meta[@property="og:description"]/@content')
    if len(desc) > 0:
        desc_content = desc.extract()[0]
    else:
        desc_content = ''
    logging.critical("Description: " + str(desc_content.encode('utf-8')))

    keywords = hxs.select('//meta[@name="keywords"]/@content').extract()
    # a page without a keywords meta tag contributes no keywords
    keywords_content = keywords[0] if keywords else ''
    logging.critical("Keywords: ")
    logging.critical(keywords_content)

    prod_desc = desc_content + "\n" + keywords_content
    print("Length of prod_desc " + str(len(prod_desc)))

    promo_str = ''

    product, created_new = self._create_product_item(
        item_name, item_id_, str(prod_url), price_, sale_price_, gender,
        first_img_url, promo_str, prod_desc)
    print("gender " + str(product.gender))

    if created_new:
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()

    # report any selection error shown on the page
    error = hxs.select('//span[@class="select-error"]/text()')
    if len(error) > 0:
        logging.critical("Error: " + (error.extract()[0]).encode('utf-8'))

    logging.critical(
        "Total unique items: " + str(len(self.all_items_scraped)) +
        " we have scraped so far: " + str(self.count_scraped) +
        " Unique URLs scraped: " + str(len(self.urls_scraped)))

    return (True, product)