class GuitarCenterSpider(CrawlSpider):
    """Scrapy spider that scrapes product data from musiciansfriend.com.

    Product URLs come either from a database (interface run) or from an
    Excel file (console run); each scraped product is written to XML via
    ``CommonXml`` and run status is tracked per product in ``self.products``.
    Python 2 / legacy-Scrapy code (HtmlXPathSelector, dispatcher signals).
    """

    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    # Placeholder start URL; replaced with the real product URLs in __init__.
    start_urls = ["http://www.musiciansfriend.com"]
    # Number of responses processed so far (class attribute used as a counter).
    counter = 0

    def __init__(self, *a, **kw):
        """Load the product list (DB or Excel), register XML properties and
        the spider_closed handler, and point start_urls at the product URLs."""
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d["database"]:
            # Interface run: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            # Console run: product list comes from an Excel file.
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        # Emit NOT_AVAILABLE records for products that came without a URL.
        self.handle_not_provided()
        self.start_urls = self.products["urls"]
        self.total = len(self.products["urls"])

    def parse(self, response):
        """Extract one product page into a GuitarCenterItem.

        On success the item is written to XML and its status set to "ran";
        any StandardError marks the product as "error" via the exception
        handler. Returns the (possibly partial) item either way.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        # NOTE(review): `settings` is imported here but never used in this method.
        from scrapy.conf import settings
        # Use the original (pre-redirect) URL so it matches self.products["urls"].
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            item["heading"], item["details"], item["specs"], item["call_to_action"] = self.get_description(hxs)
            item["brand_image"], item["brand_image_promo"], brand_images = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            # No add-to-cart id on the page means the product cannot be bought.
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            # image_urls triggers Scrapy's image pipeline download.
            item["image_urls"] = img + brand_images
        return item

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML record for every product that has an id
        and name but no URL to scrape."""
        item = GuitarCenterItem()
        for n in self.no_urls["product_ids"]:
            item["product_id"] = [n]
            index = self.no_urls["product_ids"].index(n)
            item["name"] = [self.no_urls["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Return ([name], [brand]) from the page header; name is cleaned
        and stripped of non-breaking spaces."""
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        """Return (brand_image, brand_image_promo, raw_urls): the first two
        mapped through get_server_path, the third the raw page URLs for the
        image download pipeline."""
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        """Return the four description sections, each CDATA-wrapped."""
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), basic.cdata_field(specs), basic.cdata_field(last)

    # function for getting prices, returns tags and values or empty field if no option for one of them new is discount
    def get_prices(self, hxs):
        """Return (old_price, discount, price), each cleaned to digits/dots.

        More than one <dt> in the line-item list means the product is
        discounted: first <dd> is the old price, last is the discount.
        Falls back to the inlineList price when nothing else was found.
        """
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
            try:
                discount = [basic.clean_string(value[len(value) - 1])]
            except IndexError:
                print "This product has no price."
        # NOTE(review): select().extract() returns a list and should not raise
        # IndexError here, so this handler is likely never triggered — confirm.
        try:
            price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        except IndexError:
            print "This product has no price."
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returning json with image url and serial number of product image refers to
    def get_images(self, hxs):
        """Return (json_list, raw_urls): per-thumbnail CDATA-wrapped JSON
        {image_url, product_serial} plus the raw image URLs for download."""
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d["image_url"] = self.get_server_path(images[i])
            img.append(images[i])
            # The <li> class carries the SKU, prefixed with "site1sku".
            if "site1sku" in tags[i]:
                d["product_serial"] = tags[i].replace("site1sku", "")
            else:
                d["product_serial"] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # function for getting serials and all information about them, currently returns field with jsons with all
    # information, can be modified to return dicts if needed for subproducts for those one day
    def get_serials(self, hxs):
        """Return the hidden per-style JSON blobs, re-serialized and
        CDATA-wrapped (the loads/dumps round-trip normalizes the JSON)."""
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        """Return the image path to store for *url*.

        Currently returns the site's absolute URL unchanged.
        NOTE(review): the comment below says "uncomment", but `return url` is
        already active, which makes the hashed IMAGES_STORE line unreachable
        dead code — confirm which behavior is intended.
        """
        # uncomment next line if you want to keep absolute image path from their site
        return url
        return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting gold coverage from the page which is actually additional warranty options
    def gold_coverage(self, hxs):
        """Return CDATA-wrapped JSON {id, name} for each extended-warranty
        checkbox option on the page."""
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d["id"] = ids[i]
            d["name"] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # function for getting availability
    def get_available(self, hxs):
        """Return the availability flag, uppercased; defaults to IN_STOCK
        when the page has no availability marker at all."""
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            # for those that have color options and in stock status for each of those
            # put IN_STOCK for the product as it has no that option on the page
            p = ["IN_STOCK"]
        return p

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        """Return ([product_ref], [add_to_cart_id]) parsed from the
        "ref|id" data-rel attribute, or ([], []) when absent.

        NOTE(review): bare except — any error (not just a missing element)
        is treated as "product not available".
        """
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # function for gatting shipping information
    def get_shipping(self, hxs):
        """Return the shipping text snippets from the shipping targeter div."""
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # function for getting colors, return jsons with all the data about options
    def get_colors(self, hxs):
        """Return CDATA-wrapped, normalized JSON for each color option blob."""
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # cleaning price to leave only numbers
    def clean_price(self, price):
        """Strip every character except digits and '.' from each price string."""
        new = []
        for i in price:
            new.append(re.sub("[^0-9.]", "", i))
        return new

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() failed, `filename` is unbound and
                # write_xml below raises NameError — confirm against interface runs.
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                # Hard-coded catalog GUID for the export target.
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d["email"])
        except:
            # Appended after the mail already failed; only reaches the log file below.
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        """Declare all custom item properties on the XML writer."""
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        """Populate self.products / self.no_urls from the Excel input file.

        Columns: 1 = product ids, 2 = names, 3 = urls (data from row 15 on).
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        # NOTE(review): unlike the Lydias spider, this runs even after a handled
        # exception (no `else:`), operating on a possibly incomplete dict —
        # safe only if code_handler(103) terminates the process; confirm.
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
class LydiasSpider(CrawlSpider):
    """Scrapy spider for lydiasuniforms.com products (variant-aware).

    Emits a master product per page plus color "subproducts" whose sizes are
    fetched from the site's ajax endpoints. Product URLs come from a database
    (interface run) or Excel (console run); output goes through VariantsXml.
    Python 2 / legacy-Scrapy code (HtmlXPathSelector, dispatcher signals).
    """

    name = "lydias"
    allowed_domains = ["example.com"]
    # Placeholder start URL; replaced with the real product URLs in __init__.
    start_urls = ["http://www.example.com"]
    # Number of responses processed so far (class attribute used as a counter).
    counter = 0

    def __init__(self, *a, **kw):
        """Load the product list (DB or Excel), normalize the URLs, register
        XML properties and the spider_closed handler."""
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            # Interface run: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # Console run: product list comes from an Excel file.
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        # Emit NOT_AVAILABLE records for products that came without a URL.
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Extract one product page into a LydiasItem plus color subproducts.

        A "searchfor" div on the page means the product was not found; it is
        then emitted as NOT_AVAILABLE. Any exception marks the product as
        "error". Returns the (possibly partial) item either way.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # Use the original (pre-redirect) URL so it matches self.products['urls'].
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        # NOTE: shadows the builtin `id` (pre-existing naming throughout this class).
        id = self.products['product_ids'][index]
        try:
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item['description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                # Color/size children are written to XML inside create_subproducts.
                self.create_subproducts(id, color_names, product_image, color_codes, hxs)
                # image_urls triggers Scrapy's image pipeline download.
                item['image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            # NOTE(review): bare except hides the real error type; only the URL
            # is reported through the exception handler.
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        """Return ["True"]/["False"] depending on a JS marker in the page
        that disables the logo-color control."""
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        """Return CDATA-wrapped JSON {color_url, color_name, color_short}
        for each color (the three input lists are index-aligned)."""
        # NOTE: shadows the builtins `dict` and `json` (pre-existing naming).
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        """Return the local image-store path for one URL (sha1-hashed name,
        mirroring Scrapy's image pipeline layout)."""
        # return url
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        """Return local image-store paths for a list of URLs."""
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    #function for getting basic information for product
    def get_basic_info(self, hxs):
        """Return (name, price, old_price, [description]); prices reduced to
        digits/dots, description CDATA-wrapped and cleaned.

        NOTE(review): `price = [re.sub(...)]` assumes at least one price was
        found; an empty list would raise IndexError (caught by parse's bare
        except) — confirm every product page carries a price.
        """
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
        description = basic.cdata(hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
        if not price:
            # Fallback selector for pages using the PriceDisplay span.
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        """Return (rating_number, raw_sentence) or ([], []) when absent."""
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""), "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    #function for getting reviews, returning rating and field of json reviews
    # or empty fields if there's no reviews
    def get_reviews(self, hxs):
        """Return JSON blobs for the first review block, or [] when none.
        NOTE(review): relies on make_reviews_json, which currently aborts the
        process (see below) — presumably unused; verify before calling."""
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select('p[@class="review_title"]/text()').extract()
            text = reviews[0].select('p[@class="review_text"]/text()').extract()
            author = reviews[0].select('p[@class="review_author"]/text()').extract()
            location = reviews[0].select('p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use. cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        """Build CDATA-wrapped review JSON from index-aligned field lists.

        NOTE(review): the prints and os._exit(0) below are debug leftovers —
        calling this KILLS the whole process and everything after os._exit is
        dead code. Remove them before putting this method back into use.
        """
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
 "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    #function for getting size chart image
    def get_size_image(self, hxs):
        """Return the size-chart image URL(s) from the tabbed panel."""
        temp = hxs.select('//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    #function for getting image swatches, returning fields (image_urls, image name, product color image)
    def get_image_swatches(self, hxs):
        """Return four index-aligned lists per color swatch:
        (swatch image urls, color names, product images, color codes)."""
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            #if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            # Color code is the 2nd argument of the swatch's onclick handler.
            color_codes.append(color.select('a/@onclick').extract()[0].split(",")[1].replace("'", ""))
        return color_images, color_names, products_image, color_codes

    #function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        """Return extra thumbnail URLs parsed out of the AddImg script block,
        or [] when the block is absent."""
        additional_images = hxs.select('//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    #function for getting product id from the page
    def get_product_id(self, hxs):
        """Return the product id embedded in the page's wrap script."""
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, retunrning field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        """Fetch the ajax size-options page for *id* and parse it into
        CDATA-wrapped JSON blobs (blocking urllib2 call; currently unused)."""
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        # The response is a JS-ish array; strip quotes/brackets and split on "]".
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
 "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes, hxs):
        """Write one XML child product per color (or a single NO_COLOR child),
        each carrying its size options as JSON. Always returns 0."""
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path([product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # little messy with all the commented lines but those lines can be used if needed to go back to old way with
    # child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return CDATA-wrapped size JSONs for one color of *main_id*.

        With a color code the sizes come from the site's ajax endpoint
        (blocking urllib2 call per color); without one they are read from
        the page's "not_size" divs. The `id` parameter is only used by the
        commented-out child-product variant.
        """
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            # The response is a JS-ish array; strip quotes/brackets and split on "]".
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # when the color is not provided different block of code cause it's done differently on the page
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # return 0

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        """Return the (URL-encoded) ajax URL that serves the price for this
        id/color/size combination; the URL itself is stored, not fetched."""
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        """Prefix each site-relative path with the site's base URL."""
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for gettin embroidery information from clients page, was used only once to get it
    # cause embroidery is the same for all the products
    def get_emb(self, hxs):
        """One-off extractor for embroidery options (thread colors, lettering,
        logos); writes them to an "emb" XML file and returns the image URLs.

        NOTE(review): `xml.create_xml` / `xml.write_xml` reference a global
        `xml`, not self.xml — presumably a leftover from script form; this
        would raise NameError if called now. The print/os._exit lines after
        `return urls` are unreachable dead code.
        """
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select('//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        # Option index 0 is skipped in all three loops (placeholder entry).
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select('//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Write a NOT_AVAILABLE XML record for every product that has an id
        and name but no URL to scrape."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): if get_name() failed, `filename` is unbound and
                # write_xml below raises NameError — confirm against interface runs.
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # Database export is currently disabled for this spider.
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            # Appended after the mail already failed; only reaches the log file below.
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Populate self.products / self.no_urls from the Excel input file.

        Columns: 1 = product ids, 2 = names, 3 = urls (data from row 15 on).
        Post-processing runs only when reading succeeded (the `else` branch).
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class SportmanSpider(CrawlSpider):
    """Scrapy spider for sportmann.no product pages.

    Loads the product list either from the interface database or from an
    Excel feed, scrapes each product page (including ASP.NET postbacks to
    read per-color size options) and writes results out as XML via
    CommonXml.  Legacy Python 2 / old-Scrapy code (HtmlXPathSelector,
    scrapy.conf settings, print statements).
    """

    name = "sportman"
    # NOTE(review): domains look like placeholders; the requests in
    # get_vars/get_data go to www.sportmann.no -- confirm intended.
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # number of responses parsed; compared to self.total at close

    def __init__(self, *a, **kw):
        """Collect run arguments, load the product list and set start URLs."""
        super(SportmanSpider, self).__init__(*a, **kw)
        # spider_closed does end-of-run XML writing / DB update / mail.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")
        if self.d["database"]:
            # Ran from the interface: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            # Ran from console: product list comes from an Excel file.
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products["urls"]
        self.images_store = "/" + settings["IMAGES_STORE"]
        self.total = len(self.start_urls)

    def parse(self, response):
        """Parse one product page into a SportmanItem.

        A redirected URL is treated as a no-longer-available product.  Any
        scraping error marks the product status "error"; the broad except
        keeps the crawl going.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        # On redirect, look the product up by the originally requested URL.
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            if "redirect_urls" in response.request.meta:
                # Product page redirected -> mark as not available.
                item["product_id"] = [self.products["product_ids"][index]]
                item["name"] = [self.products["names"][index]]
                item["in_stock"] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item["name"], item["short_desc"], item["description"], item["old_price"], item["custom_price"], item[
                    "product_id"
                ], item["sku"] = self.get_basic_info(hxs)
                item["in_stock"] = ["IN_STOCK"]
                viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars(
                    response, hxs
                )
                # The ASP.NET __VIEWSTATE is huge; split it into 2000-char
                # chunks so each fits in its own XML property.
                viewstate1 = viewstate[:2000]
                viewstate2 = viewstate[2000:4000]
                viewstate3 = viewstate[4000:6000]
                viewstate4 = viewstate[6000:8000]
                viewstate5 = viewstate[8000:10000]
                viewstate6 = viewstate[10000:]
                item["viewstate1"] = [basic.cdata(viewstate1)]
                item["viewstate2"] = [basic.cdata(viewstate2)]
                item["viewstate3"] = [basic.cdata(viewstate3)]
                item["viewstate4"] = [basic.cdata(viewstate4)]
                item["viewstate5"] = [basic.cdata(viewstate5)]
                item["viewstate6"] = [basic.cdata(viewstate6)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)
                images_url = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(images_url)
                self.xml.create_xml(item)
                # The XML is already written; the returned item only carries
                # image_urls for the image-download pipeline.
                item.clear()
                item["image_urls"] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        """Extract name, descriptions, prices, article number and sku.

        Prices appear as text like "...: 123 Kr"; the part after the colon
        is kept with "Kr" and spaces stripped.
        """
        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()
        description = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = sportman.delete_tags(re, description[0])
        description = [basic.cdata(description)]
        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = " ".join(old_price)
            old_price = old_price.split(":")
            old_price = old_price[1].replace("Kr", "")
            old_price = [old_price.replace(" ", "")]
        else:
            old_price = old_price
        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price != []:
            price = " ".join(price)
            price = price.split(":")
            price = price[1].replace("Kr", "")
            price = [price.replace(" ", "")]
        else:
            # No sale price on the page -> fall back to the regular price.
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
            price = " ".join(price)
            price = price.split(":")
            price = price[1].replace("Kr", "")
            price = [price.replace(" ", "")]
        id = hxs.select('//div[@class="articlenumber"]').extract()
        id = " ".join(id)
        id = id.replace(u"\xa0", "")
        id = basic.get_middle_text(id, "Art.nr.", "</div>")
        sku = id
        id = [id[0]]
        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        """Collect the ASP.NET form fields needed for variant postbacks.

        Returns __VIEWSTATE, __EVENTVALIDATION, __PREVIOUSPAGE and the
        ScriptManager hidden field, both as parsed from the Scrapy response
        (first four; prevpage/hidden are blanks there) and from a fresh
        requests.get of the same URL (the *_page values).
        """
        headers1 = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "/product.aspx?productid=613232",
            # NOTE(review): hard-coded session cookie captured from an old
            # browser session; probably stale -- confirm it is still needed.
            "Cookie": "ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        }
        page = hxs.select("//html").extract()
        page = " ".join(page)
        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"')
        prevpage = [""]
        hidden_field = [""]
        r = requests.get(response.url, headers=headers1)
        page_one = r.content
        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one, 'id="__PREVIOUSPAGE" value="', '"')
        # The ScriptManager hidden field is populated by a <script src=...>
        # between __VIEWSTATE and __PREVIOUSPAGE; fetch that script and pull
        # the value out of its body.
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split("<script sr")
        val_x = len(hidden_temp) - 1
        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        hidden_temp_val = hidden_temp_val.replace("amp;", "")
        hidden_url = "http://www.sportmann.no" + hidden_temp_val
        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(), "ctl00_ScriptManager1_HiddenField').value += '", "';"
        )
        return (
            viewst[0],
            eventval[0],
            prevpage[0],
            hidden_field[0],
            viewst_page[0],
            eventval_page[0],
            prevpage_page[0],
            hidden_field_page[0],
        )

    def get_variants(self, hxs, response):
        """Build CDATA-wrapped JSON strings describing color/size variants.

        Handles both single-option pages (plain divs + hidden inputs) and
        multi-option pages (select dropdowns).  When sizes depend on the
        chosen color, posts back once per color via get_data and scrapes the
        sizes from each AJAX reply.
        """
        page = hxs.select("//html").extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []
        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )
        if len(temp) == 1:
            # Single color: plain text div plus a hidden input value.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace(" ", "")
            color = basic.clean_string(color[0])
            # NOTE(review): value becomes a plain string here, so the
            # range(len(value)) loop below would iterate characters if the
            # no-initial-size branch is reached -- verify this combination
            # cannot occur on real pages.
            value = value[0]
        else:
            # Color dropdown: options follow the "...farge</option>" placeholder.
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')
            for i in range(0, len(color)):
                color[i] = color[i].replace(" ", "")
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")
        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace(" ", "")]
            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()
            if size[0] == "":
                # Sizes depend on the chosen color: post back per color value
                # and read the size list from each partial-postback response.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])
                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")
                    if len(a_page) == 1:
                        # Single size for this color (no dropdown in reply).
                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]
                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')
                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val
                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace(" ", "")
                    dict_one["size"] = size
                    test_one.append(basic.cdata(json.dumps(dict_one)))
            else:
                dict_one["color"] = color
                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))
        else:
            # Size dropdown present on the initial page.
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')
            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace(" ", "")
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val
            test_one.append(basic.cdata(json.dumps(dict_one)))
        return test_one

    def get_server_path(self, url):
        """Map image URLs to Scrapy image-store paths (sha1-named .jpg).

        Mutates the passed-in list in place (clean_string on each entry).
        """
        images_array = []
        for i in range(0, len(url)):
            url[i] = basic.clean_string(url[i])
            images_array.append(self.images_store + "/full/" + hashlib.sha1(url[i]).hexdigest() + ".jpg")
        return images_array

    def get_images(self, hxs):
        """Return absolute URLs of the gallery images on the product page."""
        page = hxs.select("//html").extract()
        page = " ".join(page)
        images = []
        temp = page.split('class="gallery_demo_unstyled"')
        temp = temp[1].split('<div class="right_container">')
        temp = basic.get_middle_text(temp[0], 'src="', '"')
        for i in range(0, len(temp)):
            image_url = "http://www.sportmann.no" + temp[i]
            images.append(image_url)
        return images

    def get_data(self, response, hidden, viewstate, previouspage, eventvalidation, colorvalue):
        """POST the variant-selection form back; return the raw AJAX reply.

        Emulates the ASP.NET partial postback fired when a color (colorvalue)
        is picked, so the reply contains that color's size options.  The form
        body below was captured from a real browser postback and is already
        URL-encoded.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
            "Host": "www.sportmann.no",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us,en;q=0.5",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Connection": "keep-alive",
            "Referer": "http://www.sportmann.no/product.aspx?productid=613232",
            "Cookie": "",
        }
        eventvalidation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})
        data = (
            "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&"
            + hidden
            + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&"
            + viewstate
            + "&"
            + previouspage
            + "&"
            + eventvalidation
            + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant="
            + colorvalue
            + "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"
        )
        req = urllib2.Request(response.url, data, headers)
        resp_page = urllib2.urlopen(req).read()
        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "Sportmann: {0}".format(filename), self.d["email"])
        except:
            msg += "\nSending mail failed."
        # Interface runs also leave a log file next to the XML.
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls/product_ids/names from the Excel feed into self.products.

        Column layout (1-based, data from row 15): 1=ids, 2=names, 3=urls.
        Read errors are routed through the shared exception handler.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom product properties this spider emits."""
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size_options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
class BurtonSpider(CrawlSpider):
    """Scrapy spider for Burton product pages.

    Product data (variants, sizes, colors) is parsed out of inline
    JavaScript arrays on the page.  Legacy Python 2 / old-Scrapy code.
    """

    name = "burton"
    # NOTE(review): placeholder domains; the hard-coded start URL below
    # points at dickssportinggoods.com -- confirm intended targets.
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # number of responses parsed; compared to self.total at close

    def __init__(self, *a, **kw):
        """Collect run arguments, load the product list and set start URLs."""
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            # Ran from the interface: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # Ran from console: product list comes from an Excel file.
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        self.start_urls = self.products['urls']
        # NOTE(review): this overrides the product list with one fixed test
        # URL -- looks like a debugging leftover; remove before real runs.
        self.start_urls = [
            "http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"
        ]
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Parse one product page into a BurtonItem.

        Redirected URLs become NOT_AVAILABLE items; any scraping error marks
        the product status "error" and the crawl continues.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        # Raw page text is needed for the script-scraping helpers below.
        page = hxs.extract()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                # Product page redirected -> mark as not available.
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(
                    hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(
                    page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(
                    page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                # image_urls feeds the image-download pipeline.
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        """Write NOT_AVAILABLE XML entries for products that have no URL."""
        item = BurtonItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Extract the product id (hidden form field) and display name."""
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        """Map one image URL to its Scrapy image-store path (sha1-named .jpg)."""
        path = self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        """Extract current price and (optional) old price from the page."""
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        """Extract description text and the feature list (capped at 2000 chars)."""
        description = hxs.select(
            '//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            # Truncated to fit the XML property size limit -- TODO confirm.
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.

        In json are also fetched all information for sizes that are on the
        site.  Parses the inline `skuSizeColorObj` JavaScript array; returns
        (variant jsons, swatch image urls, color names).
        """
        script = basic.get_middle_text(page,
                                       'var skuSizeColorObj = new Array();',
                                       '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        colors = script.split('skuSizeColorObj')
        for c in range(1, len(colors)):
            temp = basic.get_middle_text(colors[c], '= ', ';')
            # delete swatch image as it obviously won't be needed
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        """Return all size labels parsed from the `distsizeobj` script array."""
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();',
                                       'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.

        Returns json with color name and image url for that color, and
        returns a field of image urls that can be used for download later."""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();',
                                       '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i in range(0, len(color_names)):
            color = burton.replace_color_json(colors[i])
            color = simplejson.loads(color)
            color['cname'] = color_names[i]
            # 'reg' (regular-size image) is unused; only 'enh' is kept.
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        # Interface runs also leave a log file next to the XML.
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load urls/product_ids/names from the Excel feed into self.products.

        Column layout (1-based, data from row 15): 1=ids, 2=names, 3=urls.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
class GuitarCenterSpider(CrawlSpider):
    """Scrapy spider for musiciansfriend.com product pages.

    Despite the name, it scrapes musiciansfriend.com.  Legacy Python 2 /
    old-Scrapy code (HtmlXPathSelector, scrapy.conf, print statements).
    """

    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0  # number of responses parsed; compared to self.total at close

    def __init__(self, *a, **kw):
        """Collect run arguments, load the product list and set start URLs."""
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            # Ran from the interface: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # Ran from console: product list comes from an Excel file.
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Parse one product page into a GuitarCenterItem.

        On success the XML entry is written and the item (with image_urls
        for the download pipeline) is returned; on StandardError the product
        is marked "error" and the crawl continues.
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        # NOTE(review): imported but not referenced in this method.
        from scrapy.conf import settings
        # On redirect, look the product up by the originally requested URL.
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'], item['brand'] = self.get_basic_info(hxs)
            item['heading'], item['details'], item['specs'], item['call_to_action'] = self.get_description(hxs)
            item['brand_image'], item['brand_image_promo'], brand_images = self.get_description_images(hxs)
            item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
            item['image_json'], img = self.get_images(hxs)
            item['serial'] = self.get_serials(hxs)
            item['warranty'] = self.gold_coverage(hxs)
            item['in_stock'] = self.get_available(hxs)
            item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
            # No add-to-cart id means the product cannot be bought.
            if not item['add_to_cart_id']:
                item['in_stock'] = ["NOT_AVAILABLE"]
            item['shipping'] = self.get_shipping(hxs)
            item['colors'] = self.get_colors(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item['image_urls'] = img + brand_images
            return item

    def handle_not_provided(self):
        """Write NOT_AVAILABLE XML entries for products that have no URL."""
        item = GuitarCenterItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        """Extract product name (cleaned, nbsp stripped) and brand."""
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        """Extract brand/promo logo images; also return raw urls for download."""
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        """Extract heading, details, specs and call-to-action, CDATA-wrapped."""
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), basic.cdata_field(specs), basic.cdata_field(last)

    #function for getting prices, returns tags and values or empty field if no option for one of them new is discount
    def get_prices(self, hxs):
        """Extract old price, discount and current price (numbers only)."""
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            # Line-item list present: first dd is the old price, last the discount.
            old_price = [basic.clean_string(value[0])]
            try:
                discount = [basic.clean_string(value[len(value) - 1])]
            except IndexError:
                print "This product has no price."
            try:
                price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
            except IndexError:
                print "This product has no price."
        if not old_price and not discount and not price:
            # Fallback layout: plain inline list.
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # returning json with image url and serial number of product image refers to
    def get_images(self, hxs):
        """Build image jsons (url + product serial) from the thumbnail list."""
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d['image_url'] = self.get_server_path(images[i])
            img.append(images[i])
            # li class encodes the serial as "site1sku<serial>".
            if "site1sku" in tags[i]:
                d['product_serial'] = tags[i].replace("site1sku", "")
            else:
                d['product_serial'] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # function for getting serials and all information about them, currently returns field with jsons with all
    # information, can be modified to return dicts if needed for subproducts for those one day
    def get_serials(self, hxs):
        """Re-serialize the hidden styleInfo jsons, CDATA-wrapped."""
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        """Return the image path to store in the XML.

        Currently returns the site's absolute URL unchanged; the sha1
        image-store variant below is intentionally unreachable -- swap the
        two returns to use locally-downloaded image paths.
        """
        return url
        return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # function for getting gold coverage from the page which is actually additional warranty options
    def gold_coverage(self, hxs):
        """Extract extra-warranty (gold coverage) options as id/name jsons."""
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d['id'] = ids[i]
            d['name'] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # function for getting availability
    def get_available(self, hxs):
        """Return availability; defaults to IN_STOCK for per-color products."""
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            #for those that have color options and in stock status for each of those
            #put IN_STOCK for the product as it has no that option on the page
            p = ["IN_STOCK"]
        return p

    # function for getting add to cart id and product reference
    def get_add_to_cart(self, hxs):
        """Extract (product_ref, add_to_cart_id) from the "pid|cartid" data-rel."""
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # function for gatting shipping information
    def get_shipping(self, hxs):
        """Extract the shipping text from the shipping targeter block."""
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # function for getting colors, return jsons with all the data about options
    def get_colors(self, hxs):
        """Re-serialize the visible styleInfo (color option) jsons."""
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # cleaning price to leave only numbers
    def clean_price(self, price):
        """Strip every character except digits and '.' from each price string."""
        new = []
        for i in price:
            new.append(re.sub('[^0-9.]', '', i))
        return new

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        # Interface runs also leave a log file next to the XML.
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def add_properties(self, xml):
        """Register the custom product properties this spider emits."""
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        """Load urls/product_ids/names from the Excel feed into self.products.

        Column layout (1-based, data from row 15): 1=ids, 2=names, 3=urls.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
class ChomeSpider(CrawlSpider):
    """Spider that builds product XML from the CelebratingHome client feed.

    Unlike the url-driven spiders, all product data comes from a single XML
    feed which is (optionally) downloaded and then parsed with iterparse.
    """

    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0  # responses handled by parse()

    def __init__(self, *a, **kw):
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            # Interface run: wanted product list comes from the catalog database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # Console run: wanted product list comes from the excel file.
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        """Single entry point; all real work happens in parse_whole_xml()."""
        self.counter += 1
        hxs = HtmlXPathSelector(response)
        item = ChomeItem()
        print "IDs in excel feed: {0}".format(self.total)
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        """Parse the client feed and emit one xml record per wanted product.

        Returns the list of every image url found so Scrapy downloads them.
        """
        xml_dir = "xml/{0}".format(self.name)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        else:
            # No-download mode requires a previously fetched feed on disk.
            if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
                basic.warning("Feed file doesn't exist please de-select no download option")
                os._exit(2)
        self.number = 0  # ids from the wanted list actually found in the feed
        xml_item = ChomeItem()
        urls_all = []
        for event, elem in iterparse('xml/{0}/client_feed.xml'.format(self.name)):
            if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
                for r in elem:
                    p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                    # Only products whose Id is on the wanted list are processed.
                    if r.tag == p + "Id" and r.text in self.no_urls['product_ids']:
                        index = self.no_urls['product_ids'].index(r.text)
                        self.no_urls['status'][index] = 'ran'
                        self.number += 1
                        urls = []
                        flag = 0  # NOTE(review): never read afterwards
                        for x in elem:
                            if x.tag == p + "Id":
                                xml_item['product_id'] = [x.text]
                            elif x.tag == p + "EngLongDesc" and x.text is not None:
                                xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "RetailPrice":
                                # Strips the last two characters of the raw price text.
                                xml_item['custom_price'] = [x.text[:-2]]
                            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                                xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "PartNumber":
                                xml_item['add_to_cart_id'] = [x.text]
                            elif x.tag == p + "MaxQty":
                                xml_item['max_qty'] = [x.text]
                            elif x.tag == p + "TimeType":
                                xml_item['time_type'] = [x.text]
                            elif x.tag == p + "SpnName" and x.text is not None:
                                xml_item['name_spanish'] = [x.text]
                            elif x.tag == p + "EngName":
                                xml_item['name_english'] = [x.text]
                            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                                urls.append(self.get_absolute(x.text))
                                xml_item['normal_image_url'] = [self.get_server_path(self.get_absolute(x.text))]
                            elif x.tag == p + "IsActive":
                                # NOTE(review): x.text is a string (or None), so
                                # `x.text == 0` can never be true and every product
                                # ends up IN_STOCK. Probably meant a string compare
                                # (e.g. x.text == "0") - confirm against the feed.
                                if x.text == 0:
                                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                                else:
                                    xml_item['in_stock'] = ['IN_STOCK']
                            else:
                                # Alternate1..3ImagePath_Large carry extra images.
                                for i in range(1, 4):
                                    tag = p + "Alternate%sImagePath_Large" % (str(i))
                                    if x.tag == tag and x.text is not None:
                                        urls.append(self.get_absolute(x.text))
                                        xml_item['normal_image_url'].append(self.get_server_path(self.get_absolute(x.text)))
                        # change image paths for normal_image_url and return urls
                        self.xml.create_xml(xml_item)
                        urls_all += urls
        # Anything still not marked 'ran' was missing from the feed.
        for i in range(0, len(self.no_urls['status'])):
            if self.no_urls['status'][i] != 'ran':
                self.no_urls['status'][i] = 'not_found'
        return urls_all

    def get_server_path(self, url):
        """Map an image url to the sha1-hashed path Scrapy stores it under."""
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        """Make a feed-relative image path absolute."""
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        """Unescape HTML entities twice (feed text is double-escaped)."""
        temp = HTMLParser.HTMLParser().unescape(string)
        return HTMLParser.HTMLParser().unescape(temp)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        if self.total - self.number:
            msg += "{0} id(s) from id list weren't found in feed".format(self.total - self.number)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # NOTE(review): bare except; if get_name failed, `filename` is
                # unset and write_xml below will raise.
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load product ids/names/urls from the excel feed into self.products."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Register the custom properties this spider writes to the xml."""
        xml.add_property("description_english", "Description English", "text")
        xml.add_property("description_spanish", "Description Spanish", "text")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("max_qty", "Max Quantity", "text")
        xml.add_property("time_type", "Time Type", "text")
        xml.add_property("name_english", "Name English", "text")
        xml.add_property("name_spanish", "Name Spanish", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("custom_price", "Custom Price", "text")
class PartyliteSpider(CrawlSpider): name = "partylite" allowed_domains = ["partylite.biz"] start_urls = ["http://www.zmags.com"] counter = 0 def __init__(self, *a, **kw): super(PartyliteSpider, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) terminal = PartyliteTerminal(sys.argv, self.name) self.d = terminal.get_arguments() self.images_store = "/" + settings['IMAGES_STORE'] self.users = party.get_users(settings, self.d) self.exc = ZmagsException(50) self.production = self.d['env'] self.upload = self.d['upload'] self.english = self.d['lang'] self.file_name = self.d['file'] if self.d['database']: self.database = Database() self.database.connect() self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id']) self.database.disconnect() self.change_url_list() else: self.get_lists_from_excel() self.xml = CommonXml() party.add_properties(self.xml) self.total = len(self.products['urls']) def parse(self, response): for url in self.products['urls']: if self.d['lang'] == 'us': request = Request(url, callback=self.parse_can, dont_filter=True) yield request elif self.d['lang'] == 'english': c_url = url.replace(self.users['us'], self.users['canada_en']) request = Request(c_url, callback=self.parse_can, dont_filter=True) request.meta['language'] = "eng" yield request elif self.d['lang'] == 'french': c_url = url.replace(self.users['us'], self.users['canada_fr']) request = Request(c_url, callback=self.parse_can, dont_filter=True) request.meta['language'] = "fr" yield request def change_url_list(self): for i in range(0, len(self.products['urls'])): if not self.production: self.products['urls'][i] = self.products['urls'][i].replace('www', 'qa') self.products['urls'][i] = self.products['urls'][i].replace('XXXXX', self.users['us']) def get_in_stock(self, hxs): """Gets in stock information about product.""" stock = hxs.select('//div[@id="availability_container"]').extract() if not stock: return ["IN_STOCK"] 
else: return ["NOT_IN_STOCK"] def get_basic_info(self, hxs): """Getting basic info about products (name, shown with).""" name = hxs.select('//div[@id="product_name"]/text()').extract() if name: name = basic.cdata_field(name) shown_with = hxs.select('//div[@id="shown_with_container"]').extract() if shown_with: shown_with = [basic.cdata(shown_with[0])] return name, shown_with def get_description(self, hxs): description = description = hxs.select('//div[@id="item_description"]').extract() description = [basic.cdata(basic.remove_tags(description[0]))] description = [description[0].replace(u"\u2044", "/")] return description def get_price(self, hxs): """Getting product prices. Gets regular and discount price if there is one.""" price = hxs.select('//span[@id="divUnitPrice"]/text()').extract() if not price: price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract() if not price: price = hxs.select('//div[@id="product_price"]/text()').extract() discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract() price = basic.clean_string(price[0]) price = re.sub(" +", " ", price) price = price.replace("Price:", "") price = price.replace("Prix:", "") price = basic.cdata(price.strip()) if discount: discount = basic.cdata_field(discount) return [price], discount def get_add_to_cart_id(self, page): """Gets add to cart id from the javascript on the page.""" tmp = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0] tmp = basic.get_middle_text(tmp, "addItemToCart(", ",") return tmp def create_subproducts(self, page): """Gets information about colors from javascript. Returns field of dicts with information about colors. 
Those are really color variants for product.""" try: tmp = page.split("var largeImages = new Array();")[1] except IndexError: print "This product has no images" else: tmp = tmp.split("colorDropdownArray")[0] images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");") image_names = self.get_image_names(page) color_products = [] for im in images: product = {} attributes = im.split("',") product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production" product['normal_image_url'] += self.custom_clean_string(attributes[26], True) product['description'] = basic.cdata(self.custom_clean_string(attributes[27])) product['color_id'] = self.custom_clean_string(attributes[7], True) product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", "")) product['name'] = basic.cdata(image_names[product['color_id']]) product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "") product['price'] = self.custom_clean_string(attributes[10], True) color_products.append(product) return color_products return [] def custom_clean_string(self, string, spaces=False): """Custom function for cleaning strings. Replaces new line, return and tab signs, also replaces multiple spaces with only one.""" string = string.replace("\r", "") string = string.replace("\n", "") string = string.replace("\t", "") if not spaces: string = re.sub(' +', ' ', string) else: string = re.sub(' ', '', string) string = string.replace("'", "") return string def get_image_names(self, page): """Gets color names for color swatches.""" temp = page.split("new DropDownInfo") names = {} for i in range(1, len(temp)): names[basic.get_middle_text(temp[i], "('", "'")[0]] = basic.get_middle_text(temp[i], "'", "')")[2] return names def get_recommended(self, hxs): """Gets recommended product information. 
Returns information about recommended products as dict""" rec = hxs.select('//div[@id="right_column_container"]/div') new = [] i = 0 for r in rec: d = {} #to do: see how to get full href(different accounts) if not i: d['link'] = r.select('div/a/@href').extract()[0] d['image'] = "http://www.partylite.biz/imaging/resize" d['image'] += r.select('div/a/img/@src').extract()[0] d['name'] = r.select('div/a/text()').extract()[0] new.append(basic.cdata(simplejson.dumps(d))) i += 1 return new def get_reviews(self, page): """Gets average product rating. Returns string like 4.6 of 5 reviews.""" id = self.get_review_id(page) url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + id + "/reviews.djs?format=embeddedhtml" url = url.replace(" ", "") page = urllib2.urlopen(url).read() page = basic.get_middle_text(page, '<div class=\\"BVRRRatingNormalImage\\">', '<\/div>') if page: rating = basic.get_middle_text(page[0], 'alt=\\"', '\\"')[0] return [rating] else: return [] def get_more_images(self, page): """Gets field of images.""" try: script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0] except IndexError: print "This product has no images." else: r = basic.get_middle_text(script, "moreImages[", "';") images = [] # return cdata here if needed to go with absolute links for i in range(0, len(r)): if self.production: images.append("http://www.partylite.biz" + r[i].split("= '")[1]) else: images.append("http://qa.partylite.biz" + r[i].split("= '")[1]) return images return [] def get_absolute(self, relatives): """Creates absolute path for images. [DEPRECATED] Please check if there is a need for this function again. 
If needed dimensions of images got from the client server can be changed here.""" new = [] print relatives os._exit(0) for i in range(0, len(relatives)): #add width, height here for different dimensions #don't change the url in here from qa to www it's meant to be qa always new.append("http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production" + relatives[i]) return new def get_review_id(self, page): """Gets review id that is used in javascript for reviews.""" return basic.get_middle_text(page, 'productId: "', '"')[0] def write_subproducts(self, id, list, xml): """Writes child products to xml. Receives id, list and xml attributes, id is master product id, list is list of child products and xml is Xml instance""" for i in range(0, len(list)): item = PartyliteItem() item['master_product_id'] = id item['product_id'] = [id[0] + "_" + str(i)] item['in_stock'] = ["IN_STOCK"] for k, v in list[i].iteritems(): item[k] = [v] xml.create_xml(item) return 1 def parse_can(self, response): """Parse function for scraping canadian sites. 
There is meta information send in request in this function about language.""" self.counter += 1 basic.print_status(self.counter, self.total) item = PartyliteItem() hxs = HtmlXPathSelector(response) image_urls = [] if 'redirect_urls' in response.request.meta: item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]] self.exc.code_handler(102, response.request.meta['redirect_urls']) if 'language' in response.request.meta: item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0] + "_can" + "_" + response.meta['language']] try: index = self.products['product_ids'].index(self.get_id (response.request.meta['redirect_urls'][0])[0]) item['name'] = [basic.cdata(item['product_id'][0] + self.products['names'][index])] self.products['status'][index] = 'no_avail' except KeyError as e: print "This %s id is not in list" % (item['product_id'][0]) item['in_stock'] = ['NOT_AVAILABLE'] item['product_id'] = self.remove_spaces(item['product_id']) self.xml.create_xml(item) else: index = self.products['product_ids'].index(self.get_id(response.url)[0]) try: item['product_id'] = self.get_id(response.url) item['name'], item['shown_with'] = self.get_basic_info(hxs) item['description'] = self.get_description(hxs) if 'language' in response.meta: item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']] response.meta['item'] = item page = " ".join(hxs.select('//html').extract()) image_urls = self.get_more_images(page) item['normal_image_url'] = self.get_server_path_field(image_urls) item['in_stock'] = self.get_in_stock(hxs) color_products = self.create_subproducts(page) if color_products: self.write_subproducts(item['product_id'], color_products, xml) else: item['add_to_cart_id'] = self.get_add_to_cart_id(page) item['custom_price'], item['custom_discount'] = self.get_price(hxs) self.products['status'][index] = "ran" except StandardError: basic.print_error() self.products['status'][index] = "error" 
self.exc.code_handler(100, response.url) else: item['product_id'] = self.remove_spaces(item['product_id']) self.xml.create_xml(item) if image_urls: item['image_urls'] = image_urls return item def spider_closed(self, spider): """Handles spider_closed signal from end of scraping. Handles usual end operations for scraper like writing xml, exporting to database and sending appropriate mail message.""" msg = party.get_settings_message(self.d) if self.counter < self.total: msg += "\nScraper didn't go through all products, please report" msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total) # filename for writing xml if self.d['database']: try: self.database.connect() filename = self.database.get_name(self.d['catalog_id']) self.database.update_db(self.products) self.database.disconnect() msg += "\nRan from interface.\n" except: msg += "\nUpdating database failed, please report." else: msg += "\nRan from console.\n" filename = self.d['file'] logname = filename filename = "{0}_{1}".format(filename, self.d['lang']) self.xml.write_xml(self.name, filename) msg += self.exc.create_message(self.counter) from modules.mail import Mail from modules.export_to_db import CommonExport exp = CommonExport() if self.upload: try: if self.d['lang'] == 'us': exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341") else: exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0") msg += "\n\nExport to database successful" except StandardError: msg += "\n\nExport to database failed" else: msg += "\n\nUpload to database not selected" mail = Mail() try: mail.send_mail(msg, "Partylite: {0}".format(filename)) if self.d['email']: mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email']) except: msg += "\nSending mail failed." 
if self.d['database']: path = 'logs/{0}'.format(self.name) if not os.path.exists(path): os.makedirs(path) with open("{0}/{1}".format(path, logname), 'w') as f: f.write(msg) def get_id(self, url): """Gets id from product url.""" return [url.split("&sku=")[1]] def get_server_path(self, url): """Gets server path for image url.""" url = url.split("partylite.biz")[1] return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg" def get_server_path_field(self, urls): """Getting server path for field of image urls.""" new = [] for url in urls: url = url.split("partylite.biz")[1] new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg") return new def remove_spaces(self, field): new = [] for i in field: new.append(i.replace(' ', '')) return new def get_lists_from_excel(self): excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file']) xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production) self.products = dict() try: self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15) self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15) self.products['names'] = xls.read_excel_collumn(2, 15) except IOError as e: msg = "I/O error {0}: {1}".format(e.errno, e.strerror) msg += "\nError occurred for given file: {0}".format(self.d['file']) exc.code_handler(103, msg=msg) except StandardError: msg = "Error reading excel file" msg += "\nError occurred for given file: {0}".format(self.d['file']) exc.code_handler(103, msg=msg) self.products= xls.delete_duplicates_dict(self.products) self.products, self.no_urls = xls.separate_no_urls(self.products) self.products = xls._add_none_status(self.products) self.no_urls = xls._add_none_status(self.no_urls)
class LydiasSpider(CrawlSpider):
    """Spider for Lydia's Uniforms product pages (color/size variant products)."""

    name = "lydias"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # products scraped so far

    def __init__(self, *a, **kw):
        super(LydiasSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = VariantsXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        # fix for bug with links they provide
        self.products['urls'] = basic.cut_string_field(self.products['urls'], "&cat=")
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        lydias.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        """Scrape one product page: master item, color children, size jsons."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = LydiasItem()
        # A redirect means we must look the product up by its original url.
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        id = self.products['product_ids'][index]
        try:
            # The "searchfor" div only appears on the not-found page.
            available = hxs.select('//div[@id="searchfor"]/text()').extract()
            if not available:
                item['product_id'] = [id]
                item['name'], item['price'], item['old_price'], item[
                    'description'] = self.get_basic_info(hxs)
                item['rating'], item['custom_rating'] = self.get_rating(hxs)
                chart = self.absolute_path(self.get_size_image(hxs))
                item['sizes_chart_image_url'] = self.get_server_path(chart)
                color_urls, color_names, product_image, color_codes = self.get_image_swatches(
                    hxs)
                color_urls = self.absolute_path(color_urls)
                item['color_image_url'] = self.make_colors_json(
                    color_urls, color_names, color_codes)
                item['in_stock'] = ["IN_STOCK"]
                item['embroidery'] = self.get_embroidery(hxs)
                default_images = self.absolute_path(self.get_extra_images(hxs))
                item['default_image_url'] = self.get_server_path(
                    default_images)
                self.xml.create_xml(item)
                product_image = self.absolute_path(product_image)
                self.create_subproducts(id, color_names, product_image,
                                        color_codes, hxs)
                item[
                    'image_urls'] = product_image + color_urls + chart + default_images
                self.products['status'][index] = "ran"
            else:
                self.exc.code_handler(102, response.url)
                item['product_id'] = [id]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.products['status'][index] = "not_avail"
                self.xml.create_xml(item)
        except:
            # NOTE(review): bare except also swallows programming errors.
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        return item

    # function for checking if product has embroidery or not
    def get_embroidery(self, hxs):
        page = hxs.select('//html').extract()[0]
        if "document.getElementById('logocolor').disabled = true;" in page:
            return ["True"]
        else:
            return ["False"]

    # function for creating json with all information for colors
    def make_colors_json(self, color_urls, color_names, color_codes):
        dict = {}
        jsons = []
        for i in range(0, len(color_urls)):
            dict['color_url'] = self.get_server_path_single(color_urls[i])
            dict['color_name'] = color_names[i]
            dict['color_short'] = color_codes[i]
            json = basic.cdata(simplejson.dumps(dict))
            jsons.append(json)
        return jsons

    # function for getting image server path
    def get_server_path_single(self, url):
        # return url
        return self.images_store + "/full/" + hashlib.sha1(
            url).hexdigest() + ".jpg"

    # function for getting image path for field of images
    def get_server_path(self, urls):
        # return urls
        new = []
        for url in urls:
            new.append(self.images_store + "/full/" +
                       hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    # function for getting basic information for product
    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
        price = hxs.select(
            '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
        ).extract()
        description = basic.cdata(
            hxs.select('//div[@id="details"]').extract()[0])
        description = basic.clean_string(description)
        old_price = hxs.select(
            '//span[@class="yourprice_product"]/text()').extract()
        if not price:
            price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
        # Strip everything but digits and the decimal point.
        if old_price:
            old_price = [re.sub('[^0-9.]', '', old_price[0])]
        price = [re.sub('[^0-9.]', '', price[0])]
        return name, price, old_price, [description]

    # function for getting rating, both number and sentence (e.g. Rating 5 out of 6 votes)
    def get_rating(self, hxs):
        temp = hxs.select('//div[@id="Customerssay"]/p[2]/text()').extract()
        if temp:
            rating = basic.get_middle_text(temp[0].replace(" ", ""),
                                           "Rating:", "out")
            return rating, temp
        else:
            return [], temp

    # function for getting reviews, returning rating and field of json reviews
    # or empty fields if there's no reviews
    def get_reviews(self, hxs):
        reviews = hxs.select('//div[@class="prodReview"]')
        if reviews:
            title = reviews[0].select(
                'p[@class="review_title"]/text()').extract()
            text = reviews[0].select(
                'p[@class="review_text"]/text()').extract()
            author = reviews[0].select(
                'p[@class="review_author"]/text()').extract()
            location = reviews[0].select(
                'p[@class="review_location"]/text()').extract()
            jsons = self.make_reviews_json(title, text, author, location)
            return jsons
        else:
            return []

    # function for making json for reviews
    # currently not in use, cause there are no reviews in DPW design
    def make_reviews_json(self, title, text, author, location):
        jsons = []
        print len(title)
        print len(text)
        print len(author)
        print len(location)
        # NOTE(review): debug exit - everything below never runs.
        os._exit(0)
        for i in range(0, len(title)):
            json = '{ "title" : " %s ", "text" : "%s", "author" : "%s", "location" :\
             "%s" }' % (title[i], text[i], author[i], location[i])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function for getting size chart image
    def get_size_image(self, hxs):
        temp = hxs.select(
            '//div[@class="TabbedPanelsContent cells"]/img/@src').extract()
        return temp

    # function for getting image swatches, returning fields (image urls, image
    # names, product color images, color codes)
    def get_image_swatches(self, hxs):
        colors = hxs.select('//div[@class="lolite"]')
        color_images = []
        color_names = []
        products_image = []
        color_codes = []
        for color in colors:
            color_images.append(color.select('a/img/@src').extract()[0])
            color_names.append(color.select('a/img/@alt').extract()[0])
            # if zoom image needed, this is the place to get it
            products_image.append(color.select('a/@rev').extract()[0])
            color_codes.append(
                color.select('a/@onclick').extract()[0].split(",")[1].replace(
                    "'", ""))
        return color_images, color_names, products_image, color_codes

    # function for getting additional images, returns field of images or empty field if there is no
    def get_extra_images(self, hxs):
        additional_images = hxs.select(
            '//div[@id="AddImg"]/script/text()').extract()
        if additional_images:
            temp = basic.get_middle_text(additional_images[0], '"', '"')
            thumb_images = temp[0].split(",")
            return thumb_images
        else:
            return []

    # function for getting product id from the page
    def get_product_id(self, hxs):
        temp = hxs.select('//div[@id="wrap"]/script/text()').extract()
        id = basic.get_middle_text(temp[0], 'productid","', '"')
        return id[0]

    # function for getting sizes from another url, returning field of jsons for sizes
    # one id from the page is 115NB, if needed here to hardcode for testing
    # currently not in use
    def get_sizes(self, id, hxs):
        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (
            id)
        url += "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (
            showmode, itemmode, salemode)
        jsons = []
        print "reading page..."
        page = urllib2.urlopen(url).read()
        print "page read"
        # The response is a javascript array; strip it down to CSV-ish text.
        page = page.replace("'", "")
        page = page.replace("[", ",")
        page = page.replace(",,", "")
        temp = page.split("]")
        for i in range(0, len(temp) - 2):
            tmp = temp[i].split(",")
            json = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
             "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
            json = basic.cdata(json)
            jsons.append(json)
        return jsons

    # function that handles creating subproducts, can be implemented for the usual way product for every combination
    # of size and color if needed
    def create_subproducts(self, id, color_names, product_image, color_codes,
                           hxs):
        item = LydiasItem()
        # if no colors for specific product do this part and call to creating size children with empty string instead
        # of actual color name
        if len(color_names) == 0:
            item['master_product_id'] = [id]
            item['product_id'] = [id + "_" + "0"]
            item['color'] = ["NO_COLOR"]
            item['custom_size'] = self.create_sizes_subproducts(
                id, id + "_" + "0", "", hxs)
            self.xml.create_xml(item)
        # for handling cases when there are color options for specific product, create child for every color, and call
        # for creating size children for every provided color
        else:
            for i in range(0, len(color_names)):
                print "name :" + color_names[i] + " code:" + color_codes[i]
                item['master_product_id'] = [id]
                item['product_id'] = [id + "_" + str(i)]
                item['color'] = [color_names[i]]
                item['color_short'] = [color_codes[i]]
                item['normal_image_url'] = self.get_server_path(
                    [product_image[i]])
                item['in_stock'] = ["IN_STOCK"]
                item['custom_size'] = self.create_sizes_subproducts(
                    id, id + "_" + str(i), color_codes[i], hxs)
                self.xml.create_xml(item)
                item.clear()
        return 0

    # function for creating child products for sizes
    # little messy with all the commented lines but those lines can be used if needed to go back to old way with
    # child products instead of json
    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        print color_code
        jsons = []
        # if block for cases when color is provided
        if color_code != "":
            showmode = hxs.select(
                '//input[@name="showmode"]/@value').extract()[0]
            itemmode = hxs.select(
                '//input[@name="itemmode"]/@value').extract()[0]
            salemode = hxs.select(
                '//input[@name="salemode"]/@value').extract()[0]
            url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
                  "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
            page = urllib2.urlopen(url).read()
            page = page.replace("'", "")
            page = page.replace("[", ",")
            page = page.replace(",,", "")
            temp = page.split("]")
            for i in range(0, len(temp) - 2):
                tmp = temp[i].split(",")
                item = {}
                # item['master_product_id'] = [id]
                item['size_short'] = tmp[0]
                item['price_url'] = self.get_size_price(
                    str(main_id), str(color_code), tmp[0])
                item['size'] = tmp[1]
                # item['product_id'] = [id + "_" + str(i)]
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
        # when the color is not provided different block of code cause it's done differently on the page
        else:
            temp = hxs.select('//div[@class="not_size"]/text()').extract()
            for i in range(0, len(temp)):
                item = {}
                # item['master_product_id'] = [id]
                # item['product_id'] = [id + "_" + str(i)]
                item['size_short'] = temp[i]
                item['price_url'] = self.get_size_price(
                    str(main_id), "", temp[i])
                # item['in_stock'] = ["IN_STOCK"]
                # xml.create_xml(item)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons
            # return 0

    # function for getting price for combination of every size and color, can return url where the price is, or can
    # parse that url to get that actual price but will drastically increase scraping time
    def get_size_price(self, id, color, size):
        if color != "":
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=388" % (str(id), str(color), size)
        else:
            url = "http://www.lydiasuniforms.com/ajaxed/product-showprice.asp?sku=%s %s&qty=1&itemmode=" \
                  "0&showmode=1&rnum=259" % (id, size)
        url = url.replace(" ", "%20")
        return url

    # just adding part for getting absolute paths for relative paths from page
    def absolute_path(self, urls):
        new = []
        for i in urls:
            new.append("http://www.lydiasuniforms.com" + i)
        return new

    # function used for getting embroidery information from clients page, was used only once to get it
    # cause embroidery is the same for all the products
    def get_emb(self, hxs):
        emb = hxs.select('//div[@id="emb"]').extract()
        lettering_colors = hxs.select(
            '//select[@id="threadcolor"]/option/@value').extract()
        urls = []
        d = {}
        colors = []
        for i in range(1, len(lettering_colors)):
            d['type'] = "lettering colors"
            d['name'] = lettering_colors[i]
            url = "http://www.lydiasuniforms.com/images/lydias/threadcolor_"
            url += lettering_colors[i].lower().replace(' ', '_') + ".gif"
            d['url'] = self.get_server_path_single(url)
            urls.append(url)
            colors.append(basic.cdata(simplejson.dumps(d)))
        lettering = hxs.select(
            '//select[@id="lettering"]/option/@value').extract()
        l = {}
        letterings = []
        for i in range(1, len(lettering)):
            l['type'] = "lettering"
            l['name'] = lettering[i]
            url = "http://www.lydiasuniforms.com/images/lydias/lettering_"
            url += lettering[i].lower().replace(' ', '_') + ".gif"
            l['url'] = self.get_server_path_single(url)
            letterings.append(basic.cdata(simplejson.dumps(l)))
            urls.append(url)
        logo = hxs.select('//select[@id="logoname"]/option/@value').extract()
        logos = {}
        log = []
        for i in range(1, len(logo)):
            logos['type'] = "logo"
            logos['name'] = logo[i]
            url = "http://www.lydiasuniforms.com/images/logos/"
            url += logo[i].lower() + ".jpg"
            logos['url'] = self.get_server_path_single(url)
            urls.append(url)
            log.append(basic.cdata(simplejson.dumps(logos)))
        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        # NOTE(review): `xml` is undefined in this scope (presumably self.xml),
        # and write_xml is called elsewhere as write_xml(name, filename) -
        # confirm before reusing this one-off helper.
        xml.create_xml(item)
        xml.write_xml("emb")
        return urls
        # NOTE(review): unreachable - everything after the return never runs.
        print colors, letterings, log
        os._exit(0)

    def handle_not_provided(self):
        """Write NOT_AVAILABLE stub items for products that came without urls."""
        item = LydiasItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = ""
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        #if self.d['upload']:
            #exp = CommonExport()
            #try:
                #exp.xml_to_db(self.name, filename, "4b0d6b52-7b05-4e54-9d87-dfe77ac270c9")
                #msg += "\n\nExport to database successful"
            #except StandardError:
                #msg += "\n\nExport to database failed"
        #else:
            #msg += "\n\nUpload to database not selected"
        ## part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Lydias: {0}".format(filename))
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Load product urls/ids/names from the excel feed into self.products.

        Unlike the sibling spiders, the post-processing here runs in the
        try/except `else` clause, i.e. only when reading succeeded."""
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        else:
            self.products = xls.delete_duplicates_dict(self.products)
            self.products, self.no_urls = xls.separate_no_urls(self.products)
            self.products = xls._add_none_status(self.products)
            self.no_urls = xls._add_none_status(self.no_urls)
class ChomeSpider(CrawlSpider): name = "chome" allowed_domains = ["zmags.com"] start_urls = ["http://www.zmags.com/"] counter = 0 def __init__(self, *a, **kw): super(ChomeSpider, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) terminal = DatabaseTerminal(sys.argv, self.name) self.d = terminal.get_arguments() self.xml = CommonXml() self.exc = ZmagsException(5) if self.d['database']: self.database = Database() self.database.connect() self.products, self.no_urls = self.database.select_products( self.d['catalog_id'], self.d['product_id']) self.database.disconnect() else: self.get_lists_from_excel() self.add_properties(self.xml) self.images_store = "/" + settings['IMAGES_STORE'] self.total = len(self.no_urls['product_ids']) def parse(self, response): self.counter += 1 hxs = HtmlXPathSelector(response) item = ChomeItem() print "IDs in excel feed: {0}".format(self.total) item['image_urls'] = self.parse_whole_xml() return item def parse_whole_xml(self): xml_dir = "xml/{0}".format(self.name) file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1" downloader = Downloader() if self.d['download']: downloader.get_file(xml_dir, file_url, "client_feed") else: if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)): basic.warning( "Feed file doesn't exist please de-select no download option" ) os._exit(2) self.number = 0 xml_item = ChomeItem() urls_all = [] for event, elem in iterparse('xml/{0}/client_feed.xml'.format( self.name)): if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties": for r in elem: p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}" if r.tag == p + "Id" and r.text in self.no_urls[ 'product_ids']: index = self.no_urls['product_ids'].index(r.text) self.no_urls['status'][index] = 'ran' self.number += 1 urls = [] flag = 0 for x in elem: if x.tag == p + "Id": xml_item['product_id'] = [x.text] elif x.tag == p + "EngLongDesc" and x.text is not None: 
xml_item['description_english'] = [ self.escape(basic.cdata(x.text)) ] elif x.tag == p + "RetailPrice": xml_item['custom_price'] = [x.text[:-2]] elif x.tag == p + "SpnLongDesc" and x.text is not None: xml_item['description_spanish'] = [ self.escape(basic.cdata(x.text)) ] elif x.tag == p + "PartNumber": xml_item['add_to_cart_id'] = [x.text] elif x.tag == p + "MaxQty": xml_item['max_qty'] = [x.text] elif x.tag == p + "TimeType": xml_item['time_type'] = [x.text] elif x.tag == p + "SpnName" and x.text is not None: xml_item['name_spanish'] = [x.text] elif x.tag == p + "EngName": xml_item['name_english'] = [x.text] elif x.tag == p + "ImagePath_Large" and x.text is not None: urls.append(self.get_absolute(x.text)) xml_item['normal_image_url'] = [ self.get_server_path( self.get_absolute(x.text)) ] elif x.tag == p + "IsActive": if x.text == 0: xml_item['in_stock'] = ["NOT_IN_STOCK"] else: xml_item['in_stock'] = ['IN_STOCK'] else: for i in range(1, 4): tag = p + "Alternate%sImagePath_Large" % ( str(i)) if x.tag == tag and x.text is not None: urls.append(self.get_absolute(x.text)) xml_item['normal_image_url'].append( self.get_server_path( self.get_absolute(x.text))) # change image paths for normal_image_url and return urls self.xml.create_xml(xml_item) urls_all += urls for i in range(0, len(self.no_urls['status'])): if self.no_urls['status'][i] != 'ran': self.no_urls['status'][i] = 'not_found' return urls_all def get_server_path(self, url): path = self.images_store + "/full/" + hashlib.sha1( url).hexdigest() + ".jpg" return path def get_absolute(self, url): return "http://www.celebratinghome.com/" + url def escape(self, string): temp = HTMLParser.HTMLParser().unescape(string) return HTMLParser.HTMLParser().unescape(temp) def spider_closed(self, spider): """Handles spider_closed signal from end of scraping. 
Handles usual end operations for scraper like writing xml, exporting to database and sending appropriate mail message.""" msg = "Ran: {0}\n".format(datetime.now()) if self.total - self.number: msg += "{0} id(s) from id list weren't found in feed".format( self.total - self.number) basic.warning(msg) else: msg += "All ids found in feed." basic.green(msg) # filename for writing xml if self.d['database']: try: self.database.connect() filename = self.database.get_name(self.d['catalog_id']) self.database.update_db(self.no_urls) self.database.disconnect() msg += "\nRan from interface.\n" except: msg += "\nUpdating database failed, please report." else: msg += "\nRan from console.\n" filename = self.d['file'] self.xml.write_xml(self.name, filename) msg += self.exc.create_message(self.counter) #if self.d['upload']: #exp = CommonExport() #try: #exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d") #msg += "\n\nExport to database successful" #except StandardError: #msg += "\n\nExport to database failed" #else: #msg += "\n\nUpload to database not selected" from modules.mail import Mail mail = Mail() try: mail.send_mail(msg, "CelebratingHome: {0}".format(filename)) if self.d['email']: mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email']) except: msg += "\nSending mail failed." 
if self.d['database']: path = "logs/{0}".format(self.name) if not os.path.exists(path): os.makedirs(path) with open("{0}/{1}".format(path, filename), 'w') as f: f.write(msg) def get_lists_from_excel(self): xls = DictExcel(basic.get_excel_path(self.name, self.d['file'])) self.products = dict() try: self.products['product_ids'] = xls.read_excel_collumn_for_ids( 1, 15) self.products['names'] = xls.read_excel_collumn(2, 15) self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15) except IOError as e: msg = "I/O error {0}: {1}".format(e.errno, e.strerror) msg += "\nError occurred for given file: {0}".format( self.d['file']) self.exc.code_handler(103, msg=msg) except StandardError: msg = "Error reading excel file" msg += "\nError occurred for given file: {0}".format( self.d['file']) self.exc.code_handler(103, msg=msg) self.products = xls.delete_duplicates_dict(self.products) self.products, self.no_urls = xls.separate_no_urls(self.products) self.products = xls._add_none_status(self.products) self.no_urls = xls._add_none_status(self.no_urls) def add_properties(self, xml): xml.add_property("description_english", "Description English", "text") xml.add_property("description_spanish", "Description Spanish", "text") xml.add_property("add_to_cart_id", "Add To Cart ID", "text") xml.add_property("max_qty", "Max Quantity", "text") xml.add_property("time_type", "Time Type", "text") xml.add_property("name_english", "Name English", "text") xml.add_property("name_spanish", "Name Spanish", "text") xml.add_property("in_stock", "In Stock", "text") xml.add_property("custom_price", "Custom Price", "text")
class BurtonSpider(CrawlSpider): name = "burton" allowed_domains = ["example.com"] start_urls = ["http://www.example.com"] counter = 0 def __init__(self, *a, **kw): super(BurtonSpider, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) terminal = DatabaseTerminal(sys.argv, self.name) self.d = terminal.get_arguments() self.xml = CommonXml() self.exc = ZmagsException(5, "Burton") if self.d['database']: self.database = Database() self.database.connect() self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id']) self.database.disconnect() else: self.get_lists_from_excel() self.handle_not_provided() burton.add_properties(self.xml) self.start_urls = self.products['urls'] self.start_urls = ["http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"] self.images_store = "/" + settings['IMAGES_STORE'] self.total = len(self.start_urls) def parse(self, response): self.counter += 1 basic.print_status(self.counter, self.total) hxs = HtmlXPathSelector(response) item = BurtonItem() page = hxs.extract() if 'redirect_urls' in response.request.meta: cur_url = response.request.meta['redirect_urls'][0] else: cur_url = response.url index = self.products['urls'].index(cur_url) try: if 'redirect_urls' in response.request.meta: item['product_id'] = [self.products['product_ids'][index]] item['name'] = [self.products['names'][index]] item['in_stock'] = ["NOT_AVAILABLE"] self.exc.code_handler(102, response.url) self.xml.create_xml(item) self.products["status"][index] = "no_avail" else: item['product_id'], item['name'] = self.get_basic_info(hxs) item['description'], item['features'] = self.get_description(hxs) item['variants'], thumb_urls, color_names = self.get_variants(page) item['all_sizes'] = self.get_all_sizes(page) item['color_json'], image_urls = self.get_colors(page, color_names) item['price'], item['old_price'] = self.get_prices(hxs) item['in_stock'] = ['IN_STOCK'] item['product_link'] = 
[basic.cdata(response.url)] self.xml.create_xml(item) item['image_urls'] = image_urls + thumb_urls self.products["status"][index] = "ran" except: self.exc.code_handler(100, response.url) self.products["status"][index] = "error" else: return item def handle_not_provided(self): item = BurtonItem() for n in self.no_urls['product_ids']: item['product_id'] = [n] index = self.no_urls['product_ids'].index(n) item['name'] = [self.no_urls['names'][index]] item['in_stock'] = ['NOT_AVAILABLE'] self.xml.create_xml(item) def get_basic_info(self, hxs): name = hxs.select('//h1[@class="productHeading"]/text()').extract() product_id = hxs.select('//input[@name="productId"]/@value').extract() return product_id, name def get_server_path(self, url): path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg" return path def get_prices(self, hxs): price = hxs.select('//div[@class="op"]/text()').extract() price = [basic.get_price(price[0])] old_price = hxs.select('//span[@class="lp"]/text()').extract() if old_price: old_price = [basic.get_price(old_price[0])] return price, old_price def get_description(self, hxs): description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3] features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract() if features: features = [features[0][:2000]] return [basic.cdata(description)], basic.cdata_field(features) def get_variants(self, page): """Gets jsons for colors with all available sizes. 
In json are also fetched all information for sizes that are on the site """ script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0] sizes = [] image_urls = [] color_names = [] colors = script.split('skuSizeColorObj') for c in range(1, len(colors)): temp = basic.get_middle_text(colors[c], '= ', ';') # delete swatch image as it obviously won't be needed t = simplejson.loads(burton.replace_for_json(temp[0])) image_urls.append(t['swatchURL']) color_names.append(t['ColorDesc']) t['swatchURL'] = self.get_server_path(t['swatchURL']) sizes.append(basic.cdata(simplejson.dumps(t))) return sizes, image_urls, color_names def get_all_sizes(self, page): script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0] all_sizes = basic.get_middle_text(script, ']="','";') return [basic.cdata(simplejson.dumps(all_sizes))] def get_colors(self, page, color_names): """Gets color information with images from javascript on the page. Returns json with color name and imagself.images_store = "/" + settings['IMAGES_STORE']e url for that color, and returnes filed of image urls that can be used for download later""" script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0] colors = basic.get_middle_text(script, '] = ', ';') image_urls = [] colors_json = [] for i in range(0, len(color_names)): color = burton.replace_color_json(colors[i]) color = simplejson.loads(color) color['cname'] = color_names[i] color.pop('reg') image_urls.append(color['enh']) color['enh'] = self.get_server_path(color['enh']) colors_json.append(basic.cdata(simplejson.dumps(color))) return colors_json, image_urls def spider_closed(self, spider): """Handles spider_closed signal from end of scraping. 
Handles usual end operations for scraper like writing xml, exporting to database and sending appropriate mail message.""" msg = "Ran: {0}".format(datetime.now()) if self.counter < self.total: msg += "\nScraper didn't go through all products, please report" msg += "\n\nScraped %d product out of %d\n\n" % (self.counter, self.total) # filename for writing xml if self.d['database']: try: self.database.connect() filename = self.database.get_name(self.d['catalog_id']) self.database.update_db(self.products) self.database.disconnect() msg += "\nRan from interface.\n" except: msg += "\nUpdating database failed, please report." else: msg += "\nRan from console.\n" filename = self.d['file'] self.xml.write_xml(self.name, filename) msg += self.exc.create_message(self.counter) if self.d['upload']: exp = CommonExport() try: exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574") msg += "\n\nExport to database successful" except StandardError: msg += "\n\nExport to database failed" else: msg += "\n\nUpload to database not selected" # part for exporting to database here from modules.mail import Mail mail = Mail() try: mail.send_mail(msg, "Burton: {0}".format(filename)) if self.d['email']: mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email']) except: msg += "\nSending mail failed." 
if self.d['database']: path = "logs/{0}".format(self.name) if not os.path.exists(path): os.makedirs(path) with open("{0}/{1}".format(path, filename), 'w') as f: f.write(msg) def get_lists_from_excel(self): xls = DictExcel(basic.get_excel_path(self.name, self.d['file'])) self.products = dict() try: self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15) self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15) self.products["names"] = xls.read_excel_collumn(2, 15) except IOError as e: msg = "I/O error {0}: {1}".format(e.errno, e.strerror) msg += "\nError occurred for given file: {0}".format(self.d['file']) self.exc.code_handler(103, msg=msg) except StandardError: msg = "Error reading excel file" msg += "\nError occurred for given file: {0}".format(self.d['file']) self.exc.code_handler(103, msg=msg) self.products= xls.delete_duplicates_dict(self.products) self.products, self.no_urls = xls.separate_no_urls(self.products) self.products = xls._add_none_status(self.products) self.no_urls = xls._add_none_status(self.no_urls)
class KennethSpider(CrawlSpider): name = "kenneth" allowed_domains = ["example.com"] start_urls = ["http://www.example.com"] counter = 0 def __init__(self, *a, **kw): super(KennethSpider, self).__init__(*a, **kw) dispatcher.connect(self.spider_closed, signals.spider_closed) terminal = DatabaseTerminal(sys.argv, self.name) self.images_store = "/" + settings['IMAGES_STORE'] + "/" self.d = terminal.get_arguments() self.xml = VariantsXml() self.exc = ZmagsException(5) print self.d if self.d['database']: self.database = Database() self.database.connect() self.products, self.no_urls = self.database.select_products(self.d['catalog_id'], self.d['product_id']) self.database.disconnect() else: self.get_lists_from_excel() self.add_properties(self.xml) self.no_url_products(self.no_urls) self.start_urls = self.products['urls'] self.total = len(self.start_urls) def parse(self, response): self.counter += 1 basic.print_status(self.counter, self.total) hxs = HtmlXPathSelector(response) item = KennethItem() #main try for script, run general except if error happens in code (send # url on mail where it happened) try: cur_url = response.url # search for noResultContent div on the page, if it exists keep # track, that product doesn't exist on # their page, otherwise continue scraping page available = hxs.select('//div[@id="noResultsContent"]').extract() if not available: index = self.products['urls'].index(cur_url) cur_id = self.get_product_id(cur_url) id = self.products['product_ids'][index] page = hxs.select('//div[@id="mainContent"]').extract() page = " ".join(page) item['name'], item['description'] = self.get_basic_info(hxs) price, new_p, old_p = self.get_prices(hxs) if new_p: item['new_price'] = new_p item['old_price'] = old_p else: item['price'] = price desc = basic.clean_string(item['description'][0]) item['description'] = [desc] urls = self.get_color_image(hxs) new = self.get_image_server_path(urls, id) item['color_image_urls'] = new self.export(item['color_image_urls'], [id], 
"swatchImage") jsons, images = self.we_also_recommend(cur_id, id) item['product_page'] = [cur_url] item['product_id'] = [id] item['add_to_cart_id'] = [cur_id] item['recommended_product'] = jsons item['in_stock'] = ["IN_STOCK"] self.products['status'][index] = "ran" images_or_404 = self.get_colors(hxs, page, id) if images_or_404 == 404: item['in_stock'] = ["NOT_AVAILABLE"] self.xml.create_xml(item) item['image_urls'] = [] if images_or_404 != 404: item['image_urls'] += images_or_404 item['image_urls'] += urls item['image_urls'] += images #self.export(item['image_urls']) #item['image_urls'] = [] #uncomment for donwloading images else: # part for handling products that are not available cur_id = self.get_product_id(cur_url) cur_url = "http://www.kennethcole.com/product/index.jsp?" cur_url += "productId=" + str(cur_id) index = self.products['urls'].index(cur_url) self.products['status'][index] = "no_avail" item['product_id'] = [self.products['product_ids'][index]] if self.products['product_ids'][index]: item['name'] = [self.products['names'][index]] else: item['name'] = ["not available"] item['in_stock'] = ["NOT_AVAILABLE"] self.xml.create_xml(item) self.exc.code_handler(102, cur_url) except: # part for catching errors and keeping track of numbers of # it and urls where it happened print "Error occured scraping this product" index = self.products['urls'].index(cur_url) self.products['status'][index] = "error" self.exc.code_handler(100, cur_url) return item def no_url_products(self, no_url): item = KennethItem() for n in no_url['product_ids']: item['product_id'] = [n] index = no_url['product_ids'].index(n) item['name'] = [no_url['names'][index]] item['in_stock'] = ['NOT_AVAILABLE'] self.xml.create_xml(item) #function for getting basic product info from the page def get_basic_info(self, hxs): name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract() description = basic.cdata(hxs.select('//div[@id="productDescription"]').extract()[0]) return name, [description] 
# function for getting prices from the page, nly one or new and old one if # that's the case def get_prices(self, hxs): price = hxs.select('//div[@id="productInfoTop"]/h2/text()').extract()[0] new_p = hxs.select('//h2[@class="sale-now"]/text()').extract() old_p = hxs.select('//span[@class="productGrey"]/text()').extract() price = re.sub('[^0-9.,]', '', price) return [price], new_p, old_p def get_color_image(self, hxs): return hxs.select('//div[@id="productInfoR2W"]/img/@src').extract() # function for gettng colors from javascript on the page, and writing them # in xml, from here is called function # for creating further sizes subproducts def get_colors(self, hxs, page, main_id): item = KennethItem() try: tmp = page.split('displays[0]')[1] except IndexError: print "This product is not available" return 404 script = tmp.split('</script>')[0] displays = script.split("};") global counter ids = [] images = [] color_ids = [] sizes_script = self.get_sizes_part_page(page) color_internal_code = {} for x in range(0, len(displays) - 1): id = basic.get_middle_text(displays[x], 'colorId: "', '"') ids.append(id[0]) reg = displays[x].count("Reg") images_in = [] for i in range(1, reg + 1): image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg: "', '"') if len(image) == 0: image = basic.get_middle_text(displays[x], "vw" + str(i) + 'Reg:"', '"') if (len(image) > 0): if (image[0] != "null"): images_in.append(image[0]) if not images_in: images_in = hxs.select('//input[@name="productImage"]/@value').extract() color_ids.append(str(main_id) + "_" + str(x)) item['product_id'] = [str(main_id) + "_" + str(x)] item['color_option_id'] = id item['master_product_id'] = [main_id] item['normal_image_url'] = self.get_image_server_path(images_in, main_id) item['thumb_image_url'] = self.get_image_server_path_thumb(images_in, main_id) item['in_stock'] = ["NOT_IN_STOCK"] item['color'] = self.get_color_name(sizes_script, id[0]) color_internal_code[id[0]] = str(x) self.xml.create_xml(item) 
images += images_in self.export(item['normal_image_url'], item['product_id'], "productImage") self.get_sizes(sizes_script, ids, main_id, color_internal_code) return images # function for getting sizes for products from javascript, and storing # information in dicts of format {id : information} def get_sizes(self, page, ids, main_id, color_internal_code): options = page.split("};") skus = {} colors_name = {} inStocks = {} sizes = {} prices = {} for x in range(0, len(options) - 1): id = basic.get_middle_text(options[x], 'cId: "', '"') for i in range(0, len(ids)): if (id[0] == ids[i]): sku = basic.get_middle_text(options[x], 'sku: ', ',s') sku = re.sub("[^0-9]", "", sku[0]) skus = self.add_to_dict(skus, ids[i], sku) size = basic.get_middle_text(options[x], 'sDesc: "', '"') sizes = self.add_to_dict(sizes, ids[i], size[0]) price = basic.get_middle_text(options[x], 'price: "', '"') price = self.clean_price(price[0]) prices = self.add_to_dict(prices, ids[i], price[0]) available = basic.get_middle_text(options[x], 'avail: "', '"') inStocks = self.add_to_dict(inStocks, ids[i], available[0]) self.create_subproducts_xml(main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices) return main_id, colors_name, sizes, skus, inStocks, prices # function for creating subproducts for every size def create_subproducts_xml(self, main_id, color_internal_code, colors_name, sizes, skus, inStocks, prices): number = 0 global counter for k, v in sizes.iteritems(): item = KennethItem() for i in range(0, len(v)): item['size'] = [v[i]] item['size_option_id'] = [skus[k][i]] m_id = main_id + "_" + color_internal_code[k] item['master_product_id'] = [m_id] id = m_id + "_" + str(i) item['product_id'] = [id] if inStocks[k][i] == "NOT_AVAILABLE": item['in_stock'] = ["NOT_IN_STOCK"] elif inStocks[k][i] == "ADVANCED_SALE_LIMITED": item['in_stock'] = ["IN_STOCK"] else: item['in_stock'] = [inStocks[k][i]] item['price'] = [prices[k][i]] #item['color'] = colors_name[k] 
self.xml.create_xml(item) number += 1 def add_to_dict(self, dict, index, value): try: dict[index].append(value) except: dict[index] = [value] return dict # function for getting we also recommend information about products from # their page, returns json list with information and images # list with images urls def we_also_recommend(self, id, main_id): url = "http://www.res-x.com/ws/r2/Resonance.aspx?appid=kennethcole01&t" url += "k=154212870918247&ss=525178103419747&sg=1&pg=897706724574618&b" url += "x=true&vr=2.67&sc=product_rr&ev=product&ei=" + id + "&cu=&ct=k" url += "ennethcolec01&no=3&cb=r1eh&clk=&cv1=" + id + "&cv23=63&ur=http%" url += "3A//www.kennethcole.com/product/index.jsp%3FproductId%3D3" + id url += "&plk=&rf=" import urllib2 page = urllib2.urlopen(url).read() temp = page.split("certonaRecBoxes") images = [] ids = [] names = [] prices = [] urls = [] # parsing data got from the upper url about we also recommend products for i in range(1, len(temp)): id = [basic.get_middle_text(temp[i], "d=", '\\"')[0]] image = basic.get_middle_text(temp[i], 'src=\\"', '\\"')[0] name = basic.get_middle_text(temp[i], 'alt=\\"', '\\"') price = basic.get_middle_text(temp[i], '<br>', '</a>') url = "http://www.kennethcole.com/product/index.jsp?productId=" url += id[0] urls.append(url) ids.append(id) names.append(name) prices.append(price) images.append(image) jsons = self.make_json(ids, names, prices, self.get_image_server_path(images, main_id), urls) return jsons, images # function for getting product id from the url def get_product_id(self, url): return url.split("=")[1] #function for making json def make_json(self, ids, names, prices, images, urls): jsons = [] for i in range(0, len(ids)): json = "{" + ' "id" : "' + str(ids[i][0]) + '", ' json += '"name" : "' + str(names[i][0]) + '", ' # insert function for storing the right image path json += '"image_url" : "' + str(images[i]) + '", ' json += '"product_url" : "' + urls[i] + '", ' json += '"price" : "' + str(prices[i][0]) + 
'" } ' json = basic.cdata(json) jsons.append(json) return jsons #function for getting javascript where sizes are handled def get_sizes_part_page(self, page): tmp = page.split("availDates = new Array();")[1] script = tmp.split("</script>")[0] return script # function for getting name of the color by id def get_color_name(self, script, id): temp = script.split(id) temp = temp[0].split('cDesc: "') temp = temp[len(temp) - 1] name = temp.split('"')[0] return [name] return {id: name} #function for exporting images to database via rest def export(self, images, id, tags): #set override to 0 for uploading images or else to skip uploading override = 1 if override == 0: import MultipartPostHandler import urllib2 import os url = 'http://api.admin.zmags.com/productImage/import?key=5ef90922-283b-4412-a1c8-3e70bc28b9d3' for i in range(0, len(images)): image_name = self.get_image_name(images[i]) path = "images/kenneth_images/small/" + str(image_name) params = {'file': file(path, 'rb'), 'product_id': id[0], 'index': str(i + 1), 'tags': tags} #token not working opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler) code = opener.open(url, params).getcode() if (code != 202): print ("Achtung") global images_number images_number += 1 print images_number print "Image uploaded to product " + id[0] else: #print "Image upload overriden.." pass #function for getting image name from url def get_image_server_path(self, urls, id): # print urls new = [] for url in urls: temp = url.split("/") new.append(self.images_store + id + "/full/" + temp[len(temp) - 1]) return new # function for getting image paths on our server def get_image_server_path_thumb(self, urls, id): new = [] for url in urls: temp = url.split("/") new.append(self.images_store + id + "/small/" + temp[len(temp) - 1]) return new def clean_price(self, price): return [re.sub('[^0-9.,]', '', price)] def spider_closed(self, spider): """Handles spider_closed signal from end of scraping. 
Handles usual end operations for scraper like writing xml, exporting to database and sending appropriate mail message.""" msg = "" if self.counter < self.total: msg += "\nScraper didn't go through all products, please report" msg += "\n\nScraped {0} product out of {1}\n\n".format(self.counter, self.total) # filename for writing xml if self.d['database']: try: self.database.connect() filename = self.database.get_name(self.d['catalog_id']) self.database.update_db(self.products) self.database.disconnect() msg += "\nRan from interface.\n" except: msg += "\nUpdating database failed, please report." else: msg += "\nRan from console.\n" filename = self.d['file'] self.xml.write_xml(self.name, filename) msg += self.exc.create_message(self.counter) if self.d['upload']: exp = CommonExport() #try: exp.xml_to_db(self.name, filename, "29eac9ea-8c57-4d22-baf4-3f1471dc3ab6") msg += "\n\nExport to database successful" #except StandardError: #msg += "\n\nExport to database failed" else: msg += "\n\nUpload to database not selected" from modules.mail import Mail mail = Mail() try: mail.send_mail(msg, "KennethCole: {0}".format(filename)) if self.d['email']: mail.send_mail(msg, "KennethCole: {0}".format(filename), self.d['email']) except: msg += "\nSending mail failed." 
if self.d['database']: path = 'logs/{0}'.format(self.name) if not os.path.exists(path): os.makedirs(path) with open("{0}/{1}".format(path, filename), 'w') as f: f.write(msg) def get_lists_from_excel(self): xls = DictExcel(basic.get_excel_path(self.name, self.d['file'])) self.products = dict() try: self.products['urls'] = xls.read_excel_collumn_for_urls(2, 2) self.products['product_ids'] = xls.read_excel_collumn_for_ids(0, 2) self.products['names'] = xls.read_excel_collumn(1, 2) except IOError as e: msg = "I/O error {0}: {1}".format(e.errno, e.strerror) msg += "\nError occurred for given file: {0}".format(self.d['file']) self.exc.code_handler(103, msg=msg) except StandardError: msg = "Error reading excel file" msg += "\nError occurred for given file: {0}".format(self.d['file']) self.exc.code_handler(103, msg=msg) self.products = xls.delete_duplicates_dict(self.products) self.products, self.no_urls = xls.separate_no_urls(self.products) self.products = xls._add_none_status(self.products) self.no_urls = xls._add_none_status(self.no_urls) def add_properties(self, xml): xml.add_property("add_to_cart_id", "Add To Cart Id", "text") xml.add_property("product_page", "Product page", "text") xml.add_property("color_image_urls", "Color Image URLs", "text_list") xml.add_property("color_option_id", "Color Option ID", "text") xml.add_property("recommended_product", "Recommended Product", "text_list") xml.add_property("size_option_id", "Size Option ID", "text") xml.add_property("in_stock", "In Stock", "text") xml.add_property("old_price", "Old Price", "text") xml.add_property("new_price", "New Price", "text")
class SportmanSpider(CrawlSpider):
    """Scrapy spider for sportmann.no product pages.

    Reads the product list either from a database (interface mode) or from an
    Excel file (console mode), scrapes each product page, writes the results
    to an XML feed, and reports status by mail when the spider closes.

    NOTE(review): allowed_domains/start_urls say example.com while every
    request body below targets www.sportmann.no — looks like a placeholder;
    confirm against the deployed configuration.
    """
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0  # number of product pages processed so far

    def __init__(self, *a, **kw):
        """Wire up the spider_closed signal and load the product list."""
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()  # parsed run arguments (database/file/upload/email/...)
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")
        if self.d['database']:
            # Interface mode: product list comes from the database.
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # Console mode: product list comes from an Excel file.
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        """Scrape one product page into a SportmanItem and append it to the XML feed.

        A redirected URL is treated as "product no longer available"; otherwise
        the full product record (prices, viewstate chunks, variants, images) is
        extracted.  Any failure marks the product's status as "error".
        """
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        # Redirects are matched back to the originally requested URL so the
        # index into self.products stays correct.
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                # Redirect ⇒ product page gone: emit a minimal NOT_AVAILABLE record.
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item["name"], item["short_desc"], item["description"], item[
                    "old_price"], item["custom_price"], item[
                    "product_id"], item["sku"] = self.get_basic_info(hxs)
                item['in_stock'] = ['IN_STOCK']
                viewstate, eventval, prevpage, hidden, view_page, even_page, pre_page, hidd_page = self.get_vars(
                    response, hxs)
                # The ASP.NET __VIEWSTATE blob is split into 2000-char chunks —
                # presumably a length limit on individual XML property values;
                # TODO confirm against the feed consumer.
                viewstate1 = viewstate[:2000]
                viewstate2 = viewstate[2000:4000]
                viewstate3 = viewstate[4000:6000]
                viewstate4 = viewstate[6000:8000]
                viewstate5 = viewstate[8000:10000]
                viewstate6 = viewstate[10000:]
                item["viewstate1"] = [basic.cdata(viewstate1)]
                item["viewstate2"] = [basic.cdata(viewstate2)]
                item["viewstate3"] = [basic.cdata(viewstate3)]
                item["viewstate4"] = [basic.cdata(viewstate4)]
                item["viewstate5"] = [basic.cdata(viewstate5)]
                item["viewstate6"] = [basic.cdata(viewstate6)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)
                images_url = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(images_url)
                self.xml.create_xml(item)
                item.clear()
                # NOTE(review): get_images() is called a second time here; the
                # result of the first call (images_url) could be reused.
                item['image_urls'] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            # Broad catch keeps the crawl alive on any per-product failure;
            # the error is routed through the shared exception handler.
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        """Extract name, descriptions, prices, article number and SKU from the page.

        Returns (name, short_desc, description, old_price, price, id, sku),
        each as a list (or list-like) as expected by the item fields.
        Prices are pulled from 'oldprice'/'nowprice' spans, falling back to
        'normalprice' when no sale price is present; 'Kr' and whitespace are
        stripped from the 'label: value' text.
        """
        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select(
            '//div[@class="description2"]/text()').extract()
        description = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = sportman.delete_tags(re, description[0])
        description = [basic.cdata(description)]
        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if (old_price != []):
            old_price = " ".join(old_price)
            old_price = old_price.split(':')
            old_price = old_price[1].replace('Kr', '')
            old_price = [old_price.replace(" ", "")]
        else:
            # No old price on the page: keep the empty list as-is.
            old_price = old_price
        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if (price != []):
            price = " ".join(price)
            price = price.split(':')
            price = price[1].replace('Kr', '')
            price = [price.replace(" ", "")]
        else:
            # No discount: the regular price lives in 'normalprice'.
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
            price = " ".join(price)
            price = price.split(':')
            price = price[1].replace('Kr', '')
            price = [price.replace(" ", "")]
        # Article number ("Art.nr.") doubles as product id and SKU; \xa0 is the
        # non-breaking space the site uses inside the div.
        id = hxs.select('//div[@class="articlenumber"]').extract()
        id = " ".join(id)
        id = id.replace(u"\xa0", "")
        id = basic.get_middle_text(id, 'Art.nr.', '</div>')
        sku = id
        id = [id[0]]
        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        """Collect the ASP.NET form state needed to POST variant selections.

        Returns the page's __VIEWSTATE, __EVENTVALIDATION, __PREVIOUSPAGE and
        ScriptManager hidden-field values, both as parsed from the Scrapy
        response (first four) and as re-fetched live via requests/urllib2
        (last four).  basic.get_middle_text presumably returns the list of
        substrings between the two markers — TODO confirm.
        """
        headers1 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Host': 'www.sportmann.no',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': '/product.aspx?productid=613232',
            # NOTE(review): hard-coded session cookie from a past crawl —
            # almost certainly stale; verify whether the site still accepts it.
            'Cookie': 'ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
        }
        # Values parsed from the already-downloaded Scrapy response.
        page = hxs.select('//html').extract()
        page = " ".join(page)
        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="',
                                         '"')
        prevpage = [""]
        hidden_field = [""]
        # Fresh fetch of the same URL to obtain a live form state.
        r = requests.get(response.url, headers=headers1)
        page_one = r.content
        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="',
                                            '"')
        eventval_page = basic.get_middle_text(
            page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(page_one,
                                              'id="__PREVIOUSPAGE" value="', '"')
        # The ScriptManager hidden-field value is loaded by a <script src=...>
        # between __VIEWSTATE and __PREVIOUSPAGE; extract that script URL.
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split('<script sr')
        val_x = len(hidden_temp) - 1
        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0]
        hidden_temp_val = hidden_temp_val.replace('amp;', '')
        hidden_url = "http://www.sportmann.no" + hidden_temp_val
        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(),
            "ctl00_ScriptManager1_HiddenField').value += '", "';")
        return viewst[0], eventval[0], prevpage[0], hidden_field[
            0], viewst_page[0], eventval_page[0], prevpage_page[
            0], hidd_page_or_hidden_field_page[0] if False else hidden_field_page[0]

    def get_variants(self, hxs, response):
        """Build the size_options property: one CDATA-wrapped JSON blob per color.

        Handles four page layouts: single color vs. color <select>, and single
        size vs. size <select>.  When the size block is empty, each color value
        is POSTed back (via get_data) to fetch that color's sizes.
        """
        page = hxs.select('//html').extract()
        page = " ".join(page)
        dict_one = {}
        test_one = []
        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')
        # NOTE(review): only view_page/even_page/pre_page/hidd_page are used
        # below; the first four returns are unused here.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs)
        if (len(temp) == 1):
            # No <select>: single fixed color.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color[0] = color[0].replace(" ", "")
            color = basic.clean_string(color[0])
            value = value[0]
            # color = basic.clean_string(color[0])
            # color = color.replace(" ","")
            #
            # dict['color'] = color
            # dict['color_value'] = value[0]
        else:
            # Color dropdown: parse options after the "farge" placeholder.
            test_color = basic.get_middle_text(temp[1], 'farge</option>',
                                               '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')
            for i in range(0, len(color)):
                color[i] = color[i].replace(" ", "")
            #
            # dict['color'] = color
            # dict['color_value'] = value
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')
        if (len(size_temp) == 1):
            # No size <select>: single fixed size (possibly empty).
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace(" ", "")]
            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()
            if size[0] == "":
                # Sizes are color-dependent: POST each color value back to the
                # page and scrape the returned size block.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])
                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')
                    if len(a_page) == 1:
                        size = basic.get_middle_text(a_page[0], 'e">',
                                                     '<input type="hidden"')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]
                    else:
                        a_page = basic.get_middle_text(a_page[0], 'se</option>',
                                                       '</select>')
                        size = basic.get_middle_text(a_page[0], '">',
                                                     '</option>')
                        size_val = basic.get_middle_text(
                            a_page[0], 'value="', '">')
                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val
                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace(" ", "")
                    dict_one["size"] = size
                    test_one.append(basic.cdata(json.dumps(dict_one)))
            else:
                dict_one["color"] = color
                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                # NOTE(review): simplejson here vs json elsewhere — output is
                # equivalent, but the inconsistency looks accidental.
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))
        else:
            # Size dropdown: parse options after the "...se" placeholder.
            test_size = basic.get_middle_text(size_temp[1], 'se</option>',
                                              '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')
            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace(" ", "")
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val
            test_one.append(basic.cdata(json.dumps(dict_one)))
        return test_one

    def get_server_path(self, url):
        """Map image URLs to their expected local paths under IMAGES_STORE.

        Uses the SHA1-of-URL naming scheme Scrapy's image pipeline uses for
        downloaded files.  Note: mutates the passed-in list in place
        (clean_string on each element).
        """
        images_array = []
        for i in range(0, len(url)):
            url[i] = basic.clean_string(url[i])
            images_array.append(self.images_store + "/full/" +
                                hashlib.sha1(url[i]).hexdigest() + ".jpg")
        return images_array

    def get_images(self, hxs):
        """Return absolute URLs of the gallery images on the product page."""
        page = hxs.select('//html').extract()
        page = " ".join(page)
        images = []
        temp = page.split('class="gallery_demo_unstyled"')
        temp = temp[1].split('<div class="right_container">')
        temp = basic.get_middle_text(temp[0], 'src="', '"')
        for i in range(0, len(temp)):
            image_url = "http://www.sportmann.no" + temp[i]
            images.append(image_url)
        return images

    def get_data(self, response, hidden, viewstate, previouspage,
                 eventvalidation, colorvalue):
        """POST an ASP.NET AJAX variant-selection request; return the raw response body.

        Re-encodes the form-state values and submits the ddlVariant partial
        postback for the given color value, emulating the site's
        ScriptManager/__ASYNCPOST request.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Host': 'www.sportmann.no',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': 'http://www.sportmann.no/product.aspx?productid=613232',
            'Cookie': ''
        }
        # urlencode a one-entry dict to get "NAME=percent-encoded-value".
        eventvalidation = urllib.urlencode(
            {"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})
        # Pre-encoded postback body captured from the live site; only the form
        # state and the selected color value vary between requests.
        data = "ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel%7Cctl00%24ContentPlaceHolder1%24ddlVariant&" + hidden + "%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0%3B%3BAjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916%3A3858419b%3A96741c43%3A38ec41c0&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant&__EVENTARGUMENT=&__LASTFOCUS=&" + viewstate + "&" + previouspage + "&" + eventvalidation + "&ctl00%24ProductSearch%24txtProdSearch=&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState=&ctl00%24ContentPlaceHolder1%24ddlVariant=" + colorvalue + "&ctl00%24ContentPlaceHolder1%24Variant1Hidden=&ctl00%24ContentPlaceHolder1%24Variant2Hidden=&ctl00%24ContentPlaceHolder1%24tbAmount=1&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment=&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState=&__ASYNCPOST=true&"
        #r = requests.get(response.url, h)
        req = urllib2.Request(response.url, data, headers)
        resp_page = urllib2.urlopen(req).read()
        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message.
        """
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d product out of %d\n\n" % (self.counter,
                                                         self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                # Best-effort: report the failure in the mail instead of
                # crashing shutdown.  NOTE(review): 'filename' stays unset on
                # this path and write_xml below would raise.
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                # Catalog GUID is spider-specific.
                exp.xml_to_db(self.name, filename,
                              "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Sportmann: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        # Interface runs also persist the status message to a log file.
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        """Populate self.products (urls/product_ids/names) from the input Excel file.

        Console-mode counterpart of the database branch in __init__: reads the
        workbook, drops duplicates, splits out rows without URLs into
        self.no_urls, and seeds both dicts with a None 'status'.
        """
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            # Spider-specific layout: URLs in column 3, IDs in column 1,
            # names in column 2, data starting at row 15.
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(
                1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(
                self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        """Declare the custom product properties this spider emits in its XML feed."""
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size_options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product",
                         "text_list")