# Consolidated imports for the spider excerpts below (Python 2 / Scrapy 0.x era).
# Project-specific helpers (Database, DatabaseTerminal, CommonXml, ZmagsException,
# DictExcel, CommonExport, basic, sportman, burton, party, and the item classes)
# come from the project's own modules and are omitted here.
import sys
import os
import re
import json
import hashlib
import urllib
import urllib2
from datetime import datetime

import requests
import simplejson
from scrapy.contrib.spiders import CrawlSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.conf import settings


def __init__(self, *a, **kw):
    super(ChomeSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    terminal = DatabaseTerminal(sys.argv, self.name)
    self.d = terminal.get_arguments()
    self.xml = CommonXml()
    self.exc = ZmagsException(5)
    if self.d['database']:
        self.database = Database()
        self.database.connect()
        self.products, self.no_urls = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.database.disconnect()
    else:
        self.get_lists_from_excel()
    self.add_properties(self.xml)
    self.images_store = "/" + settings['IMAGES_STORE']
    self.total = len(self.no_urls['product_ids'])
def __init__(self, *a, **kw):
    super(PartyliteSpider, self).__init__(*a, **kw)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    terminal = PartyliteTerminal(sys.argv, self.name)
    self.d = terminal.get_arguments()
    self.images_store = "/" + settings['IMAGES_STORE']
    self.users = party.get_users(settings, self.d)
    self.exc = ZmagsException(50)
    self.production = self.d['env']
    self.upload = self.d['upload']
    self.english = self.d['lang']
    self.file_name = self.d['file']
    if self.d['database']:
        self.database = Database()
        self.database.connect()
        self.products, self.no_urls = self.database.select_products(
            self.d['catalog_id'], self.d['product_id'])
        self.database.disconnect()
        self.change_url_list()
    else:
        self.get_lists_from_excel()
    self.xml = CommonXml()
    party.add_properties(self.xml)
    self.total = len(self.products['urls'])
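# The __init__ bodies above (and in the full spiders below) repeat the same
# bootstrap almost verbatim. A minimal sketch of how the shared part could be
# factored into a common base class; `BaseProductSpider` is a hypothetical
# name, every helper is used exactly as in the originals.
class BaseProductSpider(CrawlSpider):

    def __init__(self, *a, **kw):
        super(BaseProductSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.d = DatabaseTerminal(sys.argv, self.name).get_arguments()
        self.xml = CommonXml()
        if self.d['database']:
            # products come straight from the database selection
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            # otherwise they are read from the excel file given on the console
            self.get_lists_from_excel()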
class SportmanSpider(CrawlSpider):
    name = "sportman"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(SportmanSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Sportmann")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.start_urls = self.products['urls']
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                # redirected products are marked as unavailable
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (item['name'], item['short_desc'], item['description'],
                 item['old_price'], item['custom_price'], item['product_id'],
                 item['sku']) = self.get_basic_info(hxs)
                item['in_stock'] = ['IN_STOCK']
                (viewstate, eventval, prevpage, hidden, view_page, even_page,
                 pre_page, hidd_page) = self.get_vars(response, hxs)
                # the viewstate is too long for a single property, so it is
                # shipped in 2000-character chunks
                item['viewstate1'] = [basic.cdata(viewstate[:2000])]
                item['viewstate2'] = [basic.cdata(viewstate[2000:4000])]
                item['viewstate3'] = [basic.cdata(viewstate[4000:6000])]
                item['viewstate4'] = [basic.cdata(viewstate[6000:8000])]
                item['viewstate5'] = [basic.cdata(viewstate[8000:10000])]
                item['viewstate6'] = [basic.cdata(viewstate[10000:])]
                item['eventval'] = [basic.cdata(eventval)]
                item['size_options'] = self.get_variants(hxs, response)
                images_url = self.get_images(hxs)
                item['normal_image_url'] = self.get_server_path(images_url)
                self.xml.create_xml(item)
                item.clear()
                item['image_urls'] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def get_basic_info(self, hxs):
        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()
        description = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = sportman.delete_tags(re, description[0])
        description = [basic.cdata(description)]
        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price:
            old_price = " ".join(old_price)
            old_price = old_price.split(':')[1].replace('Kr', '')
            old_price = [old_price.replace(" ", "")]
        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if not price:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = " ".join(price)
        price = price.split(':')[1].replace('Kr', '')
        price = [price.replace(" ", "")]
        id = hxs.select('//div[@class="articlenumber"]').extract()
        id = " ".join(id)
        id = id.replace(u"\xa0", "")
        id = basic.get_middle_text(id, 'Art.nr.', '</div>')
        sku = id
        id = [id[0]]
        return name, short_desc, description, old_price, price, id, sku

    def get_vars(self, response, hxs):
        headers1 = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:13.0) Gecko/20100101 Firefox/13.0.1',
            'Host': 'www.sportmann.no',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': '/product.aspx?productid=613232',
            'Cookie': 'ASP.NET_SessionId=lurvsvrn3jxsfd45cedmsv45; Besok=922884e3-e9cb-4b69-b8c8-215f3cc988a9; __utma=184084580.1353376623.1312483243.1312483243.1312483243.1; __utmb=184084580.9.10.1312483243; __utmc=184084580; __utmz=184084580.1312483243.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
        }
        page = " ".join(hxs.select('//html').extract())
        viewst = basic.get_middle_text(page, 'id="__VIEWSTATE" value="', '"')
        eventval = basic.get_middle_text(page, 'id="__EVENTVALIDATION" value="', '"')
        prevpage = [""]
        hidden_field = [""]
        # fetch the page again outside scrapy to get fresh ASP.NET state
        r = requests.get(response.url, headers=headers1)
        page_one = r.content
        viewst_page = basic.get_middle_text(page_one, 'id="__VIEWSTATE" value="', '"')
        eventval_page = basic.get_middle_text(
            page_one, 'id="__EVENTVALIDATION" value="', '"')
        prevpage_page = basic.get_middle_text(
            page_one, 'id="__PREVIOUSPAGE" value="', '"')
        hidden_temp = page_one.split('id="__VIEWSTATE"')
        hidden_temp = hidden_temp[1].split('id="__PREVIOUSPAGE"')
        hidden_temp = hidden_temp[0].split('<script sr')
        val_x = len(hidden_temp) - 1
        hidden_temp = basic.get_middle_text(hidden_temp[val_x], 'c="', '"')
        hidden_temp_val = hidden_temp[0].replace('amp;', '')
        hidden_url = "http://www.sportmann.no" + hidden_temp_val
        request_hidden = urllib2.Request(hidden_url)
        response_hidden = urllib2.urlopen(request_hidden)
        hidden_field_page = basic.get_middle_text(
            response_hidden.read(),
            "ctl00_ScriptManager1_HiddenField').value += '", "';")
        return (viewst[0], eventval[0], prevpage[0], hidden_field[0],
                viewst_page[0], eventval_page[0], prevpage_page[0],
                hidden_field_page[0])

    def get_variants(self, hxs, response):
        page = " ".join(hxs.select('//html').extract())
        dict_one = {}
        test_one = []
        temp = page.split('<div class="color">')
        temp = temp[1].split('</div>')
        temp = temp[0].split('<select name')
        (viewstate, eventvalidation, previouspage, hiddenfield, view_page,
         even_page, pre_page, hidd_page) = self.get_vars(response, hxs)
        if len(temp) == 1:
            # single color: the value sits in a hidden input
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value'
            ).extract()
            color[0] = color[0].replace(" ", "")
            color = basic.clean_string(color[0])
            value = value[0]
        else:
            # color dropdown: parse the options out of the select markup
            test_color = basic.get_middle_text(temp[1], 'farge</option>', '</select>')
            color = basic.get_middle_text(test_color[0], '">', '</option>')
            value = basic.get_middle_text(test_color[0], 'value="', '">')
            for i in range(0, len(color)):
                color[i] = color[i].replace(" ", "")
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split('</div>')
        size_temp = size_temp[0].split('<select name')
        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace(" ", "")]
            size_val = hxs.select(
                '//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value'
            ).extract()
            if size[0] == "":
                # no size on the page: post back once per color to fetch
                # the size options for that color
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page,
                                              pre_page, even_page, value[i])
                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split('</select>')
                    if len(a_page) == 1:
                        size = basic.get_middle_text(
                            a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = [size_val[0]]
                    else:
                        a_page = basic.get_middle_text(
                            a_page[0], 'se</option>', '</select>')
                        size = basic.get_middle_text(a_page[0], '">', '</option>')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')
                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val
                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace(" ", "")
                    dict_one["size"] = size
                    test_one.append(basic.cdata(json.dumps(dict_one)))
            else:
                dict_one["color"] = color
                dict_one["color_value"] = value
                dict_one['size'] = size
                dict_one['size_value'] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))
        else:
            test_size = basic.get_middle_text(size_temp[1], 'se</option>', '</select>')
            size = basic.get_middle_text(test_size[0], '">', '</option>')
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')
            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace(" ", "")
            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one['size'] = size
            dict_one['size_value'] = size_val
            test_one.append(basic.cdata(json.dumps(dict_one)))
        return test_one

    def get_server_path(self, url):
        images_array = []
        for i in range(0, len(url)):
            url[i] = basic.clean_string(url[i])
            images_array.append(self.images_store + "/full/" +
                                hashlib.sha1(url[i]).hexdigest() + ".jpg")
        return images_array

    def get_images(self, hxs):
        page = " ".join(hxs.select('//html').extract())
        images = []
        temp = page.split('class="gallery_demo_unstyled"')
        temp = temp[1].split('<div class="right_container">')
        temp = basic.get_middle_text(temp[0], 'src="', '"')
        for i in range(0, len(temp)):
            images.append("http://www.sportmann.no" + temp[i])
        return images

    def get_data(self, response, hidden, viewstate, previouspage,
                 eventvalidation, colorvalue):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0',
            'Host': 'www.sportmann.no',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-us,en;q=0.5',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Connection': 'keep-alive',
            'Referer': 'http://www.sportmann.no/product.aspx?productid=613232',
            'Cookie': ''
        }
        eventvalidation = urllib.urlencode({"__EVENTVALIDATION": eventvalidation})
        viewstate = urllib.urlencode({"__VIEWSTATE": viewstate})
        previouspage = urllib.urlencode({"__PREVIOUSPAGE": previouspage})
        hidden = urllib.urlencode({"ctl00_ScriptManager1_HiddenField": hidden})
        # ASP.NET async postback payload; the AjaxControlToolkit token list
        # is repeated three times in the original request
        toolkit = ("AjaxControlToolkit%2C%20Version%3D3.0.20820.16598%2C%20"
                   "Culture%3Dneutral%2C%20PublicKeyToken%3D28f01b0e84b6d53e"
                   "%3Aen-US%3A707835dd-fa4b-41d1-89e7-6df5d518ffb5%3Ae2e86ef9"
                   "%3A1df13a87%3A8ccd9c1b%3A9ea3f0e2%3A9e8e87e9%3A4c9865be"
                   "%3Aba594826%3A757f92c2%3Ac7c04611%3Acd120801%3Ac4c00916"
                   "%3A3858419b%3A96741c43%3A38ec41c0")
        data = ("ctl00%24ScriptManager1=ctl00%24ContentPlaceHolder1%24dropdownPanel"
                "%7Cctl00%24ContentPlaceHolder1%24ddlVariant&" + hidden +
                "%3B%3B" + toolkit + "%3B%3B" + toolkit + "%3B%3B" + toolkit +
                "&__EVENTTARGET=ctl00%24ContentPlaceHolder1%24ddlVariant"
                "&__EVENTARGUMENT=&__LASTFOCUS=&" +
                viewstate + "&" + previouspage + "&" + eventvalidation +
                "&ctl00%24ProductSearch%24txtProdSearch="
                "&ctl00%24ProductSearch%24TextBoxWatermarkProdSearch_ClientState="
                "&ctl00%24ContentPlaceHolder1%24ddlVariant=" + colorvalue +
                "&ctl00%24ContentPlaceHolder1%24Variant1Hidden="
                "&ctl00%24ContentPlaceHolder1%24Variant2Hidden="
                "&ctl00%24ContentPlaceHolder1%24tbAmount=1"
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsName="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsName_ClientState="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtFriendsEmail="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceFriendsEmail_ClientState="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourName="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourName_ClientState="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtYourEmail="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceYourEmail_ClientState="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24txtComment="
                "&ctl00%24ContentPlaceHolder1%24modTellFriend%24tellFriend%24vceComment_ClientState="
                "&__ASYNCPOST=true&")
        req = urllib2.Request(response.url, data, headers)
        resp_page = urllib2.urlopen(req).read()
        return resp_page

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles the usual end operations for the scraper: writing xml,
        exporting to the database and sending an appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "1ccd39a5-af4e-47cc-aebe-e0dede5b14d8")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Sportmann: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Sportmann: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("short_desc", "Short Description", "text")
        xml.add_property("old_price", "Old Price", "text")
        xml.add_property("custom_price", "New Price", "text")
        xml.add_property("color_value", "Color Value", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("size_val", "Size Value", "text_list")
        xml.add_property("sku", "Sku", "text")
        xml.add_property("size_options", "Size_options", "text_list")
        xml.add_property("viewstate1", "Viewstate1", "text_list")
        xml.add_property("viewstate2", "Viewstate2", "text_list")
        xml.add_property("viewstate3", "Viewstate3", "text_list")
        xml.add_property("viewstate4", "Viewstate4", "text_list")
        xml.add_property("viewstate5", "Viewstate5", "text_list")
        xml.add_property("viewstate6", "Viewstate6", "text_list")
        xml.add_property("eventval", "Eventval", "text_list")
        xml.add_property("hidden", "Hidden Field", "text_list")
        xml.add_property("prevpage", "Previous Page", "text_list")
        xml.add_property("recommended_product", "Recommended Product", "text_list")
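# The six fixed viewstate slices in SportmanSpider.parse() could be produced
# by a small helper instead of hand-written bounds; a sketch (the helper name
# and default size are illustrative, not part of the original code):
def chunk_string(s, size=2000):
    """Split s into consecutive chunks of at most `size` characters."""
    return [s[i:i + size] for i in range(0, len(s), size)]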
class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d["database"]:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d["catalog_id"], self.d["product_id"])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products["urls"]
        self.total = len(self.products["urls"])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        from scrapy.conf import settings
        if "redirect_urls" in response.request.meta:
            cur_url = response.request.meta["redirect_urls"][0]
        else:
            cur_url = response.url
        index = self.products["urls"].index(cur_url)
        try:
            item["product_id"] = [self.products["product_ids"][index]]
            item["name"], item["brand"] = self.get_basic_info(hxs)
            (item["heading"], item["details"], item["specs"],
             item["call_to_action"]) = self.get_description(hxs)
            (item["brand_image"], item["brand_image_promo"],
             brand_images) = self.get_description_images(hxs)
            item["old_price"], item["discount"], item["price"] = self.get_prices(hxs)
            item["image_json"], img = self.get_images(hxs)
            item["serial"] = self.get_serials(hxs)
            item["warranty"] = self.gold_coverage(hxs)
            item["in_stock"] = self.get_available(hxs)
            item["product_ref"], item["add_to_cart_id"] = self.get_add_to_cart(hxs)
            if not item["add_to_cart_id"]:
                item["in_stock"] = ["NOT_AVAILABLE"]
            item["shipping"] = self.get_shipping(hxs)
            item["colors"] = self.get_colors(hxs)
            self.products["status"][index] = "ran"
        except StandardError:
            self.products["status"][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item["image_urls"] = img + brand_images
            return item

    def handle_not_provided(self):
        item = GuitarCenterItem()
        for n in self.no_urls["product_ids"]:
            item["product_id"] = [n]
            index = self.no_urls["product_ids"].index(n)
            item["name"] = [self.no_urls["names"][index]]
            item["in_stock"] = ["NOT_AVAILABLE"]
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return (basic.cdata_field(heading), basic.cdata_field(details),
                basic.cdata_field(specs), basic.cdata_field(last))

    # returns tags and values for prices, or an empty field when one of them
    # is missing; "discount" carries the newest price reduction
    def get_prices(self, hxs):
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
            try:
                discount = [basic.clean_string(value[len(value) - 1])]
            except IndexError:
                print "This product has no price."
        try:
            price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        except IndexError:
            print "This product has no price."
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return (self.clean_price(old_price), self.clean_price(discount),
                self.clean_price(price))

    # returns jsons with the image url and the serial number of the product
    # each image refers to
    def get_images(self, hxs):
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d["image_url"] = self.get_server_path(images[i])
            img.append(images[i])
            if "site1sku" in tags[i]:
                d["product_serial"] = tags[i].replace("site1sku", "")
            else:
                d["product_serial"] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # returns a field of jsons with all information about serials; can be
    # modified to return dicts if subproducts are ever needed
    def get_serials(self, hxs):
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        # NOTE: the early return keeps the absolute image path from their
        # site; remove it to use the hashed local image-store path below
        return url
        return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # "gold coverage" on the page is actually additional warranty options
    def gold_coverage(self, hxs):
        ids = hxs.select(
            '//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d["id"] = ids[i]
            d["name"] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # availability; products with per-color stock options have no
    # availability var on the page, so the product itself is marked IN_STOCK
    def get_available(self, hxs):
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            p = ["IN_STOCK"]
        return p

    # returns the product reference and add-to-cart id
    def get_add_to_cart(self, hxs):
        try:
            temp = hxs.select(
                '//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # shipping information
    def get_shipping(self, hxs):
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # returns jsons with all the data about color options
    def get_colors(self, hxs):
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # strip everything but digits and the decimal point from prices
    def clean_price(self, price):
        new = []
        for i in price:
            new.append(re.sub("[^0-9.]", "", i))
        return new

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles the usual end operations for the scraper: writing xml,
        exporting to the database and sending an appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d["database"]:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d["catalog_id"])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d["file"]
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d["upload"]:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d["email"]:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename),
                               self.d["email"])
        except:
            msg += "\nSending mail failed."
        if self.d["database"]:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), "w") as f:
                f.write(msg)

    def add_properties(self, xml):
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d["file"]))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d["file"])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
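# A quick illustration of clean_price, which strips everything but digits and
# the decimal point (the input values are made up):
#
#     >>> spider.clean_price(["$1,299.00", "Save 20%"])
#     ['1299.00', '20']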
class BurtonSpider(CrawlSpider):
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        self.start_urls = self.products['urls']
        # NOTE: leftover debugging override; this pins the run to a single
        # hardcoded URL instead of the product list built above
        self.start_urls = [
            "http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"]
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        item = BurtonItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        return (self.images_store + "/full/" +
                hashlib.sha1(url).hexdigest() + ".jpg")

    def get_prices(self, hxs):
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.

        The jsons also carry all the size information found on the site."""
        script = basic.get_middle_text(
            page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        colors = script.split('skuSizeColorObj')
        for c in range(1, len(colors)):
            temp = basic.get_middle_text(colors[c], '= ', ';')
            # rewrite the swatch image to a local path, as the original
            # location obviously won't be needed
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        script = basic.get_middle_text(
            page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.

        Returns jsons with the color name and image url for each color, and
        a field of image urls that can be used for download later."""
        script = basic.get_middle_text(
            page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i in range(0, len(color_names)):
            color = burton.replace_color_json(colors[i])
            color = simplejson.loads(color)
            color['cname'] = color_names[i]
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles the usual end operations for the scraper: writing xml,
        exporting to the database and sending an appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename,
                              "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename),
                               self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
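# get_server_path above appears to mirror the naming scheme of Scrapy's
# images pipeline, which stores each download under
# full/<sha1 of the source url>.jpg, so the xml can predict where a
# downloaded image will land; a minimal check with a made-up URL:
#
#     url = "http://www.example.com/board.jpg"
#     print "/images/full/" + hashlib.sha1(url).hexdigest() + ".jpg"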
class GuitarCenterSpider(CrawlSpider):
    name = "guitar_center"
    allowed_domains = ["musiciansfriend.com"]
    start_urls = ["http://www.musiciansfriend.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(GuitarCenterSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = GuitarCenterItem()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'], item['brand'] = self.get_basic_info(hxs)
            item['heading'], item['details'], item['specs'], item['call_to_action'] = self.get_description(hxs)
            item['brand_image'], item['brand_image_promo'], brand_images = self.get_description_images(hxs)
            item['old_price'], item['discount'], item['price'] = self.get_prices(hxs)
            item['image_json'], img = self.get_images(hxs)
            item['serial'] = self.get_serials(hxs)
            item['warranty'] = self.gold_coverage(hxs)
            item['in_stock'] = self.get_available(hxs)
            item['product_ref'], item['add_to_cart_id'] = self.get_add_to_cart(hxs)
            if not item['add_to_cart_id']:
                item['in_stock'] = ["NOT_AVAILABLE"]
            item['shipping'] = self.get_shipping(hxs)
            item['colors'] = self.get_colors(hxs)
            self.products['status'][index] = "ran"
        except StandardError:
            self.products['status'][index] = "error"
            self.exc.code_handler(100, response.url)
        else:
            self.xml.create_xml(item)
            item['image_urls'] = img + brand_images
            return item

    def handle_not_provided(self):
        item = GuitarCenterItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="fn"]/text()').extract()
        name = [basic.clean_string("".join(name))]
        brand = hxs.select('//span[@class="brand"]/text()').extract()
        name = [name[0].replace(u"\xa0", "")]
        return name, brand

    def get_description_images(self, hxs):
        brand_image = hxs.select('//a[@class="brandImage"]/img/@src').extract()
        brand_image_promo = hxs.select('//div[@class="brandPromoLogo"]/img/@src').extract()
        images = brand_image + brand_image_promo
        if brand_image:
            brand_image = [self.get_server_path(brand_image[0])]
        if brand_image_promo:
            brand_image_promo = [self.get_server_path(brand_image_promo[0])]
        return brand_image, brand_image_promo, images

    def get_description(self, hxs):
        heading = hxs.select('//div[@id="description"]/p').extract()
        details = hxs.select('//p[@class="description"]').extract()
        specs = hxs.select('//div[@class="specs"]/ul').extract()
        last = hxs.select('//div[@class="callToAction"]/p/text()').extract()
        return basic.cdata_field(heading), basic.cdata_field(details), \
            basic.cdata_field(specs), basic.cdata_field(last)

    # Function for getting prices; returns tags and values, or an empty
    # field when one of them is missing ("new" price is the discount).
    def get_prices(self, hxs):
        tag = hxs.select('//dl[@class="lineItemList"]/dt/text()').extract()
        value = hxs.select('//dl[@class="lineItemList"]/dd/text()').extract()
        old_price = []
        discount = []
        price = []
        if len(tag) > 1:
            old_price = [basic.clean_string(value[0])]
            try:
                discount = [basic.clean_string(value[len(value) - 1])]
            except IndexError:
                print "This product has no price."
        try:
            price = hxs.select('//span[@class="topAlignedPrice"]/text()').extract()
        except IndexError:
            print "This product has no price."
        if not old_price and not discount and not price:
            price = hxs.select('//dl[@class="inlineList"]/dd/text()').extract()
        return self.clean_price(old_price), self.clean_price(discount), self.clean_price(price)

    # Returns jsons with image url and the serial number of the product the
    # image refers to.
    def get_images(self, hxs):
        images = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
        tags = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
        images_list = []
        d = {}
        img = []
        for i in range(0, len(images)):
            d['image_url'] = self.get_server_path(images[i])
            img.append(images[i])
            if "site1sku" in tags[i]:
                d['product_serial'] = tags[i].replace("site1sku", "")
            else:
                d['product_serial'] = tags[i]
            images_list.append(basic.cdata(simplejson.dumps(d)))
        return images_list, img

    # Function for getting serials and all information about them. Currently
    # returns a field of jsons with all information; can be modified to
    # return dicts if subproducts are ever needed.
    def get_serials(self, hxs):
        serials = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
        new = []
        for serial in serials:
            d = simplejson.loads(serial)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    def get_server_path(self, url):
        # uncomment the next line to keep the absolute image path from their site
        # return url
        return IMAGES_STORE + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    # Function for getting gold coverage from the page, which is actually
    # the additional warranty options.
    def gold_coverage(self, hxs):
        ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
        labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
        d = {}
        new = []
        for i in range(0, len(ids)):
            d['id'] = ids[i]
            d['name'] = labels[i]
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # Function for getting availability.
    def get_available(self, hxs):
        p = hxs.select('//var[@class="hidden availability"]/text()').extract()
        if p:
            if p[0] == "in_stock":
                p = [p[0].upper()]
        else:
            # Products with color options carry an in stock status per option
            # and have no availability var on the page, so mark the product
            # itself IN_STOCK.
            p = ["IN_STOCK"]
        return p

    # Function for getting the add to cart id and product reference.
    def get_add_to_cart(self, hxs):
        try:
            temp = hxs.select('//span[@class="magicLink addToList"]/@data-rel').extract()[0]
        except:
            print "Product not available"
        else:
            return [temp.split("|")[0]], [temp.split("|")[1]]
        return [], []

    # Function for getting shipping information.
    def get_shipping(self, hxs):
        return hxs.select('//div[@id="targeter_pdpShipping"]/span/text()').extract()

    # Function for getting colors; returns jsons with all the data about options.
    def get_colors(self, hxs):
        colors = hxs.select('//var[@class="styleInfo"]/text()').extract()
        new = []
        for color in colors:
            d = simplejson.loads(color)
            new.append(basic.cdata(simplejson.dumps(d)))
        return new

    # Cleaning price to leave only numbers.
    def clean_price(self, price):
        new = []
        for i in price:
            new.append(re.sub('[^0-9.]', '', i))
        return new

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4a9f5955-9b8e-4e13-84ef-95f937dbc00d")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "GuitarCenter: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "GuitarCenter: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def add_properties(self, xml):
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("image_json", "Image Json", "text_list")
        xml.add_property("discount", "Discount", "decimal")
        xml.add_property("product_ref", "Product Ref.", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("serial", "Serial", "text_list")
        xml.add_property("colors", "Colors", "text_list")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("shipping", "Shipping", "text")
        xml.add_property("warranty", "Warranty", "text_list")
        xml.add_property("heading", "Heading", "text")
        xml.add_property("details", "Details", "text")
        xml.add_property("specs", "Specs", "text")
        xml.add_property("call_to_action", "Call To Action", "text")
        xml.add_property("brand_image", "Brand Image", "text")
        xml.add_property("brand_image_promo", "Brand Image Promo", "text")

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
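
# A minimal sketch of the path convention get_server_path relies on: older
# Scrapy image pipelines stored a downloaded file under
# <IMAGES_STORE>/full/<sha1 of the source url>.jpg, so the spider can predict
# the local path before the download happens. The store root below is an
# illustrative placeholder, not a value from this project.
import hashlib

EXAMPLE_IMAGES_STORE = "images/guitar_center"  # assumed setting for the sketch


def predicted_image_path(url):
    """Mirror the pipeline's sha1 naming so XML can reference images up front."""
    return EXAMPLE_IMAGES_STORE + "/full/" + hashlib.sha1(url.encode("utf-8")).hexdigest() + ".jpg"

# usage:
# predicted_image_path("http://example.com/img/guitar.jpg")
# -> 'images/guitar_center/full/<40 hex chars>.jpg'
# (the spider code above passes a py2 str directly to hashlib.sha1)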
class BurtonSpider(CrawlSpider):
    name = "burton"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(BurtonSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5, "Burton")
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.handle_not_provided()
        burton.add_properties(self.xml)
        self.start_urls = self.products['urls']
        # hardcoded test URL left over from debugging; keep it commented out
        # so the product list from the database/excel is actually used
        # self.start_urls = ["http://www.dickssportinggoods.com/product/index.jsp?productId=13243074"]
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.start_urls)

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BurtonItem()
        page = hxs.extract()
        if 'redirect_urls' in response.request.meta:
            cur_url = response.request.meta['redirect_urls'][0]
        else:
            cur_url = response.url
        index = self.products['urls'].index(cur_url)
        try:
            if 'redirect_urls' in response.request.meta:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                item['product_id'], item['name'] = self.get_basic_info(hxs)
                item['description'], item['features'] = self.get_description(hxs)
                item['variants'], thumb_urls, color_names = self.get_variants(page)
                item['all_sizes'] = self.get_all_sizes(page)
                item['color_json'], image_urls = self.get_colors(page, color_names)
                item['price'], item['old_price'] = self.get_prices(hxs)
                item['in_stock'] = ['IN_STOCK']
                item['product_link'] = [basic.cdata(response.url)]
                self.xml.create_xml(item)
                item['image_urls'] = image_urls + thumb_urls
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

    def handle_not_provided(self):
        item = BurtonItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_basic_info(self, hxs):
        name = hxs.select('//h1[@class="productHeading"]/text()').extract()
        product_id = hxs.select('//input[@name="productId"]/@value').extract()
        return product_id, name

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_prices(self, hxs):
        price = hxs.select('//div[@class="op"]/text()').extract()
        price = [basic.get_price(price[0])]
        old_price = hxs.select('//span[@class="lp"]/text()').extract()
        if old_price:
            old_price = [basic.get_price(old_price[0])]
        return price, old_price

    def get_description(self, hxs):
        description = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()[3]
        features = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
        if features:
            features = [features[0][:2000]]
        return [basic.cdata(description)], basic.cdata_field(features)

    def get_variants(self, page):
        """Gets jsons for colors with all available sizes.

        The json also carries all the information for the sizes listed on
        the site."""
        script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
        sizes = []
        image_urls = []
        color_names = []
        colors = script.split('skuSizeColorObj')
        for c in range(1, len(colors)):
            temp = basic.get_middle_text(colors[c], '= ', ';')
            t = simplejson.loads(burton.replace_for_json(temp[0]))
            # collect the swatch image url for download, then replace it with
            # the local server path inside the json
            image_urls.append(t['swatchURL'])
            color_names.append(t['ColorDesc'])
            t['swatchURL'] = self.get_server_path(t['swatchURL'])
            sizes.append(basic.cdata(simplejson.dumps(t)))
        return sizes, image_urls, color_names

    def get_all_sizes(self, page):
        script = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')[0]
        all_sizes = basic.get_middle_text(script, ']="', '";')
        return [basic.cdata(simplejson.dumps(all_sizes))]

    def get_colors(self, page, color_names):
        """Gets color information with images from javascript on the page.

        Returns jsons with color name and image url for each color, and a
        field of image urls that can be used for download later."""
        script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
        colors = basic.get_middle_text(script, '] = ', ';')
        image_urls = []
        colors_json = []
        for i in range(0, len(color_names)):
            color = burton.replace_color_json(colors[i])
            color = simplejson.loads(color)
            color['cname'] = color_names[i]
            color.pop('reg')
            image_urls.append(color['enh'])
            color['enh'] = self.get_server_path(color['enh'])
            colors_json.append(basic.cdata(simplejson.dumps(color)))
        return colors_json, image_urls

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "4ea95a81-90fb-49e2-837e-acf5ab58f574")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Burton: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Burton: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products["urls"] = xls.read_excel_collumn_for_urls(3, 15)
            self.products["product_ids"] = xls.read_excel_collumn_for_ids(1, 15)
            self.products["names"] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
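
# The Burton methods lean heavily on basic.get_middle_text to slice
# javascript out of the raw page. That helper isn't shown in this file; a
# plausible minimal version, returning every occurrence between a start and
# an end marker, could look like this (name and behavior are assumptions
# inferred from how it is called above):
def get_middle_text(text, start, end):
    """Return a list of substrings found between start and end markers."""
    pieces = []
    pos = 0
    while True:
        s = text.find(start, pos)
        if s == -1:
            break
        s += len(start)
        e = text.find(end, s)
        if e == -1:
            break
        pieces.append(text[s:e])
        pos = e + len(end)
    return pieces

# usage:
# get_middle_text("var a = 1; var b = 2;", "= ", ";") -> ['1', '2']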
class BootsSpider(CrawlSpider):
    name = "boots"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(BootsSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.handle_not_provided()
        self.start_urls = self.products['urls']
        self.total = len(self.products['urls'])

    def parse(self, response):
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = BootsItem()
        item['product_id'], item['store_id'], item['lang_id'], item['catalog_id'] = self.get_ids(hxs)
        item['name'] = self.get_name(hxs)
        (item['short_description'], sponsored, description, in_stock,
         item['ingredients'], patient_information_url, item['offer'],
         item['promotion']) = self.get_description(hxs)
        item['rating'] = self.get_rating(hxs)
        size, price_per_size = self.get_size(hxs)
        item['normal_image_url'], image_urls = self.get_images(hxs)
        brand, brand_image_url = self.get_brand(hxs)
        item['save_money'], item['old_price'] = self.get_oldies(hxs)
        for i in range(0, len(description)):
            tag = 'description_%d' % (i + 1)
            item[tag] = [basic.cdata(description[i])]
        if sponsored is not None:
            item['sponsored'] = sponsored
        item['in_stock'] = ["NOT_IN_STOCK"]
        if in_stock == "In stock":
            item['in_stock'] = ["IN_STOCK"]
        item['order_id'] = hxs.select('//input[@name="orderId"]/@value').extract()
        item['cat_entry_id'] = hxs.select('//input[@name="catEntryId"]/@value').extract()
        item['calculation_usage_id'] = hxs.select('//input[@name="calculationUsageId"]/@value').extract()
        if brand_image_url is not None:
            item['brand'] = brand
            item['brand_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(brand_image_url)]
            image_urls.append(brand_image_url)
        if patient_information_url is not None:
            item['patient_information_url'] = [basic.cdata(patient_information_url)]
        prices, point_prices, collect_points, colors, color_image_urls, variant_ids = self.get_color_variants(hxs)
        if size is not None:
            item['size'] = size
            item['price_per_size'] = price_per_size
        elif variant_ids is None:
            prices, point_prices, collect_points, sizes, variant_ids = self.get_size_variants(hxs)
        if color_image_urls is not None:
            image_urls.extend(color_image_urls)
        if variant_ids is not None:
            self.xml.create_xml(item)
            if colors is not None:
                self.create_color_variants(prices, point_prices, colors, color_image_urls,
                                           variant_ids, collect_points, item['product_id'])
            else:
                self.create_size_variants(prices, point_prices, sizes, variant_ids,
                                          collect_points, item['product_id'])
        else:
            prices = hxs.select('//p[@class="price"]/text()').extract()[0]
            point_prices = hxs.select('//span[@class="pointsPrice"]/text()').extract()[0]
            collect_points = [basic.get_price(hxs.select('//p[@class="collectPoints"]/text()').extract()[0])]
            item['price'] = [basic.get_price(prices)]
            item['points_price'] = [basic.get_price(point_prices)]
            item['collect_points'] = collect_points
            self.xml.create_xml(item)
        item['image_urls'] = image_urls
        return item

    def handle_not_provided(self):
        item = BootsItem()
        for n in self.no_urls['product_ids']:
            item['product_id'] = [n]
            index = self.no_urls['product_ids'].index(n)
            item['name'] = [self.no_urls['names'][index]]
            item['in_stock'] = ['NOT_AVAILABLE']
            self.xml.create_xml(item)

    def get_ids(self, hxs):
        product_id = hxs.select('//input[@name="productId"]/@value').extract()[0]
        store_id = hxs.select('//input[@name="storeId"]/@value').extract()[0]
        lang_id = hxs.select('//input[@name="langId"]/@value').extract()[0]
        catalog_id = hxs.select('//input[@name="catalogId"]/@value').extract()[0]
        return [product_id], [store_id], [lang_id], [catalog_id]

    def get_name(self, hxs):
        name = hxs.select('//span[@class="pd_productNameSpan"]/text()').extract()[0]
        return [name]

    def get_description(self, hxs):
        short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
        try:
            suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
            short_description += suitable_for
        except:
            print "There's no suitable_for section"
        try:
            ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
            if ingredients != '':
                ingredients = basic.cdata(ingredients)
        except:
            print "No ingredients found!"
            ingredients = None
        try:
            patient_information_url = hxs.select('//div[@class="downloadMedia"]//a/@href').extract()[0]
        except:
            print "No patient information found!"
            patient_information_url = None
        try:
            offer = hxs.select('//div[@id="mainOffer"]//a/text()').extract()[0]
        except:
            print "No special offer found!"
            offer = None
        try:
            promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
        except:
            print "No promotion found!"
            promotion = None
        try:
            sponsored = hxs.select('//div[@class="sponsored"]//p/text()').extract()[0]
        except:
            print "No sponsor message found!"
            sponsored = None
        description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
        description = basic.clean_string(description)
        # split long descriptions into 2000 character chunks; plain slicing
        # avoids dropping a character at each chunk boundary
        description_overflow = len(description) / 2000
        desc = []
        if description_overflow > 0:
            for i in range(0, description_overflow + 1):
                desc.append(description[2000 * i:2000 * (i + 1)])
        else:
            desc = [description]
        try:
            in_stock = hxs.select('//div[@class="icon_pl_stock"]/text()').extract()[0]
        except:
            in_stock = ""
        return [basic.cdata(basic.clean_string(short_description))], [sponsored], desc, in_stock, \
            [ingredients], patient_information_url, [offer], promotion

    def get_images(self, hxs):
        image_urls = []
        normal_image_url = hxs.select('//meta[@property="og:image"]//@content').extract()[0]
        image_urls.append(normal_image_url)
        normal_image_url = "43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(normal_image_url)
        return [normal_image_url], image_urls

    def get_brand(self, hxs):
        try:
            brand = hxs.select('//div[@class="pd_brand"]//div//a//span//img/@alt').extract()[0]
            brand_image_url = hxs.select('//div[@class="pd_brand"]//div//a//span//img/@src').extract()[0]
            return [brand], brand_image_url
        except:
            print "No brand name or image found!"
            return None, None

    def get_rating(self, hxs):
        try:
            rating = hxs.select('//span[@property="v:average"]/text()').extract()[0]
        except:
            rating = "0.0"
        return [rating]

    def get_size(self, hxs):
        try:
            size = hxs.select('//span[@class="size"]/text()').extract()[0]
            size = basic.clean_string(size)
            size = size.replace("|", "")
            price_per_size = hxs.select('//span[@class="pricePerSize"]/text()').extract()[0]
            return [size], [price_per_size]
        except:
            print "No size found"
            return None, None

    def get_oldies(self, hxs):
        try:
            save = hxs.select('//span[@class="save"]/text()').extract()[0]
            old = hxs.select('//span[@class="oldPrice"]/text()').extract()[0]
            save = basic.get_price(save)
            old = basic.get_price(old)
        except:
            save = None
            old = None
        return [save], [old]

    def get_color_variants(self, hxs):
        try:
            variants = hxs.select('//script').re('productCode:\".*\d\"')[0].split(",")
            colors = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//span/text()').extract()
            color_image_urls = hxs.select('//div[@class="gp_80-20a column"]//div[@class="innerColumn"]//fieldset//div//label//img//@src').extract()
            collect_points = []
            prices = []
            point_prices = []
            variant_ids = []
            for i in range(0, len(variants), 8):
                price = basic.get_price(variants[i + 2])
                prices.append(price)
                points = str(int(float(price) * 100))
                point_prices.append(points)
                variant_id = basic.get_price(variants[i])
                variant_ids.append(variant_id)
                points = basic.get_price(variants[i + 5])
                collect_points.append(points)
            return prices, point_prices, collect_points, colors, color_image_urls, variant_ids
        except:
            print "No color variants found"
            return None, None, None, None, None, None

    def get_size_variants(self, hxs):
        try:
            variants = hxs.select('//script').re('productCode:\".*\d\"')[0].split(",")
        except:
            print "No size variants found"
            return None, None, None, None, None
        sizes = hxs.select('//select[@id="size_x"]//option/text()').extract()[1:]
        collect_points = []
        prices = []
        point_prices = []
        variant_ids = []
        for i in range(7, len(variants), 7):
            price = basic.get_price(variants[i + 2])
            prices.append(price)
            points = str(int(float(price) * 100))
            point_prices.append(points)
            variant_id = basic.get_price(variants[i + 4])
            variant_ids.append(variant_id)
            points = basic.get_price(variants[i + 1])
            collect_points.append(points)
        return prices, point_prices, collect_points, sizes, variant_ids

    def create_color_variants(self, prices, point_prices, colors, color_image_urls,
                              variant_ids, collect_points, product_id):
        for i in range(0, len(colors)):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[i]]
            variant['price'] = [prices[i]]
            variant['points_price'] = [point_prices[i]]
            variant['collect_points'] = [collect_points[0]]
            variant['color'] = [colors[i]]
            variant['color_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(color_image_urls[i])]
            self.xml.create_xml(variant)

    def create_size_variants(self, prices, point_prices, sizes, variant_ids,
                             collect_points, product_id):
        for i in range(0, len(sizes)):
            variant = BootsItem()
            variant['master_product_id'] = product_id
            variant['product_id'] = [variant_ids[i]]
            variant['price'] = [prices[i]]
            variant['points_price'] = [point_prices[i]]
            variant['collect_points'] = [collect_points[0]]
            variant['size'] = [sizes[i]]
            self.xml.create_xml(variant)

    def get_image_sha1(self, image_url):
        h = hashlib.sha1()
        h.update(image_url)
        return h.hexdigest()

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}".format(datetime.now())
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        if self.d['upload']:
            exp = CommonExport()
            try:
                exp.xml_to_db(self.name, filename, "5097450b-2c49-49d4-b47a-55b1bc652c78")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        # part for exporting to database here
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "Boots: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Boots: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("store_id", "Store ID", "text")
        xml.add_property("lang_id", "Lang ID", "text")
        xml.add_property("catalog_id", "Catalog ID", "text")
        xml.add_property("order_id", "Order ID", "text")
        xml.add_property("cat_entry_id", "Cat Entry ID", "text")
        xml.add_property("calculation_usage_id", "Calculation Usage ID", "text")
        xml.add_property("ingredients", "Ingredients", "text")
        xml.add_property("patient_information_url", "Patient Information Url", "text")
        xml.add_property("points_price", "Points Price", "integer")
        xml.add_property("collect_points", "Collect Points", "integer")
        xml.add_property("brand_image_url", "Brand Image Url", "text")
        xml.add_property("description_1", "Description 1", "text")
        xml.add_property("description_2", "Description 2", "text")
        xml.add_property("description_3", "Description 3", "text")
        xml.add_property("description_4", "Description 4", "text")
        xml.add_property("description_5", "Description 5", "text")
        xml.add_property("description_6", "Description 6", "text")
        xml.add_property("sponsored", "Sponsored", "text")
        xml.add_property("offer", "Offer", "text")
        xml.add_property("promotion", "Promotion", "text")
        xml.add_property("old_price", "Old Price", "decimal")
        xml.add_property("save_money", "Save Money", "decimal")
        xml.add_property("price_per_size", "Price Per Size", "text")
class PartyliteSpider(CrawlSpider):
    name = "partylite"
    allowed_domains = ["partylite.biz"]
    start_urls = ["http://www.zmags.com"]
    counter = 0

    def __init__(self, *a, **kw):
        super(PartyliteSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = PartyliteTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.images_store = "/" + settings['IMAGES_STORE']
        self.users = party.get_users(settings, self.d)
        self.exc = ZmagsException(50)
        self.production = self.d['env']
        self.upload = self.d['upload']
        self.english = self.d['lang']
        self.file_name = self.d['file']
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
            self.change_url_list()
        else:
            self.get_lists_from_excel()
        self.xml = CommonXml()
        party.add_properties(self.xml)
        self.total = len(self.products['urls'])

    def parse(self, response):
        for url in self.products['urls']:
            if self.d['lang'] == 'us':
                request = Request(url, callback=self.parse_can, dont_filter=True)
                yield request
            elif self.d['lang'] == 'english':
                c_url = url.replace(self.users['us'], self.users['canada_en'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "eng"
                yield request
            elif self.d['lang'] == 'french':
                c_url = url.replace(self.users['us'], self.users['canada_fr'])
                request = Request(c_url, callback=self.parse_can, dont_filter=True)
                request.meta['language'] = "fr"
                yield request

    def change_url_list(self):
        for i in range(0, len(self.products['urls'])):
            if not self.production:
                self.products['urls'][i] = self.products['urls'][i].replace('www', 'qa')
            self.products['urls'][i] = self.products['urls'][i].replace('XXXXX', self.users['us'])

    def get_in_stock(self, hxs):
        """Gets in stock information about product."""
        stock = hxs.select('//div[@id="availability_container"]').extract()
        if not stock:
            return ["IN_STOCK"]
        else:
            return ["NOT_IN_STOCK"]

    def get_basic_info(self, hxs):
        """Getting basic info about products (name, shown with)."""
        name = hxs.select('//div[@id="product_name"]/text()').extract()
        if name:
            name = basic.cdata_field(name)
        shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
        if shown_with:
            shown_with = [basic.cdata(shown_with[0])]
        return name, shown_with

    def get_description(self, hxs):
        description = hxs.select('//div[@id="item_description"]').extract()
        description = [basic.cdata(basic.remove_tags(description[0]))]
        description = [description[0].replace(u"\u2044", "/")]
        return description

    def get_price(self, hxs):
        """Getting product prices.

        Gets regular and discount price if there is one."""
        price = hxs.select('//span[@id="divUnitPrice"]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/span[1]/text()').extract()
        if not price:
            price = hxs.select('//div[@id="product_price"]/text()').extract()
        discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
        price = basic.clean_string(price[0])
        price = re.sub(" +", " ", price)
        price = price.replace("Price:", "")
        price = price.replace("Prix:", "")
        price = basic.cdata(price.strip())
        if discount:
            discount = basic.cdata_field(discount)
        return [price], discount

    def get_add_to_cart_id(self, page):
        """Gets add to cart id from the javascript on the page."""
        tmp = basic.get_middle_text(page, "if(isOrderStarted){", "}else")[0]
        tmp = basic.get_middle_text(tmp, "addItemToCart(", ",")
        return tmp

    def create_subproducts(self, page):
        """Gets information about colors from javascript.

        Returns field of dicts with information about colors. Those are
        really color variants for the product."""
        try:
            tmp = page.split("var largeImages = new Array();")[1]
        except IndexError:
            print "This product has no images"
        else:
            tmp = tmp.split("colorDropdownArray")[0]
            images = basic.get_middle_text(tmp, "ProductGroupProduct(", ");")
            image_names = self.get_image_names(page)
            color_products = []
            for im in images:
                product = {}
                attributes = im.split("',")
                product['normal_image_url'] = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
                product['normal_image_url'] += self.custom_clean_string(attributes[26], True)
                product['description'] = basic.cdata(self.custom_clean_string(attributes[27]))
                product['color_id'] = self.custom_clean_string(attributes[7], True)
                product['swatch_color'] = basic.cdata(self.custom_clean_string(attributes[9]).replace(" ", ""))
                product['name'] = basic.cdata(image_names[product['color_id']])
                product['add_to_cart_id'] = self.custom_clean_string(attributes[0], True).replace(" ", "")
                product['price'] = self.custom_clean_string(attributes[10], True)
                color_products.append(product)
            return color_products
        return []

    def custom_clean_string(self, string, spaces=False):
        """Custom function for cleaning strings.

        Replaces newline, return and tab characters; also replaces multiple
        spaces with one (or removes spaces entirely when spaces=True)."""
        string = string.replace("\r", "")
        string = string.replace("\n", "")
        string = string.replace("\t", "")
        if not spaces:
            string = re.sub(' +', ' ', string)
        else:
            string = re.sub(' ', '', string)
        string = string.replace("'", "")
        return string

    def get_image_names(self, page):
        """Gets color names for color swatches."""
        temp = page.split("new DropDownInfo")
        names = {}
        for i in range(1, len(temp)):
            names[basic.get_middle_text(temp[i], "('", "'")[0]] = basic.get_middle_text(temp[i], "'", "')")[2]
        return names

    def get_recommended(self, hxs):
        """Gets recommended product information.

        Returns information about recommended products as a dict."""
        rec = hxs.select('//div[@id="right_column_container"]/div')
        new = []
        i = 0
        for r in rec:
            d = {}
            # TODO: see how to get full href (different accounts)
            if not i:
                d['link'] = r.select('div/a/@href').extract()[0]
                d['image'] = "http://www.partylite.biz/imaging/resize"
                d['image'] += r.select('div/a/img/@src').extract()[0]
                d['name'] = r.select('div/a/text()').extract()[0]
                new.append(basic.cdata(simplejson.dumps(d)))
            i += 1
        return new

    def get_reviews(self, page):
        """Gets average product rating.

        Returns a string like 4.6 out of 5 reviews."""
        id = self.get_review_id(page)
        url = "http://partylite.ugc.bazaarvoice.com/8504-en_us/" + id + "/reviews.djs?format=embeddedhtml"
        url = url.replace(" ", "")
        page = urllib2.urlopen(url).read()
        page = basic.get_middle_text(page, '<div class=\\"BVRRRatingNormalImage\\">', '<\/div>')
        if page:
            rating = basic.get_middle_text(page[0], 'alt=\\"', '\\"')[0]
            return [rating]
        else:
            return []

    def get_more_images(self, page):
        """Gets field of images."""
        try:
            script = basic.get_middle_text(page, "var moreImages", "var numberOfImages")[0]
        except IndexError:
            print "This product has no images."
        else:
            r = basic.get_middle_text(script, "moreImages[", "';")
            images = []
            # return cdata here if needed to go with absolute links
            for i in range(0, len(r)):
                if self.production:
                    images.append("http://www.partylite.biz" + r[i].split("= '")[1])
                else:
                    images.append("http://qa.partylite.biz" + r[i].split("= '")[1])
            return images
        return []

    def get_absolute(self, relatives):
        """Creates absolute paths for images. [DEPRECATED]

        Please check if there is a need for this function again. If needed,
        dimensions of images got from the client server can be changed here."""
        new = []
        print relatives
        os._exit(0)
        for i in range(0, len(relatives)):
            # add width, height here for different dimensions
            # don't change the url in here from qa to www, it's meant to be qa always
            new.append("http://www.partylite.biz/imaging/resize?fileName=/productcatalog/production" + relatives[i])
        return new

    def get_review_id(self, page):
        """Gets review id that is used in javascript for reviews."""
        return basic.get_middle_text(page, 'productId: "', '"')[0]

    def write_subproducts(self, id, list, xml):
        """Writes child products to xml.

        Receives id, list and xml attributes; id is the master product id,
        list is the list of child products and xml is a Xml instance."""
        for i in range(0, len(list)):
            item = PartyliteItem()
            item['master_product_id'] = id
            item['product_id'] = [id[0] + "_" + str(i)]
            item['in_stock'] = ["IN_STOCK"]
            for k, v in list[i].iteritems():
                item[k] = [v]
            xml.create_xml(item)
        return 1

    def parse_can(self, response):
        """Parse function for scraping canadian sites.

        The request carries meta information about the language."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        item = PartyliteItem()
        hxs = HtmlXPathSelector(response)
        image_urls = []
        if 'redirect_urls' in response.request.meta:
            item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]]
            self.exc.code_handler(102, response.request.meta['redirect_urls'])
            if 'language' in response.request.meta:
                item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0] + "_can" + "_" + response.meta['language']]
            try:
                index = self.products['product_ids'].index(self.get_id(response.request.meta['redirect_urls'][0])[0])
                item['name'] = [basic.cdata(item['product_id'][0] + self.products['names'][index])]
                self.products['status'][index] = 'no_avail'
            except (KeyError, ValueError):
                print "Id %s is not in the list" % (item['product_id'][0])
            item['in_stock'] = ['NOT_AVAILABLE']
            item['product_id'] = self.remove_spaces(item['product_id'])
            self.xml.create_xml(item)
        else:
            index = self.products['product_ids'].index(self.get_id(response.url)[0])
            try:
                item['product_id'] = self.get_id(response.url)
                item['name'], item['shown_with'] = self.get_basic_info(hxs)
                item['description'] = self.get_description(hxs)
                if 'language' in response.meta:
                    item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
                response.meta['item'] = item
                page = " ".join(hxs.select('//html').extract())
                image_urls = self.get_more_images(page)
                item['normal_image_url'] = self.get_server_path_field(image_urls)
                item['in_stock'] = self.get_in_stock(hxs)
                color_products = self.create_subproducts(page)
                if color_products:
                    self.write_subproducts(item['product_id'], color_products, self.xml)
                else:
                    item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                item['custom_price'], item['custom_discount'] = self.get_price(hxs)
                self.products['status'][index] = "ran"
            except StandardError:
                basic.print_error()
                self.products['status'][index] = "error"
                self.exc.code_handler(100, response.url)
            else:
                item['product_id'] = self.remove_spaces(item['product_id'])
                self.xml.create_xml(item)
                if image_urls:
                    item['image_urls'] = image_urls
        return item

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = party.get_settings_message(self.d)
        if self.counter < self.total:
            msg += "\nScraper didn't go through all products, please report"
        msg += "\n\nScraped %d products out of %d\n\n" % (self.counter, self.total)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.products)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        logname = filename
        filename = "{0}_{1}".format(filename, self.d['lang'])
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        from modules.mail import Mail
        from modules.export_to_db import CommonExport
        exp = CommonExport()
        if self.upload:
            try:
                if self.d['lang'] == 'us':
                    exp.xml_to_db(self.name, filename, "55892247-1b92-4ff9-a8a3-33cc976f9341")
                else:
                    exp.xml_to_db(self.name, filename, "9cb6c676-c14f-403b-b94f-b981184e1de0")
                msg += "\n\nExport to database successful"
            except StandardError:
                msg += "\n\nExport to database failed"
        else:
            msg += "\n\nUpload to database not selected"
        mail = Mail()
        try:
            mail.send_mail(msg, "Partylite: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "Partylite: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = 'logs/{0}'.format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, logname), 'w') as f:
                f.write(msg)

    def get_id(self, url):
        """Gets id from product url."""
        return [url.split("&sku=")[1]]

    def get_server_path(self, url):
        """Gets server path for image url."""
        url = url.split("partylite.biz")[1]
        return self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"

    def get_server_path_field(self, urls):
        """Gets server paths for a field of image urls."""
        new = []
        for url in urls:
            url = url.split("partylite.biz")[1]
            new.append(self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg")
        return new

    def remove_spaces(self, field):
        new = []
        for i in field:
            new.append(i.replace(' ', ''))
        return new

    def get_lists_from_excel(self):
        excel_path = "xls/{0}/{1}.xls".format(self.name, self.d['file'])
        xls = PartyliteExcel(path=excel_path, user=self.users['us'], production=self.production)
        self.products = dict()
        try:
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)
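
# Partylite's parse() fans one product url out into per-locale requests and
# tags each with request.meta['language'] so parse_can can suffix the product
# id ("_can_eng", "_can_fr"). A condensed sketch of that pattern with a
# hypothetical users mapping (the account ids below are placeholders, not
# real values from this project):
from scrapy.http import Request

EXAMPLE_USERS = {"us": "10001", "canada_en": "20001", "canada_fr": "30001"}  # assumed shape


def locale_requests(url, callback):
    """Yield one request per locale, carrying the language in meta."""
    yield Request(url, callback=callback, dont_filter=True)  # US default, no tag
    for account, lang in ((EXAMPLE_USERS["canada_en"], "eng"),
                          (EXAMPLE_USERS["canada_fr"], "fr")):
        request = Request(url.replace(EXAMPLE_USERS["us"], account),
                          callback=callback, dont_filter=True)
        request.meta["language"] = lang
        yield request

# dont_filter=True matters here: the rewritten urls can collide with each
# other after redirects, and the spider wants every locale fetched.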
class ChomeSpider(CrawlSpider):
    name = "chome"
    allowed_domains = ["zmags.com"]
    start_urls = ["http://www.zmags.com/"]
    counter = 0

    def __init__(self, *a, **kw):
        super(ChomeSpider, self).__init__(*a, **kw)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        terminal = DatabaseTerminal(sys.argv, self.name)
        self.d = terminal.get_arguments()
        self.xml = CommonXml()
        self.exc = ZmagsException(5)
        if self.d['database']:
            self.database = Database()
            self.database.connect()
            self.products, self.no_urls = self.database.select_products(
                self.d['catalog_id'], self.d['product_id'])
            self.database.disconnect()
        else:
            self.get_lists_from_excel()
        self.add_properties(self.xml)
        self.images_store = "/" + settings['IMAGES_STORE']
        self.total = len(self.no_urls['product_ids'])

    def parse(self, response):
        self.counter += 1
        hxs = HtmlXPathSelector(response)
        item = ChomeItem()
        print "IDs in excel feed: {0}".format(self.total)
        item['image_urls'] = self.parse_whole_xml()
        return item

    def parse_whole_xml(self):
        xml_dir = "xml/{0}".format(self.name)
        file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
        downloader = Downloader()
        if self.d['download']:
            downloader.get_file(xml_dir, file_url, "client_feed")
        elif not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
            basic.warning("Feed file doesn't exist, please de-select the no download option")
            os._exit(2)
        self.number = 0
        xml_item = ChomeItem()
        urls_all = []
        p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
        for event, elem in iterparse('xml/{0}/client_feed.xml'.format(self.name)):
            if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
                for r in elem:
                    if r.tag == p + "Id" and r.text in self.no_urls['product_ids']:
                        index = self.no_urls['product_ids'].index(r.text)
                        self.no_urls['status'][index] = 'ran'
                        self.number += 1
                        urls = []
                        for x in elem:
                            if x.tag == p + "Id":
                                xml_item['product_id'] = [x.text]
                            elif x.tag == p + "EngLongDesc" and x.text is not None:
                                xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "RetailPrice":
                                xml_item['custom_price'] = [x.text[:-2]]
                            elif x.tag == p + "SpnLongDesc" and x.text is not None:
                                xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
                            elif x.tag == p + "PartNumber":
                                xml_item['add_to_cart_id'] = [x.text]
                            elif x.tag == p + "MaxQty":
                                xml_item['max_qty'] = [x.text]
                            elif x.tag == p + "TimeType":
                                xml_item['time_type'] = [x.text]
                            elif x.tag == p + "SpnName" and x.text is not None:
                                xml_item['name_spanish'] = [x.text]
                            elif x.tag == p + "EngName":
                                xml_item['name_english'] = [x.text]
                            elif x.tag == p + "ImagePath_Large" and x.text is not None:
                                urls.append(self.get_absolute(x.text))
                                xml_item['normal_image_url'] = [self.get_server_path(self.get_absolute(x.text))]
                            elif x.tag == p + "IsActive":
                                # element text is a string, so compare against "0"
                                if x.text == "0":
                                    xml_item['in_stock'] = ["NOT_IN_STOCK"]
                                else:
                                    xml_item['in_stock'] = ['IN_STOCK']
                            else:
                                for i in range(1, 4):
                                    tag = p + "Alternate%sImagePath_Large" % (str(i))
                                    if x.tag == tag and x.text is not None:
                                        urls.append(self.get_absolute(x.text))
                                        xml_item['normal_image_url'].append(self.get_server_path(self.get_absolute(x.text)))
                        # change image paths for normal_image_url and return urls
                        self.xml.create_xml(xml_item)
                        urls_all += urls
        for i in range(0, len(self.no_urls['status'])):
            if self.no_urls['status'][i] != 'ran':
                self.no_urls['status'][i] = 'not_found'
        return urls_all

    def get_server_path(self, url):
        path = self.images_store + "/full/" + hashlib.sha1(url).hexdigest() + ".jpg"
        return path

    def get_absolute(self, url):
        return "http://www.celebratinghome.com/" + url

    def escape(self, string):
        # unescape twice to undo double-encoded entities in the feed
        temp = HTMLParser.HTMLParser().unescape(string)
        return HTMLParser.HTMLParser().unescape(temp)

    def spider_closed(self, spider):
        """Handles spider_closed signal from end of scraping.

        Handles usual end operations for scraper like writing xml, exporting
        to database and sending appropriate mail message."""
        msg = "Ran: {0}\n".format(datetime.now())
        if self.total - self.number:
            msg += "{0} id(s) from id list weren't found in feed".format(self.total - self.number)
            basic.warning(msg)
        else:
            msg += "All ids found in feed."
            basic.green(msg)
        # filename for writing xml
        if self.d['database']:
            try:
                self.database.connect()
                filename = self.database.get_name(self.d['catalog_id'])
                self.database.update_db(self.no_urls)
                self.database.disconnect()
                msg += "\nRan from interface.\n"
            except:
                msg += "\nUpdating database failed, please report."
        else:
            msg += "\nRan from console.\n"
            filename = self.d['file']
        self.xml.write_xml(self.name, filename)
        msg += self.exc.create_message(self.counter)
        # if self.d['upload']:
        #     exp = CommonExport()
        #     try:
        #         exp.xml_to_db(self.name, self.d['file'], "40b029c9-dff7-4bc1-b8bc-ef062960b24d")
        #         msg += "\n\nExport to database successful"
        #     except StandardError:
        #         msg += "\n\nExport to database failed"
        # else:
        #     msg += "\n\nUpload to database not selected"
        from modules.mail import Mail
        mail = Mail()
        try:
            mail.send_mail(msg, "CelebratingHome: {0}".format(filename))
            if self.d['email']:
                mail.send_mail(msg, "CelebratingHome: {0}".format(filename), self.d['email'])
        except:
            msg += "\nSending mail failed."
        if self.d['database']:
            path = "logs/{0}".format(self.name)
            if not os.path.exists(path):
                os.makedirs(path)
            with open("{0}/{1}".format(path, filename), 'w') as f:
                f.write(msg)

    def get_lists_from_excel(self):
        xls = DictExcel(basic.get_excel_path(self.name, self.d['file']))
        self.products = dict()
        try:
            self.products['product_ids'] = xls.read_excel_collumn_for_ids(1, 15)
            self.products['names'] = xls.read_excel_collumn(2, 15)
            self.products['urls'] = xls.read_excel_collumn_for_urls(3, 15)
        except IOError as e:
            msg = "I/O error {0}: {1}".format(e.errno, e.strerror)
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        except StandardError:
            msg = "Error reading excel file"
            msg += "\nError occurred for given file: {0}".format(self.d['file'])
            self.exc.code_handler(103, msg=msg)
        self.products = xls.delete_duplicates_dict(self.products)
        self.products, self.no_urls = xls.separate_no_urls(self.products)
        self.products = xls._add_none_status(self.products)
        self.no_urls = xls._add_none_status(self.no_urls)

    def add_properties(self, xml):
        xml.add_property("description_english", "Description English", "text")
        xml.add_property("description_spanish", "Description Spanish", "text")
        xml.add_property("add_to_cart_id", "Add To Cart ID", "text")
        xml.add_property("max_qty", "Max Quantity", "text")
        xml.add_property("time_type", "Time Type", "text")
        xml.add_property("name_english", "Name English", "text")
        xml.add_property("name_spanish", "Name Spanish", "text")
        xml.add_property("in_stock", "In Stock", "text")
        xml.add_property("custom_price", "Custom Price", "text")