Python cdata примеры, modules.basic_func.cdata Python примеры использования

Пример #1

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def create_subproducts(self, page):
     """Build the color variants of a product from the page's javascript.

     The variant data is read from the text that follows
     "var largeImages = new Array();" and precedes "colorDropdownArray".
     Every entry returned by basic.get_middle_text for the
     "ProductGroupProduct(" ... ");" delimiters becomes one dict.

     Returns a list of dicts (one per color variant), or an empty list
     when the page has no "largeImages" section."""
     sections = page.split("var largeImages = new Array();")
     if len(sections) < 2:
         print("This product has no images")
         return []
     script = sections[1].split("colorDropdownArray")[0]
     entries = basic.get_middle_text(script, "ProductGroupProduct(", ");")
     image_names = self.get_image_names(page)
     image_base = "http://qa.partylite.biz/imaging/resize?fileName=/productcatalog/production"
     color_products = []
     for entry in entries:
         # The arguments of the javascript call are positional; splitting on
         # "'," yields one element per argument.
         fields = entry.split("',")
         variant = {}
         variant['normal_image_url'] = image_base + self.custom_clean_string(fields[26], True)
         variant['description'] = basic.cdata(self.custom_clean_string(fields[27]))
         variant['color_id'] = self.custom_clean_string(fields[7], True)
         swatch = self.custom_clean_string(fields[9]).replace(" ", "")
         variant['swatch_color'] = basic.cdata(swatch)
         # The display name is looked up by the color id extracted above.
         variant['name'] = basic.cdata(image_names[variant['color_id']])
         cart_id = self.custom_clean_string(fields[0], True)
         variant['add_to_cart_id'] = cart_id.replace(" ", "")
         variant['price'] = self.custom_clean_string(fields[10], True)
         color_products.append(variant)
     return color_products

Пример #2

0

Показать файл

 def parse(self, response):
     """Scrape one product page into a BootsItem and write it to the XML output.

     When variant ids are found (color or size variants) the main item is
     written first and the variants are written through
     create_color_variants / create_size_variants; otherwise the price
     fields are read from the page and stored on the item itself.

     Returns the item with 'image_urls' set to every collected image URL."""
     self.counter += 1
     basic.print_status(self.counter, self.total)
     hxs = HtmlXPathSelector(response)
     item = BootsItem()
     item['product_id'], item['store_id'], item['lang_id'], item['catalog_id'] = self.get_ids(hxs)
     item['name'] = self.get_name(hxs)
     item['short_description'], sponsored, description, in_stock, item['ingredients'], patient_information_url, item['offer'], item['promotion'] = self.get_description(hxs)
     item['rating'] = self.get_rating(hxs)
     size, price_per_size = self.get_size(hxs)
     item['normal_image_url'], image_urls = self.get_images(hxs)
     brand, brand_image_url = self.get_brand(hxs)
     item['save_money'], item['old_price'] = self.get_oldies(hxs)
     # Each element of `description` is stored in its own numbered field
     # (description_1, description_2, ...).
     for i in range(0, len(description)):
         tag = 'description_%d' % (i + 1)
         item[tag] = [basic.cdata(description[i])]
     # NOTE(review): if get_description returns `[sponsored]` (a list), this
     # check is always true even when no sponsor text exists - confirm.
     if sponsored is not None:
         item['sponsored'] = sponsored
     # Default to not-in-stock; the order form ids are only read when the
     # page reports "In stock".
     item['in_stock'] = ["NOT_IN_STOCK"]
     if in_stock == "In stock":
         item['in_stock'] = ["IN_STOCK"]
         item['order_id'] = hxs.select('//input[@name="orderId"]/@value').extract()
         item['cat_entry_id'] = hxs.select('//input[@name="catEntryId"]/@value').extract()
         item['calculation_usage_id'] = hxs.select('//input[@name="calculationUsageId"]/@value').extract()
     if brand_image_url is not None:
         item['brand'] = brand
         item['brand_image_url'] = ["43662980-f344-11e1-a21f-0800200c9a66/full/" + self.get_image_sha1(brand_image_url)]
         # The brand logo is added to the list of image URLs returned on the item.
         image_urls.append(brand_image_url)
     if patient_information_url is not None:
         item['patient_information_url'] = [basic.cdata(patient_information_url)]
     prices, point_prices, collect_points, colors, color_image_urls, variant_ids = self.get_color_variants(hxs)
     if size is not None:
         item['size'] = size
         item['price_per_size'] = price_per_size
     elif variant_ids is None:
         # No single size and no color variants: look for size variants instead.
         prices, point_prices, collect_points, sizes, variant_ids = self.get_size_variants(hxs)
     if color_image_urls is not None:
         image_urls.extend(color_image_urls)
     if variant_ids is not None:
         self.xml.create_xml(item)
         if colors is not None:
             self.create_color_variants(prices, point_prices, colors, color_image_urls, variant_ids, collect_points, item['product_id'])
         else:
             # NOTE(review): `sizes` is only bound in the elif branch above; if
             # size is set, variant ids exist and colors is None this raises
             # NameError - confirm whether that combination can occur.
             self.create_size_variants(prices, point_prices, sizes, variant_ids, collect_points, item['product_id'])
     else:
         # Product without variants: prices are read directly from the page.
         prices = hxs.select('//p[@class="price"]/text()').extract()[0]
         point_prices = hxs.select('//span[@class="pointsPrice"]/text()').extract()[0]
         collect_points = [basic.get_price(hxs.select('//p[@class="collectPoints"]/text()').extract()[0])]
         item['price'] = [basic.get_price(prices)]
         item['points_price'] = [basic.get_price(point_prices)]
         item['collect_points'] = collect_points
         self.xml.create_xml(item)
     item['image_urls'] = image_urls
     #raw_input("Press Enter to continue...")
     return item

Пример #3

0

Показать файл

Файл: shop.py Проект: marjevtic/testMarko

 def _create_shop_looks(self, ids, names, urls):
     """Write one "shop look" entry to the XML output per id.

     ids, names and urls are parallel sequences; names and urls are read
     at the same position as each id. A single ExpressItem is reused and
     overwritten for every entry."""
     item = ExpressItem()
     for position, look_id in enumerate(ids):
         item['product_id'] = [look_id]
         item['name'] = [basic.cdata(names[position])]
         item['normal_image_url'] = [basic.cdata(urls[position])]
         # Flags marking the entry as a shop look rather than a normal
         # product or a shop line.
         item['shop_look'] = ['True']
         item['normal'] = ['False']
         item['shop_line'] = ['False']
         item['in_stock'] = ['IN_STOCK']
         self.xml.create_xml(item)

Пример #4

0

Показать файл

Файл: shop.py Проект: marjevtic/testMarko

 def _create_shop_looks(self, ids, names, urls):
     """Write one "shop look" entry to the XML output per id.

     ids, names and urls are parallel sequences; names and urls are read
     at the same position as each id. A single ExpressItem is reused and
     overwritten for every entry."""
     item = ExpressItem()
     for position, look_id in enumerate(ids):
         item['product_id'] = [look_id]
         item['name'] = [basic.cdata(names[position])]
         item['normal_image_url'] = [basic.cdata(urls[position])]
         # Flags marking the entry as a shop look rather than a normal
         # product or a shop line.
         item['shop_look'] = ['True']
         item['normal'] = ['False']
         item['shop_line'] = ['False']
         item['in_stock'] = ['IN_STOCK']
         self.xml.create_xml(item)

Пример #5

0

Показать файл

 def get_description(self, hxs):
     """Collect the descriptive fields of a product page.

     Returns an 8-tuple:
       [short_description] - CDATA-wrapped intro copy plus the "suitable for" markup,
       [sponsored]         - sponsor text or None,
       desc                - the detailed description split into pieces of at
                             most 2000 characters (always at least one piece),
       in_stock            - stock text, or "" when the stock icon is missing,
       [ingredients]       - CDATA-wrapped ingredients, '' or None,
       patient_information_url - first download link or None,
       [offer]             - main offer text or None,
       promotion           - list of other offer texts.

     Raises IndexError when the page has no "productIntroCopy" block."""
     short_description = hxs.select('//div[@class="productIntroCopy"]').extract()[0]
     try:
         suitable_for = ''.join(hxs.select('//div[@id="suitableFor"]//h4 | //div[@id="suitableFor"]//p | //div[@id="suitableFor"]//div').extract())
         short_description += suitable_for
     except Exception:
         print("There's no suitable_for section")
     try:
         ingredients = basic.clean_string(' '.join(hxs.select('//div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]/p | //div[@class="pd_panel"][not(@id)]//div[@class="pd_HTML"]//div').extract()))
         if ingredients != '':
             ingredients = basic.cdata(ingredients)
     except Exception:
         print("No ingredients found!")
         ingredients = None
     try:
         patient_information_url = hxs.select('//div[@class="downloadMedia"]//a/@href').extract()[0]
     except Exception:
         print("No patient information found!")
         patient_information_url = None
     try:
         offer = hxs.select('//div[@id="mainOffer"]//a/text()').extract()[0]
     except Exception:
         print("No special offer found!")
         offer = None
     try:
         promotion = hxs.select('//div[@id="otherOffers"]//a/text()').extract()
     except Exception:
         print("No promotion found!")
         promotion = None
     try:
         sponsored = hxs.select('//div[@class="sponsored"]//p/text()').extract()[0]
     except Exception:
         print("No sponsor message found!")
         sponsored = None
     description = ''.join(hxs.select('//div[@id="detailedInfo"]//div[@class="pd_panelInner"]//div[@class="pd_HTML"]').extract())
     description = basic.clean_string(description)
     # Split into consecutive 2000-character pieces. Slice ends are exclusive,
     # so each piece must end at start + 2000; ending one earlier would drop
     # the character at every piece boundary.
     chunk_size = 2000
     desc = [description[start:start + chunk_size]
             for start in range(0, len(description), chunk_size)]
     if not desc:
         # Keep the "always one element" contract for an empty description.
         desc = [description]
     try:
         in_stock = hxs.select('//div[@class="icon_pl_stock"]/text()').extract()[0]
     except Exception:
         in_stock = ""
     return [basic.cdata(basic.clean_string(short_description))], [sponsored], desc, in_stock, [ingredients], patient_information_url, [offer], promotion

Пример #6

0

Показать файл

    def get_emb(self, hxs):
        """Collect the embroidery options and write them to the "emb" XML.

        Three <select> elements are read: "threadcolor" (lettering colors),
        "lettering" and "logoname". Every option except the first becomes a
        CDATA-wrapped JSON object with "type", "name" and "url" keys, where
        "url" is the result of get_server_path_single for the option's image.

        Returns the list of original image URLs, in the order thread colors,
        letterings, logos."""
        urls = []

        def build_entries(xpath, kind, url_prefix, url_suffix, underscore_spaces):
            """Return the CDATA/JSON entries for one <select>, recording image URLs."""
            entries = []
            # Index 0 is skipped, as in the original loops - presumably a
            # placeholder option; verify against the page.
            for value in hxs.select(xpath).extract()[1:]:
                file_name = value.lower()
                if underscore_spaces:
                    file_name = file_name.replace(' ', '_')
                url = url_prefix + file_name + url_suffix
                entry = {}
                entry['type'] = kind
                entry['name'] = value
                entry['url'] = self.get_server_path_single(url)
                urls.append(url)
                entries.append(basic.cdata(simplejson.dumps(entry)))
            return entries

        colors = build_entries(
            '//select[@id="threadcolor"]/option/@value', "lettering colors",
            "http://www.lydiasuniforms.com/images/lydias/threadcolor_", ".gif", True)
        letterings = build_entries(
            '//select[@id="lettering"]/option/@value', "lettering",
            "http://www.lydiasuniforms.com/images/lydias/lettering_", ".gif", True)
        # Logo file names keep their spaces; only the case is lowered.
        log = build_entries(
            '//select[@id="logoname"]/option/@value', "logo",
            "http://www.lydiasuniforms.com/images/logos/", ".jpg", False)

        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")

        return urls

Пример #7

0

Показать файл

Файл: guitar_center_spider.py Проект: marjevtic/testMarko

 def get_serials(self, hxs):
     """Return the page's hidden "styleInfo" JSON blobs, re-serialized.

     Each text node of <var class="hidden styleInfo"> is parsed as JSON,
     dumped again and wrapped with basic.cdata."""
     raw_entries = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
     return [basic.cdata(simplejson.dumps(simplejson.loads(raw)))
             for raw in raw_entries]

Пример #8

0

Показать файл

 def get_serials(self, hxs):
     """Return the page's hidden "styleInfo" JSON blobs, re-serialized.

     Each text node of <var class="hidden styleInfo"> is parsed as JSON,
     dumped again and wrapped with basic.cdata."""
     raw_entries = hxs.select('//var[@class="hidden styleInfo"]/text()').extract()
     return [basic.cdata(simplejson.dumps(simplejson.loads(raw)))
             for raw in raw_entries]

Пример #9

0

Показать файл

Файл: guitar_center_spider.py Проект: marjevtic/testMarko

 def get_colors(self, hxs):
     """Return the page's "styleInfo" JSON blobs, re-serialized.

     Each text node of <var class="styleInfo"> is parsed as JSON, dumped
     again and wrapped with basic.cdata."""
     raw_entries = hxs.select('//var[@class="styleInfo"]/text()').extract()
     return [basic.cdata(simplejson.dumps(simplejson.loads(raw)))
             for raw in raw_entries]

Пример #10

0

Показать файл

 def get_colors(self, hxs):
     """Return the page's "styleInfo" JSON blobs, re-serialized.

     Each text node of <var class="styleInfo"> is parsed as JSON, dumped
     again and wrapped with basic.cdata."""
     raw_entries = hxs.select('//var[@class="styleInfo"]/text()').extract()
     return [basic.cdata(simplejson.dumps(simplejson.loads(raw)))
             for raw in raw_entries]

Пример #11

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_description(self, hxs):
     """Return the product description and its feature list.

     The description is the fourth text node of the "FieldsetProductInfo"
     div (IndexError if there are fewer). Only the first <ul> of features
     is kept, cut to 2000 characters.

     Returns ([cdata description], basic.cdata_field(features))."""
     info_texts = hxs.select(
         '//div[@id="FieldsetProductInfo"]/text()').extract()
     description = info_texts[3]
     feature_lists = hxs.select(
         '//div[@id="FieldsetProductInfo"]/ul').extract()
     if feature_lists:
         features = [feature_lists[0][:2000]]
     else:
         features = feature_lists
     return [basic.cdata(description)], basic.cdata_field(features)

Пример #12

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

    def get_emb(self, hxs):
        """Collect the embroidery options and write them to the "emb" XML.

        Three <select> elements are read: "threadcolor" (lettering colors),
        "lettering" and "logoname". Every option except the first becomes a
        CDATA-wrapped JSON object with "type", "name" and "url" keys, where
        "url" is the result of get_server_path_single for the option's image.

        Returns the list of original image URLs, in the order thread colors,
        letterings, logos."""
        urls = []

        def build_entries(xpath, kind, url_prefix, url_suffix, underscore_spaces):
            """Return the CDATA/JSON entries for one <select>, recording image URLs."""
            entries = []
            # Index 0 is skipped, as in the original loops - presumably a
            # placeholder option; verify against the page.
            for value in hxs.select(xpath).extract()[1:]:
                file_name = value.lower()
                if underscore_spaces:
                    file_name = file_name.replace(' ', '_')
                url = url_prefix + file_name + url_suffix
                entry = {}
                entry['type'] = kind
                entry['name'] = value
                entry['url'] = self.get_server_path_single(url)
                urls.append(url)
                entries.append(basic.cdata(simplejson.dumps(entry)))
            return entries

        colors = build_entries(
            '//select[@id="threadcolor"]/option/@value', "lettering colors",
            "http://www.lydiasuniforms.com/images/lydias/threadcolor_", ".gif", True)
        letterings = build_entries(
            '//select[@id="lettering"]/option/@value', "lettering",
            "http://www.lydiasuniforms.com/images/lydias/lettering_", ".gif", True)
        # Logo file names keep their spaces; only the case is lowered.
        log = build_entries(
            '//select[@id="logoname"]/option/@value', "logo",
            "http://www.lydiasuniforms.com/images/logos/", ".jpg", False)

        item = LydiasItem()
        item['color'] = colors
        item['lettering'] = letterings
        item['log'] = log
        xml.create_xml(item)
        xml.write_xml("emb")

        return urls

Пример #13

0

Показать файл

    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return the available sizes as a list of CDATA-wrapped JSON strings.

        With a non-empty color_code the sizes are fetched from the site's
        "product-showoptions.asp" endpoint; each entry then has "size_short",
        "price_url" and "size" keys. With an empty color_code the sizes are
        read from the page's "not_size" divs and each entry has "size_short"
        and "price_url" only.

        The `id` parameter is not used; it is kept for existing callers."""
        print(color_code)
        jsons = []
        if color_code == "":
            # Without a color the sizes are listed directly in the page.
            for size_short in hxs.select('//div[@class="not_size"]/text()').extract():
                item = {}
                item['size_short'] = size_short
                item['price_url'] = self.get_size_price(
                    str(main_id), "", size_short)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons

        showmode = hxs.select(
            '//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select(
            '//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select(
            '//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
            "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        # Flatten the bracketed, quoted response so that every "]"-terminated
        # piece becomes a plain comma-separated record.
        page = page.replace("'", "").replace("[", ",").replace(",,", "")
        records = page.split("]")
        # The last two pieces are ignored, as in the original index range.
        for record in records[:-2]:
            fields = record.split(",")
            item = {}
            item['size_short'] = fields[0]
            item['price_url'] = self.get_size_price(
                str(main_id), str(color_code), fields[0])
            item['size'] = fields[1]
            jsons.append(basic.cdata(simplejson.dumps(item)))
        return jsons

Пример #14

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def get_basic_info(self, hxs):
     """Return the product name and the "shown with" block.

     Both values are lists: the name goes through basic.cdata_field, and
     only the first "shown_with_container" div is kept (CDATA-wrapped).
     When an element is missing, the empty extract() result is returned
     unchanged."""
     name = hxs.select('//div[@id="product_name"]/text()').extract()
     shown_with = hxs.select('//div[@id="shown_with_container"]').extract()
     name_field = basic.cdata_field(name) if name else name
     shown_with_field = [basic.cdata(shown_with[0])] if shown_with else shown_with
     return name_field, shown_with_field

Пример #15

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

 def make_colors_json(self, color_urls, color_names, color_codes):
     """Return one CDATA-wrapped JSON string per color.

     The three arguments are parallel sequences. Each JSON object holds
     "color_url" (passed through get_server_path_single), "color_name"
     and "color_short"."""
     jsons = []
     for position, color_url in enumerate(color_urls):
         color = {}
         color['color_url'] = self.get_server_path_single(color_url)
         color['color_name'] = color_names[position]
         color['color_short'] = color_codes[position]
         jsons.append(basic.cdata(simplejson.dumps(color)))
     return jsons

Пример #16

0

Показать файл

 def make_colors_json(self, color_urls, color_names, color_codes):
     """Return one CDATA-wrapped JSON string per color.

     The three arguments are parallel sequences. Each JSON object holds
     "color_url" (passed through get_server_path_single), "color_name"
     and "color_short"."""
     jsons = []
     for position, color_url in enumerate(color_urls):
         color = {}
         color['color_url'] = self.get_server_path_single(color_url)
         color['color_name'] = color_names[position]
         color['color_short'] = color_codes[position]
         jsons.append(basic.cdata(simplejson.dumps(color)))
     return jsons

Пример #17

0

Показать файл

 def gold_coverage(self, hxs):
     """Return the "goldCoverage" options as CDATA-wrapped JSON strings.

     Checkbox values and label texts are read separately and paired by
     position; each JSON object has "id" and "name" keys."""
     ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
     labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
     options = []
     for position, option_id in enumerate(ids):
         option = {}
         option['id'] = option_id
         option['name'] = labels[position]
         options.append(basic.cdata(simplejson.dumps(option)))
     return options

Пример #18

0

Показать файл

Файл: guitar_center_spider.py Проект: marjevtic/testMarko

 def gold_coverage(self, hxs):
     """Return the "goldCoverage" options as CDATA-wrapped JSON strings.

     Checkbox values and label texts are read separately and paired by
     position; each JSON object has "id" and "name" keys."""
     ids = hxs.select('//div[@class="goldCoverage"]/input[@type="checkbox"]/@value').extract()
     labels = hxs.select('//div[@class="goldCoverage"]/label/text()').extract()
     options = []
     for position, option_id in enumerate(ids):
         option = {}
         option["id"] = option_id
         option["name"] = labels[position]
         options.append(basic.cdata(simplejson.dumps(option)))
     return options

Пример #19

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def parse_can(self, response):
     """Parse function for scraping Canadian sites.

     The request meta may carry a 'language' entry; when present the
     product id gets a "_can_<language>" suffix.

     A redirected request ('redirect_urls' in the request meta) is written
     as a NOT_AVAILABLE product. Otherwise the page is scraped fully; on
     any StandardError the product status is set to "error" and nothing is
     written to the XML.

     Returns the item, with 'image_urls' set when images were found."""
     self.counter += 1
     basic.print_status(self.counter, self.total)
     item = PartyliteItem()
     hxs = HtmlXPathSelector(response)
     image_urls = []
     if 'redirect_urls' in response.request.meta:
         item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]]
         self.exc.code_handler(102, response.request.meta['redirect_urls'])
         if 'language' in response.request.meta:
             item['product_id'] = [self.get_id(response.request.meta['redirect_urls'][0])[0]
                                   + "_can" + "_" + response.meta['language']]
         try:
             index = self.products['product_ids'].index(self.get_id
                             (response.request.meta['redirect_urls'][0])[0])
             item['name'] = [basic.cdata(item['product_id'][0]
                             + self.products['names'][index])]
             self.products['status'][index] = 'no_avail'
         except (KeyError, ValueError):
             # .index() signals a missing id with ValueError, not KeyError;
             # both are handled so an unknown id is reported instead of
             # aborting the callback.
             print("This %s id is not in list" % (item['product_id'][0]))
         item['in_stock'] = ['NOT_AVAILABLE']
         item['product_id'] = self.remove_spaces(item['product_id'])
         self.xml.create_xml(item)
     else:
         index = self.products['product_ids'].index(self.get_id(response.url)[0])
         try:
             item['product_id'] = self.get_id(response.url)
             item['name'], item['shown_with'] = self.get_basic_info(hxs)
             item['description'] = self.get_description(hxs)
             if 'language' in response.meta:
                 item['product_id'] = [item['product_id'][0] + "_can" + "_" + response.meta['language']]
             response.meta['item'] = item
             page = " ".join(hxs.select('//html').extract())
             image_urls = self.get_more_images(page)
             item['normal_image_url'] = self.get_server_path_field(image_urls)
             item['in_stock'] = self.get_in_stock(hxs)
             color_products = self.create_subproducts(page)
             if color_products:
                 # NOTE(review): this passes the bare name `xml`, while the rest
                 # of the method uses self.xml - confirm which one is intended.
                 self.write_subproducts(item['product_id'], color_products, xml)
             else:
                 # Without color variants the cart id and prices live on the
                 # main product.
                 item['add_to_cart_id'] = self.get_add_to_cart_id(page)
                 item['custom_price'], item['custom_discount'] = self.get_price(hxs)
             self.products['status'][index] = "ran"
         except StandardError:
             basic.print_error()
             self.products['status'][index] = "error"
             self.exc.code_handler(100, response.url)
         else:
             item['product_id'] = self.remove_spaces(item['product_id'])
             self.xml.create_xml(item)
     if image_urls:
         item['image_urls'] = image_urls
     return item

Пример #20

0

Показать файл

Файл: sportman_spider.py Проект: marjevtic/testMarko

    def parse(self, response):
        """Scrape one product page into a SportmanItem.

        A redirected request is written to the XML as NOT_AVAILABLE. A
        normal page is scraped, written to the XML, and the item is then
        cleared so that the returned item only carries 'image_urls'.

        Returns the item, or None when any exception occurred (the product
        status is then set to "error")."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        redirected = "redirect_urls" in response.request.meta
        cur_url = response.request.meta["redirect_urls"][0] if redirected else response.url
        index = self.products["urls"].index(cur_url)
        try:
            if redirected:
                item["product_id"] = [self.products["product_ids"][index]]
                item["name"] = [self.products["names"][index]]
                item["in_stock"] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (name, short_desc, description, old_price,
                 custom_price, product_id, sku) = self.get_basic_info(hxs)
                item["name"] = name
                item["short_desc"] = short_desc
                item["description"] = description
                item["old_price"] = old_price
                item["custom_price"] = custom_price
                item["product_id"] = product_id
                item["sku"] = sku
                item["in_stock"] = ["IN_STOCK"]
                # Only the first two of the eight returned values are used.
                (viewstate, eventval, _prevpage, _hidden,
                 _view_page, _even_page, _pre_page, _hidd_page) = self.get_vars(response, hxs)

                # The viewstate is stored in six fields: five pieces of up to
                # 2000 characters and a final one with everything past 10000.
                bounds = [(None, 2000), (2000, 4000), (4000, 6000),
                          (6000, 8000), (8000, 10000), (10000, None)]
                pieces = [viewstate[start:stop] for start, stop in bounds]
                for number, piece in enumerate(pieces, 1):
                    item["viewstate%d" % number] = [basic.cdata(piece)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                # After writing, drop every field and keep only the image URLs.
                item.clear()
                item["image_urls"] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

Пример #21

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

    def create_sizes_subproducts(self, main_id, id, color_code, hxs):
        """Return the available sizes as a list of CDATA-wrapped JSON strings.

        With a non-empty color_code the sizes are fetched from the site's
        "product-showoptions.asp" endpoint; each entry then has "size_short",
        "price_url" and "size" keys. With an empty color_code the sizes are
        read from the page's "not_size" divs and each entry has "size_short"
        and "price_url" only.

        The `id` parameter is not used; it is kept for existing callers."""
        print(color_code)
        jsons = []
        if color_code == "":
            # Without a color the sizes are listed directly in the page.
            for size_short in hxs.select('//div[@class="not_size"]/text()').extract():
                item = {}
                item['size_short'] = size_short
                item['price_url'] = self.get_size_price(str(main_id), "", size_short)
                jsons.append(basic.cdata(simplejson.dumps(item)))
            return jsons

        showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
        itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
        salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
        url = "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=%s&opt2=-1&type2=l1type&" \
            "type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=193" % (main_id, color_code, showmode, itemmode, salemode)
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            # Always release the connection, even when reading fails.
            response.close()
        # Flatten the bracketed, quoted response so that every "]"-terminated
        # piece becomes a plain comma-separated record.
        page = page.replace("'", "").replace("[", ",").replace(",,", "")
        records = page.split("]")
        # The last two pieces are ignored, as in the original index range.
        for record in records[:-2]:
            fields = record.split(",")
            item = {}
            item['size_short'] = fields[0]
            item['price_url'] = self.get_size_price(str(main_id), str(color_code), fields[0])
            item['size'] = fields[1]
            jsons.append(basic.cdata(simplejson.dumps(item)))
        return jsons

Пример #22

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

 def get_basic_info(self, hxs):
     """Return (name, price, old_price, [description]) for a product page.

     The price falls back to the "PriceDisplay" span when the "bigprice"
     div is missing. Prices are reduced to digits and dots; old_price
     stays an empty list when the page has none. Raises IndexError when
     the details block or any price is missing."""
     name = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
     price = hxs.select('//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()').extract()
     details_html = hxs.select('//div[@id="details"]').extract()[0]
     # The markup is CDATA-wrapped first and cleaned afterwards.
     description = basic.clean_string(basic.cdata(details_html))
     old_price = hxs.select('//span[@class="yourprice_product"]/text()').extract()
     if not price:
         price = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
     if old_price:
         old_price = [re.sub('[^0-9.]', '', old_price[0])]
     numeric_price = [re.sub('[^0-9.]', '', price[0])]
     return name, numeric_price, old_price, [description]

Пример #23

0

Показать файл

Файл: kenneth_spider.py Проект: marjevtic/testMarko

 def make_json(self, ids, names, prices, images, urls):
     """Return one CDATA-wrapped JSON object string per product.

     ids, names and prices are sequences whose elements are indexed with
     [0]; images and urls hold plain values. All five are read at the
     same position. Every value is emitted as a JSON string.

     Values are escaped with json.dumps, so quotes or backslashes in a
     name or URL can no longer produce invalid JSON; the layout of the
     object is otherwise the same as before."""
     import json

     def quoted(value):
         """Return value, formatted as text, as an escaped JSON string literal."""
         return json.dumps("%s" % (value,))

     jsons = []
     for i in range(0, len(ids)):
         # TODO: insert function for storing the right image path
         entry = '{ "id" : %s, "name" : %s, "image_url" : %s, "product_url" : %s, "price" : %s } ' % (
             quoted(ids[i][0]),
             quoted(names[i][0]),
             quoted(images[i]),
             quoted(urls[i]),
             quoted(prices[i][0]),
         )
         jsons.append(basic.cdata(entry))
     return jsons

Пример #24

0

Показать файл

 def get_basic_info(self, hxs):
     """Return the product description and promo text.

     The description is the first "cat-pro-desc" list item, cleaned and
     CDATA-wrapped in a one-element list (IndexError when missing). The
     promo text is taken from the "cat-pro-promo-text" span, falling back
     to its <font> children; when nothing is found the empty extract()
     result is returned unchanged."""
     raw_description = hxs.select('//li[@class="cat-pro-desc"]').extract()[0]
     description = [basic.cdata(basic.clean_string(raw_description))]
     promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/text()').extract()
     if not promo_text:
         promo_text = hxs.select('//span[@class="cat-pro-promo-text"]/font').extract()
     if not promo_text:
         return description, promo_text
     return description, basic.cdata_field(promo_text)

Пример #25

0

Показать файл

    def parse(self, response):
        """Scrape one product page into a SportmanItem.

        A redirected request is written to the XML as NOT_AVAILABLE. A
        normal page is scraped, written to the XML, and the item is then
        cleared so that the returned item only carries 'image_urls'.

        Returns the item, or None when any exception occurred (the product
        status is then set to "error")."""
        self.counter += 1
        basic.print_status(self.counter, self.total)
        hxs = HtmlXPathSelector(response)
        item = SportmanItem()
        redirected = 'redirect_urls' in response.request.meta
        cur_url = response.request.meta['redirect_urls'][0] if redirected else response.url
        index = self.products['urls'].index(cur_url)
        try:
            if redirected:
                item['product_id'] = [self.products['product_ids'][index]]
                item['name'] = [self.products['names'][index]]
                item['in_stock'] = ["NOT_AVAILABLE"]
                self.exc.code_handler(102, response.url)
                self.xml.create_xml(item)
                self.products["status"][index] = "no_avail"
            else:
                (name, short_desc, description, old_price,
                 custom_price, product_id, sku) = self.get_basic_info(hxs)
                item["name"] = name
                item["short_desc"] = short_desc
                item["description"] = description
                item["old_price"] = old_price
                item["custom_price"] = custom_price
                item["product_id"] = product_id
                item["sku"] = sku
                item['in_stock'] = ['IN_STOCK']
                # Only the first two of the eight returned values are used.
                (viewstate, eventval, _prevpage, _hidden,
                 _view_page, _even_page, _pre_page, _hidd_page) = self.get_vars(response, hxs)

                # The viewstate is stored in six fields: five pieces of up to
                # 2000 characters and a final one with everything past 10000.
                bounds = [(None, 2000), (2000, 4000), (4000, 6000),
                          (6000, 8000), (8000, 10000), (10000, None)]
                pieces = [viewstate[start:stop] for start, stop in bounds]
                for number, piece in enumerate(pieces, 1):
                    item["viewstate%d" % number] = [basic.cdata(piece)]
                item["eventval"] = [basic.cdata(eventval)]
                item["size_options"] = self.get_variants(hxs, response)

                images_url = self.get_images(hxs)
                item["normal_image_url"] = self.get_server_path(images_url)

                self.xml.create_xml(item)
                # After writing, drop every field and keep only the image URLs.
                item.clear()
                item['image_urls'] = self.get_images(hxs)
                self.products["status"][index] = "ran"
        except:
            self.exc.code_handler(100, response.url)
            self.products["status"][index] = "error"
        else:
            return item

Пример #26

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

 def make_reviews_json(self, title, text, author, location):
     """Return one CDATA-wrapped JSON object string per review.

     title, text, author and location are parallel sequences; one object
     with "title", "text", "author" and "location" keys is built per
     title. As before, the title value is padded with one space on each
     side.

     The leftover debugging code (length prints followed by os._exit(0))
     was removed: it terminated the whole process before any review was
     built. Values are escaped with json.dumps so quotes or backslashes
     in a review cannot produce invalid JSON."""
     import json

     def quoted(value):
         """Return value, formatted as text, as an escaped JSON string literal."""
         return json.dumps("%s" % (value,))

     jsons = []
     for i in range(0, len(title)):
         review = '{ "title" : %s, "text" : %s, "author" : %s, "location" : %s }' % (
             quoted(" %s " % (title[i],)),
             quoted(text[i]),
             quoted(author[i]),
             quoted(location[i]),
         )
         jsons.append(basic.cdata(review))
     return jsons

Пример #27

0

Показать файл

 def make_reviews_json(self, title, text, author, location):
     """Return one CDATA-wrapped JSON object string per review.

     title, text, author and location are parallel sequences; one object
     with "title", "text", "author" and "location" keys is built per
     title. As before, the title value is padded with one space on each
     side.

     The leftover debugging code (length prints followed by os._exit(0))
     was removed: it terminated the whole process before any review was
     built. Values are escaped with json.dumps so quotes or backslashes
     in a review cannot produce invalid JSON."""
     import json

     def quoted(value):
         """Return value, formatted as text, as an escaped JSON string literal."""
         return json.dumps("%s" % (value,))

     jsons = []
     for i in range(0, len(title)):
         review = '{ "title" : %s, "text" : %s, "author" : %s, "location" : %s }' % (
             quoted(" %s " % (title[i],)),
             quoted(text[i]),
             quoted(author[i]),
             quoted(location[i]),
         )
         jsons.append(basic.cdata(review))
     return jsons

Пример #28

0

Показать файл

Файл: guitar_center_spider.py Проект: marjevtic/testMarko

 def get_images(self, hxs):
     """Collect the thumbnails listed under ``prodDetailThumbs``.

     Returns ``(images_list, img)``: ``images_list`` has one CDATA-wrapped
     JSON string per thumbnail (server path plus product serial) and
     ``img`` has the hrefs exactly as they were extracted.
     """
     hrefs = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
     classes = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
     encoded = []
     originals = []
     entry = {}
     for position, href in enumerate(hrefs):
         entry["image_url"] = self.get_server_path(href)
         originals.append(href)
         css_class = classes[position]
         # The serial is the <li> class with the "site1sku" marker removed.
         entry["product_serial"] = (
             css_class.replace("site1sku", "")
             if "site1sku" in css_class else css_class)
         encoded.append(basic.cdata(simplejson.dumps(entry)))
     return encoded, originals

Пример #29

0

Показать файл

 def get_images(self, hxs):
     """Collect the thumbnails listed under ``prodDetailThumbs``.

     Returns ``(images_list, img)``: ``images_list`` has one CDATA-wrapped
     JSON string per thumbnail (server path plus product serial) and
     ``img`` has the hrefs exactly as they were extracted.
     """
     hrefs = hxs.select('//ul[@id="prodDetailThumbs"]/li/a/@href').extract()
     classes = hxs.select('//ul[@id="prodDetailThumbs"]/li/@class').extract()
     encoded = []
     originals = []
     entry = {}
     for position, href in enumerate(hrefs):
         entry['image_url'] = self.get_server_path(href)
         originals.append(href)
         css_class = classes[position]
         # The serial is the <li> class with the "site1sku" marker removed.
         entry['product_serial'] = (
             css_class.replace("site1sku", "")
             if "site1sku" in css_class else css_class)
         encoded.append(basic.cdata(simplejson.dumps(entry)))
     return encoded, originals

Пример #30

0

Показать файл

 def get_basic_info(self, hxs):
     """Read name, prices and description from a product detail page.

     Returns ``(name, price, old_price, description)``: ``name`` is the
     extracted list as is, ``price`` a one-item list reduced to digits and
     dots, ``old_price`` the same when the page shows one (otherwise the
     empty extraction result) and ``description`` a one-item list.
     """
     def digits_only(raw):
         # keep only digits and dots
         return re.sub('[^0-9.]', '', raw)

     title = hxs.select('//div[@id="proddetail"]/h1/text()').extract()
     current = hxs.select(
         '//div[@id="proddetail"]/div[@class="yourprice bigprice"]/text()'
     ).extract()
     details_html = hxs.select('//div[@id="details"]').extract()[0]
     description = basic.clean_string(basic.cdata(details_html))
     previous = hxs.select(
         '//span[@class="yourprice_product"]/text()').extract()
     if not current:
         # fall back to the PriceDisplay span when the big price is absent
         current = hxs.select('//span[@id="PriceDisplay"]/text()').extract()
     if previous:
         previous = [digits_only(previous[0])]
     return title, [digits_only(current[0])], previous, [description]

Пример #31

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_colors(self, page, color_names):
     """Extract per-color image data from the ``imageMap_0`` javascript.

     Returns ``(colors_json, image_urls)``: one CDATA-wrapped JSON string
     for each entry of ``color_names`` and the original ``enh`` image urls,
     which can be used for downloading later.
     """
     script = basic.get_middle_text(page, 'var imageMap_0 = new Array();', '</script>')[0]
     raw_colors = basic.get_middle_text(script, '] = ', ';')
     encoded = []
     originals = []
     for position, cname in enumerate(color_names):
         entry = simplejson.loads(burton.replace_color_json(raw_colors[position]))
         entry['cname'] = cname
         entry.pop('reg')
         enh = entry['enh']
         # remember the original url, then publish the server-side path
         originals.append(enh)
         entry['enh'] = self.get_server_path(enh)
         encoded.append(basic.cdata(simplejson.dumps(entry)))
     return encoded, originals

Пример #32

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def get_price(self, hxs):
     """Return the product price and, when present, the sale price.

     The price text is looked up in three places in turn and the first
     non-empty extraction wins.  Returns ``([price], discount)`` where
     ``price`` is CDATA-wrapped and ``discount`` is either the empty
     extraction result or the output of ``basic.cdata_field``.
     """
     price_xpaths = (
         '//span[@id="divUnitPrice"]/text()',
         '//div[@id="product_price"]/span[1]/text()',
         '//div[@id="product_price"]/text()',
     )
     for xpath in price_xpaths:
         found = hxs.select(xpath).extract()
         if found:
             break
     discount = hxs.select('//div[@id="product_price"]/span[@class="pc-salePrice"]/text()').extract()
     # collapse runs of spaces, then drop the English/French label
     text = re.sub(" +", " ", basic.clean_string(found[0]))
     text = text.replace("Price:", "").replace("Prix:", "")
     price = basic.cdata(text.strip())
     if discount:
         discount = basic.cdata_field(discount)
     return [price], discount

Пример #33

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def get_recommended(self, hxs):
     """Describe the recommended product shown in the right column.

     Only the first child ``div`` of ``right_column_container`` is turned
     into an entry.  Returns a list of CDATA-wrapped JSON strings with the
     keys ``link``, ``image`` and ``name``.
     """
     blocks = hxs.select('//div[@id="right_column_container"]/div')
     recommended = []
     for position, block in enumerate(blocks):
         if position:
             # every block after the first one is ignored
             continue
         #to do: see how to get full href(different accounts)
         info = {}
         info['link'] = block.select('div/a/@href').extract()[0]
         info['image'] = ("http://www.partylite.biz/imaging/resize"
                          + block.select('div/a/img/@src').extract()[0])
         info['name'] = block.select('div/a/text()').extract()[0]
         recommended.append(basic.cdata(simplejson.dumps(info)))
     return recommended

Пример #34

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_colors(self, page, color_names):
     """Extract per-color image data from the ``imageMap_0`` javascript.

     Returns ``(colors_json, image_urls)``: one CDATA-wrapped JSON string
     for each entry of ``color_names`` and the original ``enh`` image urls,
     which can be used for downloading later.
     """
     script = basic.get_middle_text(page, 'var imageMap_0 = new Array();',
                                    '</script>')[0]
     raw_colors = basic.get_middle_text(script, '] = ', ';')
     encoded = []
     originals = []
     for position, cname in enumerate(color_names):
         entry = simplejson.loads(burton.replace_color_json(raw_colors[position]))
         entry['cname'] = cname
         entry.pop('reg')
         enh = entry['enh']
         # remember the original url, then publish the server-side path
         originals.append(enh)
         entry['enh'] = self.get_server_path(enh)
         encoded.append(basic.cdata(simplejson.dumps(entry)))
     return encoded, originals

Пример #35

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_variants(self, page):
     """Read the ``skuSizeColorObj`` javascript entries of a product page.

     Returns ``(sizes, image_urls, color_names)``: a CDATA-wrapped JSON
     string per entry, the original swatch urls and the color
     descriptions, all in page order.
     """
     script = basic.get_middle_text(page, 'var skuSizeColorObj = new Array();', '</script>')[0]
     encoded, swatches, names = [], [], []
     # the text before the first 'skuSizeColorObj' is not an entry
     for chunk in script.split('skuSizeColorObj')[1:]:
         assigned = basic.get_middle_text(chunk, '= ', ';')
         variant = simplejson.loads(burton.replace_for_json(assigned[0]))
         swatch = variant['swatchURL']
         swatches.append(swatch)
         names.append(variant['ColorDesc'])
         # the emitted JSON carries the server path instead of the site url
         variant['swatchURL'] = self.get_server_path(swatch)
         encoded.append(basic.cdata(simplejson.dumps(variant)))
     return encoded, swatches, names

Пример #36

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_variants(self, page):
     """Read the ``skuSizeColorObj`` javascript entries of a product page.

     Returns ``(sizes, image_urls, color_names)``: a CDATA-wrapped JSON
     string per entry, the original swatch urls and the color
     descriptions, all in page order.
     """
     script = basic.get_middle_text(page,
                                    'var skuSizeColorObj = new Array();',
                                    '</script>')[0]
     encoded, swatches, names = [], [], []
     # the text before the first 'skuSizeColorObj' is not an entry
     for chunk in script.split('skuSizeColorObj')[1:]:
         assigned = basic.get_middle_text(chunk, '= ', ';')
         variant = simplejson.loads(burton.replace_for_json(assigned[0]))
         swatch = variant['swatchURL']
         swatches.append(swatch)
         names.append(variant['ColorDesc'])
         # the emitted JSON carries the server path instead of the site url
         variant['swatchURL'] = self.get_server_path(swatch)
         encoded.append(basic.cdata(simplejson.dumps(variant)))
     return encoded, swatches, names

Пример #37

0

Показать файл

    def get_basic_info(self, hxs):
        """Read the basic fields of a product page.

        Returns ``(name, short_desc, description, old_price, price, id,
        sku)``.  ``sku`` is the full list found after ``Art.nr.`` and ``id``
        is a one-item list with its first entry.  The current price comes
        from the ``nowprice`` span, or from ``normalprice`` when that one is
        missing.
        """
        def amount_after_colon(parts):
            # take the segment after the first ':' and strip 'Kr' and spaces
            joined = " ".join(parts)
            return [joined.split(':')[1].replace('Kr', '').replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select(
            '//div[@class="description2"]/text()').extract()

        description_html = hxs.select(
            '//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = [
            basic.cdata(sportman.delete_tags(re, description_html[0]))]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = amount_after_colon(old_price)

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price == []:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = amount_after_colon(price)

        article = hxs.select('//div[@class="articlenumber"]').extract()
        article = " ".join(article).replace(u"\xa0", "")
        sku = basic.get_middle_text(article, 'Art.nr.', '</div>')

        return name, short_desc, description, old_price, price, [sku[0]], sku

Пример #38

0

Показать файл

Файл: lydias_spider.py Проект: marjevtic/testMarko

 def get_sizes(self, id, hxs):
     """Fetch the size options of product ``id`` from the ajax endpoint.

     The three mode values are read from hidden inputs of the product page
     and sent along with the sku.  Returns a list of CDATA-wrapped
     JSON-like strings, one per size entry of the response.
     """
     showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
     itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
     salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
     url = (
         "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
         + "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
     )
     print("reading page...")
     raw = urllib2.urlopen(url).read()
     print("page read")
     # turn the bracketed javascript arrays into "]"-separated comma lists
     cleaned = raw.replace("'", "").replace("[", ",").replace(",,", "")
     jsons = []
     # the last two pieces after splitting on "]" are not size entries
     for chunk in cleaned.split("]")[:-2]:
         tmp = chunk.split(",")
         entry = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                 "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
         jsons.append(basic.cdata(entry))
     return jsons

Пример #39

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def parse(self, response):
     """Scrape one product response and write it out as XML.

     The product is located in ``self.products['urls']`` by the originally
     requested url (the first entry of ``redirect_urls`` when the request
     was redirected).  A redirected request is reported as
     ``NOT_AVAILABLE`` with code 102 and status ``"no_avail"``; otherwise
     all product fields are collected and the status becomes ``"ran"``.

     Returns the item on success.  When collecting or writing fails, code
     100 is reported, the status becomes ``"error"`` and ``None`` is
     returned.  A url missing from ``self.products['urls']`` raises
     ValueError before any of that handling starts.
     """
     self.counter += 1
     basic.print_status(self.counter, self.total)
     hxs = HtmlXPathSelector(response)
     item = BurtonItem()
     page = hxs.extract()
     redirected = 'redirect_urls' in response.request.meta
     if redirected:
         cur_url = response.request.meta['redirect_urls'][0]
     else:
         cur_url = response.url
     index = self.products['urls'].index(cur_url)
     try:
         if redirected:
             item['product_id'] = [self.products['product_ids'][index]]
             item['name'] = [self.products['names'][index]]
             item['in_stock'] = ["NOT_AVAILABLE"]
             self.exc.code_handler(102, response.url)
             self.xml.create_xml(item)
             self.products["status"][index] = "no_avail"
         else:
             item['product_id'], item['name'] = self.get_basic_info(hxs)
             item['description'], item['features'] = self.get_description(
                 hxs)
             item['variants'], thumb_urls, color_names = self.get_variants(
                 page)
             item['all_sizes'] = self.get_all_sizes(page)
             item['color_json'], image_urls = self.get_colors(
                 page, color_names)
             item['price'], item['old_price'] = self.get_prices(hxs)
             item['in_stock'] = ['IN_STOCK']
             item['product_link'] = [basic.cdata(response.url)]
             self.xml.create_xml(item)
             # image_urls is assigned only after create_xml has been called
             item['image_urls'] = image_urls + thumb_urls
             self.products["status"][index] = "ran"
     except Exception:
         # Exception rather than a bare except, so KeyboardInterrupt and
         # SystemExit are not reported as scraping errors.
         self.exc.code_handler(100, response.url)
         self.products["status"][index] = "error"
     else:
         return item

Пример #40

0

Показать файл

Файл: sportman_spider.py Проект: marjevtic/testMarko

    def get_basic_info(self, hxs):
        """Read the basic fields of a product page.

        Returns ``(name, short_desc, description, old_price, price, id,
        sku)``.  ``sku`` is the full list found after ``Art.nr.`` and ``id``
        is a one-item list with its first entry.  The current price comes
        from the ``nowprice`` span, or from ``normalprice`` when that one is
        missing.
        """
        def amount_after_colon(parts):
            # take the segment after the first ':' and strip 'Kr' and spaces
            joined = " ".join(parts)
            return [joined.split(":")[1].replace("Kr", "").replace(" ", "")]

        name = hxs.select('//div[@id="fragment-1"]/h2/text()').extract()
        short_desc = hxs.select('//div[@class="description2"]/text()').extract()

        description_html = hxs.select('//div[@id="fragment-1"]/div[@class="description"]').extract()
        description = [
            basic.cdata(sportman.delete_tags(re, description_html[0]))]

        old_price = hxs.select('//span[@class="oldprice"]/text()').extract()
        if old_price != []:
            old_price = amount_after_colon(old_price)

        price = hxs.select('//span[@class="nowprice"]/text()').extract()
        if price == []:
            price = hxs.select('//span[@class="normalprice"]/text()').extract()
        price = amount_after_colon(price)

        article = hxs.select('//div[@class="articlenumber"]').extract()
        article = " ".join(article).replace(u"\xa0", "")
        sku = basic.get_middle_text(article, "Art.nr.", "</div>")

        return name, short_desc, description, old_price, price, [sku[0]], sku

Пример #41

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def parse(self, response):
     """Scrape one product response and write it out as XML.

     The product is located in ``self.products['urls']`` by the originally
     requested url (the first entry of ``redirect_urls`` when the request
     was redirected).  A redirected request is reported as
     ``NOT_AVAILABLE`` with code 102 and status ``"no_avail"``; otherwise
     all product fields are collected and the status becomes ``"ran"``.

     Returns the item on success.  When collecting or writing fails, code
     100 is reported, the status becomes ``"error"`` and ``None`` is
     returned.  A url missing from ``self.products['urls']`` raises
     ValueError before any of that handling starts.
     """
     self.counter += 1
     basic.print_status(self.counter, self.total)
     hxs = HtmlXPathSelector(response)
     item = BurtonItem()
     page = hxs.extract()
     redirected = 'redirect_urls' in response.request.meta
     if redirected:
         cur_url = response.request.meta['redirect_urls'][0]
     else:
         cur_url = response.url
     index = self.products['urls'].index(cur_url)
     try:
         if redirected:
             item['product_id'] = [self.products['product_ids'][index]]
             item['name'] = [self.products['names'][index]]
             item['in_stock'] = ["NOT_AVAILABLE"]
             self.exc.code_handler(102, response.url)
             self.xml.create_xml(item)
             self.products["status"][index] = "no_avail"
         else:
             item['product_id'], item['name'] = self.get_basic_info(hxs)
             item['description'], item['features'] = self.get_description(hxs)
             item['variants'], thumb_urls, color_names = self.get_variants(page)
             item['all_sizes'] = self.get_all_sizes(page)
             item['color_json'], image_urls = self.get_colors(page, color_names)
             item['price'], item['old_price'] = self.get_prices(hxs)
             item['in_stock'] = ['IN_STOCK']
             item['product_link'] = [basic.cdata(response.url)]
             self.xml.create_xml(item)
             # image_urls is assigned only after create_xml has been called
             item['image_urls'] = image_urls + thumb_urls
             self.products["status"][index] = "ran"
     except Exception:
         # Exception rather than a bare except, so KeyboardInterrupt and
         # SystemExit are not reported as scraping errors.
         self.exc.code_handler(100, response.url)
         self.products["status"][index] = "error"
     else:
         return item

Пример #42

0

Показать файл

 def get_sizes(self, id, hxs):
     """Fetch the size options of product ``id`` from the ajax endpoint.

     The three mode values are read from hidden inputs of the product page
     and sent along with the sku.  Returns a list of CDATA-wrapped
     JSON-like strings, one per size entry of the response.
     """
     showmode = hxs.select('//input[@name="showmode"]/@value').extract()[0]
     itemmode = hxs.select('//input[@name="itemmode"]/@value').extract()[0]
     salemode = hxs.select('//input[@name="salemode"]/@value').extract()[0]
     url = (
         "http://www.lydiasuniforms.com/ajaxed/product-showoptions.asp?sku=%s&opt1=AV&opt2=-1&type2=l1type" % (id)
         + "&type3=&showmode=%s&itemmode=%s&salemode=%s&rnum=429" % (showmode, itemmode, salemode)
     )
     print("reading page...")
     raw = urllib2.urlopen(url).read()
     print("page read")
     # turn the bracketed javascript arrays into "]"-separated comma lists
     cleaned = raw.replace("'", "").replace("[", ",").replace(",,", "")
     jsons = []
     # the last two pieces after splitting on "]" are not size entries
     for chunk in cleaned.split("]")[:-2]:
         tmp = chunk.split(",")
         entry = '{ "size_short" : " %s ", "size_full" : "%s", "some_number" :\
                 "%s", "some_id" : "%s" }' % (tmp[0], tmp[1], tmp[2], tmp[3])
         jsons.append(basic.cdata(entry))
     return jsons

Пример #43

0

Показать файл

 def parse_whole_xml(self):
     """Build XML for every feed product listed in ``self.no_urls``.

     When ``self.d['download']`` is set the client feed is downloaded
     first; otherwise ``xml/<name>/client_feed.xml`` must already exist, or
     a warning is issued and the process exits with status 2.

     Every ``properties`` element whose ``Id`` is found in
     ``self.no_urls['product_ids']`` is copied field by field into the item
     and handed to ``self.xml.create_xml``; its status becomes ``'ran'``.
     Afterwards every status that is not ``'ran'`` is set to
     ``'not_found'``.

     Returns the absolute image urls collected from all matched products.
     """
     xml_dir = "xml/{0}".format(self.name)
     file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
     downloader = Downloader()
     if self.d['download']:
         downloader.get_file(xml_dir, file_url, "client_feed")
     else:
         if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
             basic.warning(
                 "Feed file doesn't exist please de-select no download option"
             )
             os._exit(2)
     self.number = 0
     # NOTE(review): one item object is shared by all products and never
     # cleared, so a field absent from one product presumably keeps the
     # value of the previous one -- confirm against ChomeItem/create_xml.
     xml_item = ChomeItem()
     urls_all = []
     for event, elem in iterparse('xml/{0}/client_feed.xml'.format(
             self.name)):
         if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
             for r in elem:
                 p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                 if r.tag == p + "Id" and r.text in self.no_urls[
                         'product_ids']:
                     index = self.no_urls['product_ids'].index(r.text)
                     self.no_urls['status'][index] = 'ran'
                     self.number += 1
                     urls = []
                     for x in elem:
                         if x.tag == p + "Id":
                             xml_item['product_id'] = [x.text]
                         elif x.tag == p + "EngLongDesc" and x.text is not None:
                             xml_item['description_english'] = [
                                 self.escape(basic.cdata(x.text))
                             ]
                         elif x.tag == p + "RetailPrice":
                             # the last two characters of the price are cut
                             xml_item['custom_price'] = [x.text[:-2]]
                         elif x.tag == p + "SpnLongDesc" and x.text is not None:
                             xml_item['description_spanish'] = [
                                 self.escape(basic.cdata(x.text))
                             ]
                         elif x.tag == p + "PartNumber":
                             xml_item['add_to_cart_id'] = [x.text]
                         elif x.tag == p + "MaxQty":
                             xml_item['max_qty'] = [x.text]
                         elif x.tag == p + "TimeType":
                             xml_item['time_type'] = [x.text]
                         elif x.tag == p + "SpnName" and x.text is not None:
                             xml_item['name_spanish'] = [x.text]
                         elif x.tag == p + "EngName":
                             xml_item['name_english'] = [x.text]
                         elif x.tag == p + "ImagePath_Large" and x.text is not None:
                             urls.append(self.get_absolute(x.text))
                             xml_item['normal_image_url'] = [
                                 self.get_server_path(
                                     self.get_absolute(x.text))
                             ]
                         elif x.tag == p + "IsActive":
                             # Element text is a string (or None), so the
                             # flag is compared as text; a comparison with
                             # the integer 0 can never match.
                             # NOTE(review): "false" is accepted too in
                             # case the feed serializes a boolean -- confirm
                             # against a real feed file.
                             active_text = (x.text or "").strip().lower()
                             if active_text in ("0", "false"):
                                 xml_item['in_stock'] = ["NOT_IN_STOCK"]
                             else:
                                 xml_item['in_stock'] = ['IN_STOCK']
                         else:
                             for i in range(1, 4):
                                 tag = p + "Alternate%sImagePath_Large" % (
                                     str(i))
                                 if x.tag == tag and x.text is not None:
                                     urls.append(self.get_absolute(x.text))
                                     xml_item['normal_image_url'].append(
                                         self.get_server_path(
                                             self.get_absolute(x.text)))
                                     # change image paths for normal_image_url and return urls
                     self.xml.create_xml(xml_item)
                     urls_all += urls
     for i in range(0, len(self.no_urls['status'])):
         if self.no_urls['status'][i] != 'ran':
             self.no_urls['status'][i] = 'not_found'
     return urls_all

Пример #44

0

Показать файл

Файл: kenneth_spider.py Проект: marjevtic/testMarko

 def get_basic_info(self, hxs):
     """Return the extracted name list and a one-item description list.

     The description is the first ``productDescription`` div, wrapped by
     ``basic.cdata``.
     """
     name = hxs.select('//div[@id="productInfoTop"]/h1/text()').extract()
     description_html = hxs.select('//div[@id="productDescription"]').extract()[0]
     return name, [basic.cdata(description_html)]

Пример #45

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_description(self, hxs):
     """Return the product description and its feature list markup.

     The description is the fourth text node of ``FieldsetProductInfo``;
     the features are the first ``ul`` of that div, cut to 2000 characters.
     Returns ``([description], features)`` with both parts CDATA-wrapped.
     """
     info_texts = hxs.select('//div[@id="FieldsetProductInfo"]/text()').extract()
     description = info_texts[3]
     feature_lists = hxs.select('//div[@id="FieldsetProductInfo"]/ul').extract()
     # keep at most the first list, truncated to 2000 characters
     features = [markup[:2000] for markup in feature_lists[:1]]
     return [basic.cdata(description)], basic.cdata_field(features)

Пример #46

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_all_sizes(self, page):
     """Return every size named in the ``distsizeobj`` javascript array.

     The result is a one-item list holding the CDATA-wrapped JSON dump of
     the extracted values.
     """
     blocks = basic.get_middle_text(page, 'var distsizeobj=new Array();',
                                    'var indexcolor=0;')
     quoted_values = basic.get_middle_text(blocks[0], ']="', '";')
     encoded = simplejson.dumps(quoted_values)
     return [basic.cdata(encoded)]

Пример #47

0

Показать файл

Файл: burton_spider.py Проект: marjevtic/testMarko

 def get_all_sizes(self, page):
     """Return every size named in the ``distsizeobj`` javascript array.

     The result is a one-item list holding the CDATA-wrapped JSON dump of
     the extracted values.
     """
     blocks = basic.get_middle_text(page, 'var distsizeobj=new Array();', 'var indexcolor=0;')
     quoted_values = basic.get_middle_text(blocks[0], ']="', '";')
     encoded = simplejson.dumps(quoted_values)
     return [basic.cdata(encoded)]

Пример #48

0

Показать файл

Файл: sportman_spider.py Проект: marjevtic/testMarko

    def get_variants(self, hxs, response):
        """Describe the color/size variants of a product page.

        The color and size sections of the page are inspected separately;
        each one either contains a ``<select name`` drop-down or plain text
        plus a hidden input.  When the size text is empty, one extra page is
        requested per color value through ``self.get_data`` to read the
        sizes for that color.

        Returns a list of CDATA-wrapped JSON strings with the keys
        ``color``, ``color_value``, ``size`` and ``size_value``.
        """
        page = hxs.select("//html").extract()
        page = " ".join(page)
        # NOTE(review): this single dict is reused for every appended entry;
        # it is serialized before the next mutation, so entries stay distinct.
        dict_one = {}
        test_one = []

        # Isolate the markup of the color div and split it on the drop-down
        # marker: one piece means there is no <select> inside.
        temp = page.split('<div class="color">')
        temp = temp[1].split("</div>")
        temp = temp[0].split("<select name")

        # Only view_page, even_page, pre_page and hidd_page are used below.
        viewstate, eventvalidation, previouspage, hiddenfield, view_page, even_page, pre_page, hidd_page = self.get_vars(
            response, hxs
        )

        if len(temp) == 1:
            # Single color: plain text in the div, value in a hidden input.
            color = hxs.select('//div[@class="color"]/text()').extract()
            value = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant1Hidden"]/@value').extract()
            color[0] = color[0].replace("  ", "")
            color = basic.clean_string(color[0])
            # NOTE(review): value is reduced to its first entry here, so in
            # this branch value[i] / color[i] further down index into single
            # items rather than lists -- confirm this is intended.
            value = value[0]

        #            color = basic.clean_string(color[0])
        #            color = color.replace("  ","")
        #
        #            dict['color'] = color
        #            dict['color_value'] = value[0]

        else:
            # Several colors: read option labels and values of the drop-down
            # that follows the "farge</option>" placeholder.
            test_color = basic.get_middle_text(temp[1], "farge</option>", "</select>")
            color = basic.get_middle_text(test_color[0], '">', "</option>")
            value = basic.get_middle_text(test_color[0], 'value="', '">')

            for i in range(0, len(color)):
                color[i] = color[i].replace("  ", "")
            #
            #                dict['color'] = color
            #                dict['color_value'] = value

        # Same inspection for the size div.
        size_temp = page.split('<div class="size">')
        size_temp = size_temp[1].split("</div>")
        size_temp = size_temp[0].split("<select name")

        if len(size_temp) == 1:
            size = hxs.select('//div[@class="size"]/text()').extract()
            size = basic.clean_string(size[0])
            size = [size.replace("   ", "")]

            size_val = hxs.select('//input[@id="ctl00_ContentPlaceHolder1_Variant2Hidden"]/@value').extract()

            if size[0] == "":
                # No size text on this page: request one page per color
                # value and read the sizes from each response.
                for i in range(len(value)):
                    resp_page = self.get_data(response, hidd_page, view_page, pre_page, even_page, value[i])

                    a_page = resp_page.split('<div class="siz')
                    a_page = a_page[1].split("</select>")

                    if len(a_page) == 1:

                        # no drop-down in the response: a single size
                        size = basic.get_middle_text(a_page[0], 'e">', '<input type="hidden"')
                        size_val = basic.get_middle_text(a_page[0], 'value="', '"')
                        size_val = size_val[0]
                        size_val = [size_val]

                    else:
                        a_page = basic.get_middle_text(a_page[0], "se</option>", "</select>")
                        size = basic.get_middle_text(a_page[0], '">', "</option>")
                        size_val = basic.get_middle_text(a_page[0], 'value="', '">')

                    dict_one["color"] = color[i]
                    dict_one["color_value"] = value[i]
                    dict_one["size_value"] = size_val

                    for x in range(0, len(size)):
                        size[x] = basic.clean_string(size[x])
                        size[x] = size[x].replace("   ", "")

                        # NOTE(review): assigned inside the loop, so "size"
                        # is only (re)set when size is non-empty.
                        dict_one["size"] = size

                    # NOTE(review): json.dumps here, simplejson.dumps in the
                    # branch below -- presumably interchangeable; confirm.
                    test_one.append(basic.cdata(json.dumps(dict_one)))

            else:
                dict_one["color"] = color

                dict_one["color_value"] = value
                dict_one["size"] = size
                dict_one["size_value"] = size_val
                test_one.append(basic.cdata(simplejson.dumps(dict_one)))

        else:
            # Size drop-down present on the page itself: read its option
            # labels and values after the "se</option>" placeholder.
            test_size = basic.get_middle_text(size_temp[1], "se</option>", "</select>")
            size = basic.get_middle_text(test_size[0], '">', "</option>")
            size_val = basic.get_middle_text(test_size[0], 'value="', '">')

            for x in range(0, len(size)):
                size[x] = basic.clean_string(size[x])
                size[x] = size[x].replace("   ", "")

            dict_one["color"] = color
            dict_one["color_value"] = value
            dict_one["size"] = size
            dict_one["size_value"] = size_val

            test_one.append(basic.cdata(json.dumps(dict_one)))

        return test_one

Пример #49

0

Показать файл

Файл: partylite_spider.py Проект: marjevtic/testMarko

 def get_description(self, hxs):
     """Return the item description as a one-item list.

     Tags are removed from the first ``item_description`` div, the text is
     CDATA-wrapped and every fraction slash (U+2044) becomes a plain "/".
     """
     blocks = hxs.select('//div[@id="item_description"]').extract()
     wrapped = basic.cdata(basic.remove_tags(blocks[0]))
     return [wrapped.replace(u"\u2044", "/")]

Пример #50

0

Показать файл

Файл: chome_spider.py Проект: marjevtic/testMarko

 def parse_whole_xml(self):
     """Build XML for every feed product listed in ``self.no_urls``.

     When ``self.d['download']`` is set the client feed is downloaded
     first; otherwise ``xml/<name>/client_feed.xml`` must already exist, or
     a warning is issued and the process exits with status 2.

     Every ``properties`` element whose ``Id`` is found in
     ``self.no_urls['product_ids']`` is copied field by field into the item
     and handed to ``self.xml.create_xml``; its status becomes ``'ran'``.
     Afterwards every status that is not ``'ran'`` is set to
     ``'not_found'``.

     Returns the absolute image urls collected from all matched products.
     """
     xml_dir = "xml/{0}".format(self.name)
     file_url = "https://svc.celebratinghome.com/ZMags.svc/ProductInfo1"
     downloader = Downloader()
     if self.d['download']:
         downloader.get_file(xml_dir, file_url, "client_feed")
     else:
         if not os.path.exists('xml/{0}/client_feed.xml'.format(self.name)):
             basic.warning("Feed file doesn't exist please de-select no download option")
             os._exit(2)
     self.number = 0
     # NOTE(review): one item object is shared by all products and never
     # cleared, so a field absent from one product presumably keeps the
     # value of the previous one -- confirm against ChomeItem/create_xml.
     xml_item = ChomeItem()
     urls_all = []
     for event, elem in iterparse('xml/{0}/client_feed.xml'.format(self.name)):
         if elem.tag == "{http://schemas.microsoft.com/ado/2007/08/dataservices/metadata}properties":
             for r in elem:
                 p = "{http://schemas.microsoft.com/ado/2007/08/dataservices}"
                 if r.tag == p + "Id" and r.text in self.no_urls['product_ids']:
                     index = self.no_urls['product_ids'].index(r.text)
                     self.no_urls['status'][index] = 'ran'
                     self.number += 1
                     urls = []
                     for x in elem:
                         if x.tag == p + "Id":
                             xml_item['product_id'] = [x.text]
                         elif x.tag == p + "EngLongDesc" and x.text is not None:
                             xml_item['description_english'] = [self.escape(basic.cdata(x.text))]
                         elif x.tag == p + "RetailPrice":
                             # the last two characters of the price are cut
                             xml_item['custom_price'] = [x.text[:-2]]
                         elif x.tag == p + "SpnLongDesc" and x.text is not None:
                             xml_item['description_spanish'] = [self.escape(basic.cdata(x.text))]
                         elif x.tag == p + "PartNumber":
                             xml_item['add_to_cart_id'] = [x.text]
                         elif x.tag == p + "MaxQty":
                             xml_item['max_qty'] = [x.text]
                         elif x.tag == p + "TimeType":
                             xml_item['time_type'] = [x.text]
                         elif x.tag == p + "SpnName" and x.text is not None:
                             xml_item['name_spanish'] = [x.text]
                         elif x.tag == p + "EngName":
                             xml_item['name_english'] = [x.text]
                         elif x.tag == p + "ImagePath_Large" and x.text is not None:
                             urls.append(self.get_absolute(x.text))
                             xml_item['normal_image_url'] = [self.get_server_path(self.get_absolute(x.text))]
                         elif x.tag == p + "IsActive":
                             # Element text is a string (or None), so the
                             # flag is compared as text; a comparison with
                             # the integer 0 can never match.
                             # NOTE(review): "false" is accepted too in
                             # case the feed serializes a boolean -- confirm
                             # against a real feed file.
                             active_text = (x.text or "").strip().lower()
                             if active_text in ("0", "false"):
                                 xml_item['in_stock'] = ["NOT_IN_STOCK"]
                             else:
                                 xml_item['in_stock'] = ['IN_STOCK']
                         else:
                             for i in range(1, 4):
                                 tag = p + "Alternate%sImagePath_Large" % (str(i))
                                 if x.tag == tag and x.text is not None:
                                     urls.append(self.get_absolute(x.text))
                                     xml_item['normal_image_url'].append(self.get_server_path(self.get_absolute(x.text)))
                                     # change image paths for normal_image_url and return urls
                     self.xml.create_xml(xml_item)
                     urls_all += urls
     for i in range(0, len(self.no_urls['status'])):
         if self.no_urls['status'][i] != 'ran':
             self.no_urls['status'][i] = 'not_found'
     return urls_all

Python cdata примеры использования