Example #1: parse_ann_family (Ann Taylor product pages)
    def parse_ann_family(self, response):
        hxs = HtmlXPathSelector(response)

        # find name of item
        item_name_path = hxs.select('//div[@class="hd-info"]//h1/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            print "Invalid link:  " + str(response.url)
            return (False, None)
        item_name = item_name_path.extract()[0]
        logging.critical("Name: " + str(item_name))

        self.count_scraped += 1


        meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
        if len(meta_tag_url) > 0:
            prod_url = meta_tag_url.extract()[0]
        else:
            prod_url = response.url
        logging.critical("PRODUCT URL:" + str(prod_url) + " ITEM_NAME " + str(item_name) + " TOTAL SO FAR " + str(self.count_scraped))

        # Ann Taylor is for women only
        gender = 'F'

        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
            return (False, None)
        else:
            self.items_scraped.append(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) + " SALE PRICE " + str(sale_price_))
        if price_ > sale_price_:
            logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " + str(price_) + " SALE PRICE " + str(sale_price_))


        # extract image URL
        prod_img_path = hxs.select('//img[@id="productImage"]/@src')
        prod_img_url = str(prod_img_path.extract()[0])
        logging.critical("Image URL: " + str(prod_img_url))


        # find description and keywords: these will be useful in categorization
        desc = hxs.select('//div[@class="gu gu-first description"]/p/text()').extract()
        prod_desc = ''.join(desc)
        logging.critical("Description: " + prod_desc)

        # promo text
        # DIDN'T FIND ANY
        #promo_path = hxs.select('//span[@class="cat-pro-promo-text"]//font/text()').extract()
        #promo_str = str(promo_path)
        #logging.critical("Promotion: ")
        #logging.critical(promo_str)
        promo_str = ""



        product, created_new = self._create_product_item(item_name, item_id_, str(prod_url), price_, \
                                                         sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()


        #self._store_in_file(response, item_id_)
        #raise CloseSpider('Blah')
        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
        #raise SystemExit

        return (True, product)
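
These parse_* callbacks all follow the same shape: Scrapy callbacks that pull product fields out of a retailer page and persist them through the project's Django helpers. Below is a minimal sketch of how parse_ann_family might be wired into a crawler, assuming the pre-1.0 Scrapy API (HtmlXPathSelector, SgmlLinkExtractor) these examples use; the class name, domains, URL patterns and wrapper callback are made up for illustration.

# Hypothetical wiring only: the spider class, domains, URL patterns and the
# thin wrapper below are assumptions; parse_ann_family is the method shown in
# Example #1 and is assumed to live on this class.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class AnnTaylorSpider(CrawlSpider):
    name = "anntaylor"
    allowed_domains = ["anntaylor.com"]
    start_urls = ["http://www.anntaylor.com/"]

    rules = (
        # follow category listings, hand product pages to the wrapper callback
        Rule(SgmlLinkExtractor(allow=(r"/catalog/",)), follow=True),
        Rule(SgmlLinkExtractor(allow=(r"/product/",)), callback="parse_product_page"),
    )

    def __init__(self, *args, **kwargs):
        super(AnnTaylorSpider, self).__init__(*args, **kwargs)
        # counters and dedupe containers referenced by the callbacks above
        self.invalid_links = 0
        self.count_scraped = 0
        self.items_scraped = []
        self.all_items_scraped = set()
        self.urls_scraped = set()

    def parse_product_page(self, response):
        # the example callbacks persist products via the Django ORM themselves
        # and return an (ok, product) tuple, so the wrapper just invokes them
        self.parse_ann_family(response)
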
Example #2: parse_pinterest
    def parse_pinterest(self, response):
        #self.check_shelfit_validity(response)
        #return (False, None)
        hxs = HtmlXPathSelector(response)

        # find name of item
        item_name_path = hxs.select('//title/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            print "Invalid link:  " + str(response.url)
            return (False, None)
        item_name = item_name_path.extract()[0]
        if '|' in item_name:
            index = item_name.find('|')
            item_name = item_name[0:index]
        logging.critical("Name: " + item_name.encode('utf-8'))

        self.count_scraped += 1

        prod_url = response.url
        logging.critical("PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
                         item_name.encode('utf-8') + " TOTAL SO FAR " +
                         str(self.count_scraped))

        gender = 'F'
        if "www.bananarepublic.com" in prod_url or 'www.gap.com' in prod_url:
            gender_path = hxs.select(
                '//a/img[contains (@class, "_selected")]/@alt')
            if len(gender_path) > 0:
                gender__ = gender_path.extract()[0].lower()
                # 'women' also contains 'men', so rule that case out explicitly
                if ('men' in gender__ or 'boy' in gender__) and 'women' not in gender__:
                    gender = 'M'

        logging.critical("GENDER: " + gender)
        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
            return (False, None)
        else:
            self.items_scraped.append(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                         " SALE PRICE " + str(sale_price_))
        if price_ > sale_price_:
            logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " +
                             str(price_) + " SALE PRICE " + str(sale_price_))

        # extract image URL
        prod_img_path = hxs.select('//img[@id="productImage"]/@src')
        if len(prod_img_path) > 0:
            prod_img_url = str(prod_img_path.extract()[0])
            logging.critical("Image URL: " + str(prod_img_url))
        else:
            prod_img_url = ""

        # find description and keywords: these will be useful in categorization
        prod_desc = ''
        logging.critical("Description: " + prod_desc)

        # promo text
        promo_str = ""



        product, created_new = self._create_product_item(item_name, item_id_, str(prod_url), price_, \
                                                         sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if product is None:
            logging.critical(
                "Product is None----SHOULDN'T HAPPEN!!!!!******************")
            #import sys
            #sys.exit(1)

        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()

        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))

        return (True, product)
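
The duplicate guard above keeps self.items_scraped as a list (it is appended to), so every membership test is a linear scan. If the item IDs are hashable, a set makes the same check O(1); a small sketch of that variant, assuming nothing depends on the list's ordering:

import logging


class DedupeMixin(object):
    """Sketch: the same duplicate guard as above, but backed by a set."""

    def __init__(self):
        # set membership is O(1); the list used above is O(n) per check
        self.items_scraped = set()

    def already_scraped(self, item_id_):
        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
            return True
        self.items_scraped.add(item_id_)
        return False
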
Example #3: parse_product (The Children's Place product pages)
    def parse_product(self, response):
        print "parse_product %s" % response
        #        return

        #self.check_shelfit_validity(response)
        #return
        hxs = HtmlXPathSelector(response)

        # find name of item
        item_name_path = hxs.select('//div[@id="product_info"]/h2/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            return (False, None)
        item_name = item_name_path.extract()
        logging.critical("Name: " + str(item_name))

        self.count_scraped += 1
        '''
        PLAYING NICE: sleeping for 1min after crawling every 100 pages
        '''
        #        if self.count_scraped % 100 == 0:
        #            sleep(60) # sleep for 1 mins for express

        prod_url = response.url
        logging.critical("PRODUCT URL:" + str(prod_url) + " TITLE " +
                         str(item_name) + " TOTAL SO FAR " +
                         str(self.count_scraped))

        # find gender
        gender = 'Nil'
        try:
            gen_strs = hxs.select('//h3[@class="s_here open"]').extract()
            for gen_str in gen_strs:
                if 'boy' in gen_str.lower():
                    gender = 'M'
                if 'girl' in gen_str.lower():
                    gender = 'F'
        except:
            pass
#        if prod_url.lower().find('women') >= 0 or prod_url.lower().find('girl') >= 0:
#            gender = 'F'
        logging.critical("Gender: " + gender)
        '''
        TODO: if same page has multiple items, our logic will not work.
        So, leaving it for future.
        '''
        #        if len(item_name) == 0:
        #            logging.critical("DIDN'T FIND TITLE AT NORMAL PLACE, MUST BE SUIT. RETURNING." + str(prod_url))
        #            print item_name_path
        #            print "Size of response " + str(len(str(response)))
        #            print str(response)
        #            return (False, None)

        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        #        print item_id_
        #        return

        if item_id_ in self.all_items_scraped:
            print "RETURNING since we have already scraped " + str(item_id_)

        self.all_items_scraped.add(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                         " SALE PRICE " + str(sale_price_))

        # extract image URL
        # <meta content="http://www.childrensplace.com/www/b/TCP/images/cloudzoom/p/136532_p.jpg" property="og:image">
        prod_img_path = hxs.select('//meta[@property="og:image"]/@content')
        prod_img_url = str(prod_img_path.extract()[0])
        #        prod_img_url = prod_img_str[28: len(prod_img_str) - 2]
        logging.critical("Image URL: " + str(prod_img_url))

        # find description and keywords: these will be useful in categorization
        #        <div id="tab-content">
        #          <dl class="tabs">
        #            <dt id="tab_description" class="tab_here tab_here" width="91" style="display: block; left: 0px;">Description</dt>
        #            <dd style="display: block;">
        #              <p>Rev up his look with this cute style!</p>

        #        desc = hxs.select('//div[@id="tab-content"]/dl[@class="tabs"]/dd[0]/p[0]/text()').extract()
        desc = hxs.select(
            '//div[@id="tab-content"]/dl[@class="tabs"]/dd/p/text()').extract(
            )
        logging.critical("Description: ")
        logging.critical(desc)
        prod_desc = desc[0]

        # promo text
        promo_str = 'Nil'
        #        promo_path = hxs.select('//span[@class="cat-pro-promo-text"]//font/text()').extract()
        #        promo_str = str(promo_path)
        #        logging.critical("Promotion: ")
        #        logging.critical(promo_str)




        product, created_new = self._create_product_item(item_name[0], item_id_, str(prod_url), price_, \
                                            sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if not created_new:
            return (False, product)

        # past this point created_new is True, so categorize the new product
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()
        #self._create_category(product, categories)

        #self._store_in_file(response, item_id_)
        #raise CloseSpider('Blah')
        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
        #raise SystemExit

        return (True, product)
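
Each example funnels newly created products through simple_product_categorization and copies cat1/cat2/cat3 back onto the model before saving. The real categorizer lives elsewhere in the project; below is only a hypothetical stub that shows the dict contract these callbacks rely on, with assumed Product field names.

def simple_product_categorization(product):
    """Hypothetical stand-in for the project's categorizer, not the real one.

    The callbacks above only rely on getting back a dict with "cat1", "cat2"
    and "cat3" keys; the `name` and `description` attributes read here are
    assumed Product field names.
    """
    text = ((getattr(product, "name", "") or "") + " " +
            (getattr(product, "description", "") or "")).lower()
    cat2 = "dresses" if "dress" in text else "other"
    return {"cat1": "apparel", "cat2": cat2, "cat3": ""}
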
Example #4: parse_aeaglefamily (American Eagle product pages)
File: aeagle.py  Project: khsr/django-shelf
    def parse_aeaglefamily(self, response):
        #self.check_shelfit_validity(response)
        #return (False, None)
        hxs = HtmlXPathSelector(response)

        # find name of item
        item_name_path = hxs.select('//h1[@class="pName"]/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            print "Invalid link:  " + str(response.url)
            return (False, None)
        item_name = item_name_path.extract()[0]
        logging.critical("Name: " + item_name.encode('utf-8'))

        self.count_scraped += 1

        meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
        if len(meta_tag_url) > 0:
            prod_url = meta_tag_url.extract()[0]
        else:
            prod_url = response.url

        logging.critical("PRODUCT URL:" + str(prod_url) + " ITEM_NAME " +
                         item_name.encode('utf-8') + " TOTAL SO FAR " +
                         str(self.count_scraped))

        gender = 'F'

        logging.critical("GENDER: " + gender)
        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + str(item_id_))
            return (False, None)
        else:
            self.items_scraped.append(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                         " SALE PRICE " + str(sale_price_))
        if price_ > sale_price_:
            logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " +
                             str(price_) + " SALE PRICE " + str(sale_price_))

        # extract image URL
        meta_tag_url = hxs.select('//meta[@property="og:image"]/@content')
        if len(meta_tag_url) > 0:
            prod_img_url = meta_tag_url.extract()[0]
        else:
            prod_img_url = ""

        # find description and keywords: these will be useful in categorization
        prod_desc = ''
        logging.critical("Description: " + prod_desc)

        # promo text
        promo_str = ""



        product, created_new = self._create_product_item(item_name, item_id_, str(prod_url), price_, \
                                                         sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if product is None:
            logging.critical(
                "Product is None----SHOULDN'T HAPPEN!!!!!******************")
            #import sys
            #sys.exit(1)

        ### HANDLE CATEGORIZATION
        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()

        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))

        return (True, product)
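
Examples #4 and #6 repeat the same check-then-extract pattern for Open Graph meta tags, falling back to a default when the tag is absent. A small helper that factors out that pattern; the function name and signature are made up for illustration.

def og_content(hxs, og_property, default=""):
    """Return the content of a <meta property="og:..."> tag, or `default`.

    Illustrative helper only; it just factors out the len()-check-then-extract
    pattern used for og:url and og:image above.
    """
    path = hxs.select('//meta[@property="%s"]/@content' % og_property)
    if len(path) > 0:
        return path.extract()[0]
    return default

# usage, mirroring parse_aeaglefamily:
#   prod_url = og_content(hxs, "og:url", default=response.url)
#   prod_img_url = og_content(hxs, "og:image", default="")
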
Example #5: parse_express (Express product pages)
    def parse_express(self, response):
        #self.check_shelfit_validity(response)
        #return
        hxs = HtmlXPathSelector(response)

        # find name of item
        item_name_path = hxs.select(
            '//div[@id="cat-pro-con-detail"]//h1/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            return (False, None)
        item_name = item_name_path.extract()
        logging.critical("Name: " + str(item_name))

        self.count_scraped += 1

        prod_url = response.url
        logging.critical("PRODUCT URL:" + str(prod_url) + " TITLE " +
                         str(item_name) + " TOTAL SO FAR " +
                         str(self.count_scraped))

        # find gender
        gender = 'M'
        if prod_url.lower().find('women') >= 0 or prod_url.lower().find(
                'girl') >= 0:
            gender = 'F'
        logging.critical("Gender: " + gender)
        '''
        TODO: if same page has multiple items, our logic will not work.
        So, leaving it for future.
        '''
        if len(item_name) == 0:
            logging.critical(
                "DIDN'T FIND TITLE AT NORMAL PLACE, MUST BE SUIT. RETURNING." +
                str(prod_url))
            print item_name_path
            print "Size of response " + str(len(str(response)))
            print str(response)
            return (False, None)

        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        if item_id_ in self.all_items_scraped:
            print "RETURNING since we have already scraped " + str(item_id_)

        self.all_items_scraped.add(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                         " SALE PRICE " + str(sale_price_))

        # extract image URL
        prod_img_path = hxs.select('//link[@rel="image_src"]')
        prod_img_str = str(prod_img_path.extract()[0])
        prod_img_url = prod_img_str[28:len(prod_img_str) - 2]
        logging.critical("Image URL: " + str(prod_img_url))

        # find description and keywords: these will be useful in categorization
        desc = hxs.select(
            '//div[@id="cat-pro-con-detail"]//li[@class="cat-pro-desc"]/text()'
        ).extract()
        logging.critical("Description: ")
        logging.critical(desc)
        prod_desc = desc

        # promo text
        promo_path = hxs.select(
            '//span[@class="cat-pro-promo-text"]//font/text()').extract()
        promo_str = str(promo_path)
        logging.critical("Promotion: ")
        logging.critical(promo_str)




        product, created_new = self._create_product_item(item_name[0], int(item_id_), str(prod_url), price_, \
                                            sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if not created_new:
            return (False, product)

        # past this point created_new is True, so categorize the new product
        new_cat = simple_product_categorization(product)
        product.cat1 = new_cat["cat1"]
        product.cat2 = new_cat["cat2"]
        product.cat3 = new_cat["cat3"]
        product.save()
        #self._create_category(product, categories)

        #self._store_in_file(response, item_id_)
        #raise CloseSpider('Blah')
        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
        #raise SystemExit

        return (True, product)
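
The image URL in Example #5 is recovered by slicing a fixed character range out of the serialized <link> element, which breaks as soon as the attribute order or quoting changes. Selecting the href attribute directly is sturdier; a sketch of that alternative, assuming the tag carries a plain href.

def extract_image_src(hxs):
    """Sketch: replacement for the fixed-offset slicing in Example #5.

    Selects the href attribute of <link rel="image_src" href="..."> directly
    instead of slicing characters out of the serialized element.
    """
    path = hxs.select('//link[@rel="image_src"]/@href')
    return path.extract()[0] if len(path) > 0 else ""
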
Example #6: parse_dsw (DSW product pages)
    def parse_dsw(self, response):
        hxs = HtmlXPathSelector(response)

        meta_tag_item_name = hxs.select('//meta[@property="og:title"]/@content')
        if len(meta_tag_item_name) > 0:
            item_name = meta_tag_item_name.extract()[0]
        else:
            item_name_path = hxs.select('//title/text()')
            if len(item_name_path) > 0:
                item_name = item_name_path.extract()[0]
            else:
                logging.error("Not a product page: " + response.url)
                return (False, None)
        logging.critical(smart_str(item_name))

        self.count_scraped += 1


        meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
        if len(meta_tag_url) > 0:
            prod_url = meta_tag_url.extract()[0]
        else:
            prod_url = response.url

        logging.critical("PRODUCT URL:" + smart_str(prod_url) + " TITLE " + smart_str(item_name) + \
                         " TOTAL SO FAR " + str(self.count_scraped))


        # find gender
        gender = 'M'
        if prod_url.lower().find('women') >= 0 or prod_url.lower().find('girl') >= 0:
            gender = 'F'
        logging.critical("Gender: " + gender)


        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs)

        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + smart_str(item_id_) + ". RETURNING.")
            return  (True, None)
        else:
            self.items_scraped.append(item_id_)

        logging.critical("ITEM_ID " + item_id_ + " PRICE " + smart_str(price_) + " SALE PRICE " + smart_str(sale_price_))
        if price_ > sale_price_:
            logging.critical("SALE on ITEM_ID " + smart_str(item_id_) + " PRICE " + smart_str(price_) +\
                             " SALE PRICE " + smart_str(sale_price_))


        meta_img_url = hxs.select('//meta[@property="og:image"]/@content')
        if len(meta_img_url) > 0:
            prod_img_url = meta_img_url.extract()[0]
        else:
            prod_img_url = ""
        logging.critical("Image URL: " + smart_str(prod_img_url))


        # find description and keywords: these will be useful in categorization
        desc = hxs.select('//meta[@property="og:description"]/@content')
        if len(desc) > 0:
            desc_content = desc.extract()[0]
        else:
            desc_content = ''
        logging.critical("Description: " + str(desc_content.encode('utf-8')))
        prod_desc = desc_content

        promo_str = ''

        product, created_new = self._create_product_item(item_name, item_id_, str(prod_url), price_, \
                                            sale_price_, gender, prod_img_url, promo_str, prod_desc)


        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()

        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
        #raise SystemExit

        return (True, product)
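
Example #6 logs with smart_str instead of calling .encode('utf-8') by hand. Assuming the usual import from Django's encoding utilities, smart_str turns unicode into a UTF-8 bytestring under Python 2, which is why it can be concatenated straight into these log messages.

# Assumed import for Example #6: smart_str comes from Django's encoding
# utilities and, under Python 2, turns unicode into a UTF-8 bytestring.
import logging
from django.utils.encoding import smart_str

name = u"caf\u00e9 print wedge sandal"   # made-up name with a non-ASCII character
logging.critical("Name: " + smart_str(name))
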
Example #7: parse_nyc (New York & Company product pages)
File: nyco.py  Project: khsr/django-shelf
    def parse_nyc(self, response):
        hxs = HtmlXPathSelector(response)
        # find name of item
        item_name_path = hxs.select('//h1/text()')
        if len(item_name_path) == 0:
            self.invalid_links += 1
            return (False, None)
        item_name = item_name_path.extract()
        logging.critical("Name: " + str(item_name))

        self.count_scraped += 1

        '''
        PLAYING NICE: optionally sleep after crawling every 100 pages
        (currently a no-op: sleep(0); use sleep(60) to actually pause for a minute)
        '''
        if self.count_scraped % 100 == 0:
            sleep(0)

        can_url_path = hxs.select('//link[@rel="canonical"]/@href')
        if len(can_url_path) > 0:
            prod_url = can_url_path.extract()[0]
        else:
            prod_url = response.url
        logging.critical("PRODUCT URL:" + str(prod_url) + " TITLE " + str(item_name) + " TOTAL SO FAR " + str(self.count_scraped))

        # find gender
        gender = 'F'
        logging.critical("Gender: " + gender)


        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs, prod_url)

        if item_id_ in self.all_items_scraped:
            print "RETURNING since we have already scraped " + str(item_id_)

        self.all_items_scraped.add(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) + " SALE PRICE " + str(sale_price_))

        # extract image URL
        img_str = re.findall('strLarge = ["\w\d\/:_\$.?]+', str(response.body))
        prod_img_url = ""
        if len(img_str) > 0:
            img_str_ = img_str[0]
            img_str_parts = img_str_.split()
            if len(img_str_parts) > 2:
                prod_img_url = img_str_parts[2].strip('"')
        if prod_img_url == "":
            logging.critical("PROBLEM with Image URL for " + str(response.url))
        logging.critical("Image URL: " + str(prod_img_url))


        # find description and keywords: these will be useful in categorization
        desc = hxs.select('//p[@class="itemstyle_pdp"]/span[@class="details"]/text()').extract()
        logging.critical("Description: ")
        logging.critical(desc)
        prod_desc = desc

        # promo text
        promo_str = ""

        product, created_new = self._create_product_item(response.url, item_name[0], item_id_, str(prod_url), price_, \
                                            sale_price_, gender, str(prod_img_url), promo_str, prod_desc)

        if product is None:
            logging.critical("PROBLEM: product is None for URL " + str(response.url))

        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()

        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))

        return (True, product)
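
Example #7 digs the image URL out of inline JavaScript rather than the DOM: the regex grabs the strLarge assignment, and the split/strip picks the quoted URL out of it. A standalone demonstration of that extraction against a made-up script fragment:

import re

# Made-up fragment of the inline JS the regex in Example #7 is aimed at.
body = 'var strLarge = "http://example.com/images/large/12345_L.jpg"; loadZoom();'

img_str = re.findall('strLarge = ["\w\d\/:_\$.?]+', body)
prod_img_url = ""
if len(img_str) > 0:
    img_str_parts = img_str[0].split()   # ['strLarge', '=', '"http://...jpg"']
    if len(img_str_parts) > 2:
        prod_img_url = img_str_parts[2].strip('"')

print prod_img_url   # http://example.com/images/large/12345_L.jpg
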
Example #8: parse_jcrew (J.Crew product pages)
    def parse_jcrew(self, response):
        hxs = HtmlXPathSelector(response)

        meta_tag_item_name = hxs.select(
            '//meta[@property="og:title"]/@content')
        if len(meta_tag_item_name) > 0:
            item_name = meta_tag_item_name.extract()[0]
        else:
            item_name_path = hxs.select('//title/text()')
            if len(item_name_path) > 0:
                item_name = item_name_path.extract()[0]
            else:
                logging.error("Not a product page: " + response.url)
                return (False, None)
        logging.critical(item_name.encode('utf-8'))

        self.count_scraped += 1

        meta_tag_url = hxs.select('//meta[@property="og:url"]/@content')
        if len(meta_tag_url) > 0:
            prod_url = meta_tag_url.extract()[0]
        else:
            prod_url = response.url

        logging.critical("PRODUCT URL:" + str(prod_url) + " TITLE " + str(item_name.encode('utf-8')) + \
                         " TOTAL SO FAR " + str(self.count_scraped))

        # find gender
        gender = 'M'
        if prod_url.lower().find('women') >= 0 or prod_url.lower().find(
                'girl') >= 0:
            gender = 'F'
        logging.critical("Gender: " + gender)

        # find price and sale price
        item_id_, price_, sale_price_ = self._find_price(hxs)

        if item_id_ in self.items_scraped:
            logging.critical("ITEM ALREADY SCRAPED " + str(item_id_) +
                             ". RETURNING.")
            return (True, None)
        else:
            self.items_scraped.append(item_id_)

        logging.critical("ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +
                         " SALE PRICE " + str(sale_price_))
        if price_ > sale_price_:
            logging.critical("SALE on ITEM_ID " + str(item_id_) + " PRICE " + str(price_) +\
                             " SALE PRICE " + str(sale_price_))

        # extract image URL
        prod_img_path = hxs.select(
            '//div[contains (@class, "prod_main_img")]/a/img[contains (@src, "http")]/@src'
        )
        prod_img_url = prod_img_path.extract()
        logging.critical("Image URL: " + str(prod_img_url))

        # find description and keywords: these will be useful in categorization
        desc = hxs.select('//meta[@property="og:description"]/@content')
        if len(desc) > 0:
            desc_content = desc.extract()[0]
        else:
            desc_content = ''
        logging.critical("Description: " + str(desc_content.encode('utf-8')))

        keywords = hxs.select('//meta[@name="keywords"]/@content').extract()
        keywords_content = keywords[0]
        logging.critical("Keywords: ")
        logging.critical(keywords_content)

        prod_desc = desc_content + "\n" + keywords_content
        print "Length of prod_desc " + str(len(prod_desc))

        promo_str = ''

        product, created_new = self._create_product_item(item_name, item_id_, str(prod_url), price_, \
                                            sale_price_, gender, str(prod_img_url[0]), promo_str, prod_desc)
        print "gender " + str(product.gender)
        if created_new:
            new_cat = simple_product_categorization(product)
            product.cat1 = new_cat["cat1"]
            product.cat2 = new_cat["cat2"]
            product.cat3 = new_cat["cat3"]
            product.save()

        error = hxs.select('//span[@class="select-error"]/text()')
        if len(error) > 0:
            logging.critical("Error: " + (error.extract()[0]).encode('utf-8'))
        #self._store_in_file(response, item_id_)
        #raise CloseSpider('Blah')
        logging.critical("Total unique items: " + str(len(self.all_items_scraped)) + " we have scraped so far: " +\
                          str(self.count_scraped) + " Unique URLs scraped: " + str(len(self.urls_scraped)))
        #raise SystemExit

        return (True, product)