Example #1
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        #TODO: test if this xpath should include other types of pages
        description_text_holder = hxs.select(
            "//p[@class='subtitle grey']/text()").extract()
        description_title_holder = hxs.select(
            "//h1/text()[normalize-space()!='']").extract()

        if description_text_holder:
            item['description_text'] = description_text_holder[0]

            description_tokenized = Utils.normalize_text(
                item['description_text'])
            item['description_wc'] = len(description_tokenized)

            # guard: some pages have a description but no non-empty <h1> title,
            # and indexing an empty list would raise an IndexError
            if description_title_holder:
                item['description_title'] = description_title_holder[0]
                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item
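
Every example in this set leans on two `Utils` helpers whose implementations are not shown. From the call sites, `normalize_text` returns a list of tokens (its length becomes `description_wc`), and `phrases_freq(title, text)` returns a `(keyword_count, keyword_density)` pair. A minimal sketch of what they might look like, assuming simple whitespace tokenization and a rough substring count (the real helpers may be stricter):

import re

def normalize_text(text):
    # lowercase, strip punctuation, split on whitespace
    return re.sub(r"[^\w\s]", " ", text.lower()).split()

def phrases_freq(title, text):
    # count occurrences of the title phrase in the text, and express
    # them as a density relative to the text's word count
    tokens = normalize_text(text)
    phrase = " ".join(normalize_text(title))
    count = " ".join(tokens).count(phrase) if phrase else 0
    density = float(count) / len(tokens) if tokens else 0.0
    return count, density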
Example #2
 def _parse_category(self, response):
     category = response.meta['category']
     parent = response.meta.get('parent', {})
     category['catid'] = self._get_catid()
     category['url'] = response.url
     category['parent_text'] = parent.get('text')
     category['parent_url'] = parent.get('url')
     category['parent_catid'] = parent.get('catid')
     category['grandparent_text'] = parent.get('parent_text')
     category['grandparent_url'] = parent.get('parent_url')
     category['level'] = parent.get('level', 0) + 1
     category['department_text'] = response.meta['department']['text']
     category['department_url'] = response.meta['department']['url']
     category['department_id'] = response.meta['department']['catid']
     #category['description_text'] = self._description_text.first(response)
     description_text = first(response.xpath(self._xpath_description_text).extract())
     if description_text:
         category['description_text'] = description_text
         category['description_wc'] = len(Utils.normalize_text(description_text))
     keywords = first(response.xpath(self._xpath_keywords).extract())
     if description_text and keywords:
         (category['keyword_count'], category['keyword_density']) = Utils.phrases_freq(keywords, description_text)
     if category.get('nr_products') is None:
         # first() returns None when nothing matches; guard before regexing
         nr_products = re_find(r'\d+', first(response.css(self._css_product_numbers_text).extract()) or '')
         category['nr_products'] = int(nr_products) if nr_products is not None else None
     subcategory_links = LinkExtractor(restrict_xpaths=self._xpath_category_links)
     for link in subcategory_links.extract_links(response):
         match = re.search(r'(.+?) \((\d+)\) *', link.text)
         if not match:
             # skip links whose text does not carry a "(N)" product count
             continue
         text, nr_products = match.groups()
         child = CategoryItem(text=text, nr_products=int(nr_products))
         meta = {'category': child, 'department': response.meta['department'], 'parent': category}
         yield Request(link.url, callback=self._parse_category, meta=meta)
     yield category
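
Example #2 also uses `first` and `re_find`, which look like the helpers of the same names from the funcy library. If that dependency isn't wanted, equivalents are a few lines each (a sketch, assuming `first` returns None for an empty sequence and `re_find` returns the first match or None):

import re

def first(seq):
    # first element of an iterable, or None if it is empty
    for item in seq:
        return item
    return None

def re_find(regex, s):
    # first match of regex in s, or None if there is no match
    m = re.search(regex, s)
    return m.group() if m else None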
Example #3
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract number of products if available
        #TODO check
        count_holder = hxs.select("//div[@class='recordCount']/span[@id='RecordCount_1']/text()")
        if count_holder:
            item['nr_products'] = int(count_holder.extract()[0])

        #TODO
        # try to change URL "Category" to "SubCategory", see if you find the product count there

        # extract description if available
        description_holders = hxs.select("//div[@id='bcaShopWindowSEO']")
        # if the description holder exists on the page
        if description_holders:
            description_texts = description_holders.select(".//text()[not(ancestor::h2)]").extract()

            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([re.sub("\s+"," ", description_text.strip()) for description_text in description_texts if description_text.strip()])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

            description_title = description_holders.select(".//h2/text()").extract()
            if description_title:
                item['description_title'] = description_title[0].strip()

                (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        parent = item

        #TODO
        # extract and parse subcategories
        subcats = hxs.select("//dl[@class='categoryList primaryNav']/dd/a")
        for subcat in subcats:
            item = CategoryItem()
            
            item['text'] = subcat.select("text()").extract()[0].strip()

            #TODO: check out some huge URLs
            item['url'] = self.clean_url(subcat.select("@href").extract()[0])

            item['parent_text'] = parent['text']
            item['parent_url'] = parent['url']
            item['level'] = parent['level'] - 1
            item['department_text'] = response.meta['department_text']
            item['department_url'] = response.meta['department_url']
            item['department_id'] = response.meta['department_id']

            yield Request(url = item['url'], callback = self.parseCategory, meta = {"item" : item, \
                "department_text" : response.meta['department_text'], "department_url" : response.meta['department_url'], "department_id" : response.meta['department_id']})
Example #4
    def parseSubcategory(self, response):
        hxs = HtmlXPathSelector(response)

        subcategory = response.meta['item']

        # yield this subcategory
        yield subcategory

        # if the subcategory was special, mark all its subsubcategories as special too
        special = 'special' in subcategory

        # get its subcategories
        subsubcategories = hxs.select(
            "//div[@class='product-category-expanded']//h3[@class='title']")

        for subsubcategory in subsubcategories:
            item = CategoryItem()
            item['text'] = subsubcategory.select("a/text()").extract()[0]
            item['url'] = Utils.add_domain(
                subsubcategory.select("a/@href").extract()[0], self.base_url)

            if special:
                item['special'] = 1

            item['parent_text'] = subcategory['text']
            item['parent_url'] = subcategory['url']
            item['department_text'] = subcategory['department_text']
            item['department_url'] = subcategory['department_url']
            item['department_id'] = subcategory['department_id']

            item['level'] = subcategory['level'] - 1

            description_text_holder = subsubcategory.select(
                "following-sibling::p[@class='description'][1]/text()"
            ).extract()
            if description_text_holder:
                item['description_text'] = description_text_holder[0]
                item['description_title'] = item['text']
                description_tokenized = Utils.normalize_text(
                    item['description_text'])
                item['description_wc'] = len(description_tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

            # parse subcategory page to get product count, or further subsubcategory
            yield Request(item['url'],
                          callback=self.parseSubcategoryPage,
                          meta={'item': item})
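
`Utils.add_domain` appears in several of these spiders; from its call sites it resolves relative hrefs against the site's base URL and leaves absolute URLs alone. A minimal sketch assuming exactly that behavior (Python 2, like the rest of the code):

from urlparse import urljoin

def add_domain(url, domain):
    # leave absolute URLs untouched; resolve relative ones against the domain
    if url.startswith("http://") or url.startswith("https://"):
        return url
    return urljoin(domain, url)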
Example #5
 def _populate_from_html(self, response):
     """Set html-dependant fields"""
     category = response.meta['category']
     #description = response.xpath('//div[@class="category-description std"]/*[not(a[@class="viewAllCats"])]')
     description = response.xpath('//div[@class="category-description std"]/node()')
     description = SelectorList(filter(lambda itm: not len(itm.css('.viewAllCats')), description))
     # strip before the "or None", so an all-whitespace extract can't crash .strip() on None
     description = ' '.join(description.extract()).strip(' \n\r\t') or None
     desc_title = (response.css('.category-title h1::text').extract() or [None])[0]
     self._set_value(category, 'description_text', description)
     self._set_value(category, 'description_title', desc_title)
     tokenized = Utils.normalize_text(description) if description else []
     category['description_wc'] = len(tokenized)
     if description and desc_title:
         category['keyword_count'], category['keyword_density'] = Utils.phrases_freq(desc_title, description)
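
`_set_value` is not shown either; given that it is called with possibly-None values, it presumably assigns the field only when a value was actually extracted. A plausible sketch of the method:

def _set_value(self, item, key, value):
    # assign only when a value is present, so absent fields
    # stay absent instead of being set to None
    if value is not None:
        item[key] = value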
Example #6
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # get parent item from response, extract additional info and return it
        item = response.meta['parent']

        # add department name, url and id for item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract product count if available
        nr_items_holder = hxs.select(
            "//div[@id='showing']/strong[position()=2]/text()").extract()
        if nr_items_holder:
            item['nr_products'] = int(nr_items_holder[0])

        # extract description if available
        # these are descriptions for services pages
        desc_title_holder = hxs.select(
            "//div[@id='searchstate']/a[position()=2]/text()").extract()
        if desc_title_holder:
            item['description_title'] = desc_title_holder[0].strip()
        desc_content_holder = hxs.select(
            "//div[@class='content']/h3/text()").extract()
        if desc_content_holder:
            item['description_text'] = desc_content_holder[0].strip()
            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)
            (item['keyword_count'],
             item['keyword_density']) = Utils.phrases_freq(
                 item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        yield item

        # extract its subcategories
        #subcats_holders = hxs.select("//div[@class='narrowcontent']/ul[@class='search']")
        subcats_holders = hxs.select(
            "//div[@class='narrowcontent']/ul[contains(@class,'search')]")
        if subcats_holders:
            subcats_holder = subcats_holders[0]
            # these are subcategories if they are preceded by the title "Shop ..."
            title = subcats_holder.select(
                "parent::node()/preceding-sibling::node()//text()"
            ).extract()[0]
            # don't coerce with str(): the extracted title may be unicode
            if title.startswith("Shop"):
                subcats = subcats_holder.select(".//li/a")
                for subcat in subcats:
                    item = CategoryItem()
                    item['text'] = subcat.select("text()").extract()[0].strip()
                    item['url'] = Utils.add_domain(
                        subcat.select("@href").extract()[0],
                        "http://www.bestbuy.com")
                    parent = response.meta['parent']
                    item['level'] = int(response.meta['level']) - 1
                    # if parent was special, this category is special too
                    if 'special' in parent:
                        item['special'] = 1
                    item['parent_text'] = parent['text']
                    item['parent_url'] = parent['url']

                    request = Request(url = item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                        'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                    yield request
    def parseDept(self, response):

        # for "copy & print" there's an exception, we don't need zipcode

        # # use selenium to complete the zipcode form and get the first results page
        # driver = webdriver.Firefox()
        # driver.get(response.url)

        # # set a hardcoded value for zipcode
        # zipcode = "12345"

        # textbox = driver.find_element_by_name("zipCode")
        # textbox.send_keys(zipcode)

        # button = driver.find_element_by_id("submitLink")
        # button.click()

        # cookie = {"zipcode": zipcode}
        # driver.add_cookie(cookie)

        # time.sleep(5)

        # # convert html to "nice format"
        # text_html = driver.page_source.encode('utf-8')
        # #print "TEXT_HTML", text_html
        # html_str = str(text_html)

        # # this is a hack that initiates a "TextResponse" object (taken from the Scrapy module)
        # resp_for_scrapy = TextResponse('none',200,{},html_str,[],None)

        # hxs = HtmlXPathSelector(resp_for_scrapy)

        #TODO: doesn't extract Televisions, for example

        hxs = HtmlXPathSelector(response)
        categories = hxs.select("//h2/a")

        root_url = "http://www.staples.com"

        # from parent's page:
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items, if any
        nritems_holder = hxs.select(
            "//div[@class='perpage']/span[@class='note']/text()").extract()
        if nritems_holder:
            m = re.findall("[0-9]+\s*items", nritems_holder[0])
            if m:
                item['nr_products'] = int("".join(re.findall("[0-9]+", m[0])))
            # else:
            #     print "NOT MATCH ", nritems_holder[0]

        # extract description, if any
        description_texts = hxs.select(
            "//h2[@class='seo short']//text() | //h2[@class='seo short long']//text()"
        ).extract()
        if description_texts and any(
                line.strip() for line in description_texts):
            # replace all whitespace with one space, strip, and remove empty texts; then join them
            item['description_text'] = " ".join([
                re.sub(r"\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            if item['description_text']:
                item['description_title'] = item['text']

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])

            else:
                # if no description is found
                #print 'desc_holder but no desc_text ', response.URL
                item['description_wc'] = 0
        else:
            item['description_wc'] = 0

        # yield item the request came from (parent)
        yield item

        # extract subcategories
        for category in categories:
            # there are pages that don't have categories
            item = CategoryItem()
            text = category.select("text()").extract()
            if text:
                item['text'] = text[0]
            url = category.select("@href").extract()
            if url:
                item['url'] = root_url + url[0]
            item['level'] = int(response.meta['level'] - 1)
            if 'text' in response.meta['parent']:
                item['parent_text'] = response.meta['parent']['text']
            else:
                print 'no text in parent ', response.meta['parent']
            item['parent_url'] = response.url

            # yield the item after passing it through request and collecting additional info
            #yield item

            # extract subcategories if any
            zipcode = "12345"
            request = Request(item['url'], callback = self.parseDept, cookies = {"zipcode" : zipcode}, \
                headers = {"Cookie" : "zipcode=" + zipcode}, meta = {"dont_redirect" : True, "dont_merge_cookies" : True, \
                "parent": item, "level": item['level'], \
                "department_text" : response.meta["department_text"], "department_url" : response.meta["department_url"], "department_id" : response.meta["department_id"]})
            yield request
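
A note on the request at the end of parseDept: the zipcode is set both through Scrapy's cookies argument and as a raw Cookie header. Because dont_merge_cookies=True disables the cookie middleware for this request, the cookies argument is effectively ignored and only the raw header reaches the server. The same pattern in isolation (a sketch; the hardcoded zipcode comes from the code above):

from scrapy.http import Request

def make_zipcode_request(url, callback, zipcode="12345"):
    # the cookie middleware is bypassed, so the zipcode must be
    # sent as a literal Cookie header
    return Request(url,
                   callback=callback,
                   headers={"Cookie": "zipcode=" + zipcode},
                   meta={"dont_redirect": True,
                         "dont_merge_cookies": True})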
Example #8
    def parseCategory(self, response):

        # if we are blocked by a captcha and haven't exhausted our retries,
        # solve it and redirect back here
        if self.has_captcha(
                response.body) and ('retry_count' not in response.meta
                                    or response.meta['retry_count'] > 0):
            yield self.solve_captcha_and_redirect(
                response, self.parseCategory
            )  # meta of response will contain number of retries left if set
            return

        hxs = HtmlXPathSelector(response)

        # extract additional info for received parent and return it
        item = response.meta['item']

        # extract product count if available and not already extracted (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
        if 'nr_products' not in item:
            prod_count_holder = hxs.select(
                "//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                prod_count = prod_count_holder[0]
                # extract number

                # for paged results: Showing ... out of ... Results
                m = re.match(".*\s*of\s+([0-9,]+)\s+Results\s*", prod_count)

                # for one page results: Showing ... Result(s)
                if not m:
                    m = re.match(".*\s+([0-9,]+)\s+Results?\s*", prod_count)

                if m:
                    item['nr_products'] = int(re.sub(",", "", m.group(1)))

        # extract description if available
        # only extracts descriptions that contain an h2. is that good?
        desc_holders = hxs.select(
            "//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
        # select the one among these with the most text
        #TODO: another idea: check if the holder has a h2 item
        if desc_holders:
            maxsize = 0
            max_desc_holder = desc_holders[0]
            for desc_holder in desc_holders:
                size = len(" ".join(desc_holder.select(".//text()").extract()))

                if size > maxsize:
                    maxsize = size
                    max_desc_holder = desc_holder
            desc_holder = max_desc_holder
            desc_title = desc_holder.select("h2/text()").extract()
            if desc_title:
                item['description_title'] = desc_title[0].strip()

            description_texts = desc_holder.select(
                ".//text()[not(ancestor::h2)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            # if there is a description title or the description body is large enough
            size_threshold = 50
            if (description_texts
                    and any(line.strip() for line in description_texts)):  # and \
                #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub(r"\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                if desc_title:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        # if item is found among EXTRA_TOPLEVEL_CATEGORIES_URLS, and no product count was found, add info from that url
        extra_category = self.find_matching_key(
            item['text'], self.EXTRA_TOPLEVEL_CATEGORIES_URLS)

        # crawl lower level categories
        if item['level'] > self.LEVEL_BARRIER:
            if extra_category:

                # collect number of products from this alternate URL
                # this will also extract subcategories and their count
                yield Request(
                    self.EXTRA_TOPLEVEL_CATEGORIES_URLS[extra_category],
                    callback=self.extractSubcategories,
                    meta={'item': item})

            else:
                # extract subcategories and their count for category even if not in extra_...
                yield Request(item['url'],
                              callback=self.extractSubcategories,
                              meta={'item': item})
        else:
            yield item
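
The captcha guard at the top of this parseCategory relies on two spider methods that are not shown. A plausible sketch, assuming the solver simply re-requests the page with one fewer retry left in meta (a real implementation would submit the captcha solution first; the default budget of 3 retries is an assumption):

def has_captcha(self, body):
    # crude check for a captcha block page
    return "captcha" in body.lower()

def solve_captcha_and_redirect(self, response, callback):
    # retry the same URL, carrying the remaining retry budget in meta
    retries = response.meta.get('retry_count', 3) - 1
    return Request(response.url,
                   callback=callback,
                   dont_filter=True,
                   meta={'retry_count': retries})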
class WalmartCaSpider(BaseSpider):
    name = "walmartca"
    allowed_domains = ["walmart.ca"]
    start_urls = [
        "http://www.walmart.ca/en",
    ]

    def __init__(self, outfile=None):
        self.root_url = "http://www.walmart.ca"
        self.outfile = outfile

        # set flag that indicates that for this spider, nr of products for each category should be computed
        self.compute_nrproducts = True

        # level that is considered to contain departments
        self.DEPARTMENT_LEVEL = 1

        # keep crawled items represented by (url, parent_url, department_url) tuples
        # to eliminate duplicates
        # (adding department_url makes sure that if an entire department is found as a subcategory of another, for example, both (and their complete category trees) will be crawled)
        self.crawled = []

        # last used category id, used for autoincrementing ids identifying categories
        self.id_count = 0

        # hardcoded values for special categories' item counts. Currently used for 'Value of the day', which typically has a fixed number of products and nowhere on the page to extract it from
        self.special_itemcount = {'value of the day': 2}

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        #links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavM']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div[3]//a[@class='NavXLBold']")
        #parent_links = hxs.select("//div[@class='MidContainer']/div/div/div[not(@class)]//a[@class='NavXLBold']")

        parent_links = hxs.select(
            "//div[@class='linkGroup']/div[not (@class)]/a[@class='NavXLBold'][@href]"
        )

        # #TODO: check this
        # item['nr_products'] = -1
        # yield item
        #yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item})

        department_id = 0

        for link in parent_links:
            item = CategoryItem()

            #TO remove:
            # # link to artificial parent category
            # item['parent_catid'] = 0

            item['text'] = link.select('text()').extract()[0]
            item['url'] = link.select('@href').extract()[0]

            # add domain if relative URL
            item['url'] = Utils.add_domain(item['url'], self.root_url)

            item['level'] = 1

            department_id += 1

            # send category page to parseCategory function to extract description and number of products and add them to the item
            yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                'department_text' : item['text'], 'department_url' : item['url'], 'department_id' : department_id})

    # parse category page and extract description and number of products
    def parseCategory(self, response):

        # URLs like health.walmart.com don't have body_as_unicode and generate an exception
        try:
            hxs = HtmlXPathSelector(response)
        except AttributeError, e:
            self.log("Could not get response from " + response.url +
                     "; original exception: " + str(e) + "\n",
                     level=log.WARNING)
            return

        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex only ones whose text contains at least one letter
            # (for example, customer-rating subcats have no name, only a picture with the number of stars; we don't want those)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is no count field, extract prodcount from subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate parentheses surrounding number and convert to int
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:

        # first search for the description id they usually use,
        # second one is used more rarely and also with some false positives so needs to be checked for text length as well
        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with much text (> DESC_LEN (200) characters)
        # this is going to be a paragraph in the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length for the entire text content?
            # I think it means entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description;
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element or an <h1> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and any(
                    line.strip() for line in description_texts):
                description_text = " ".join([
                    re.sub(r"\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                # if it's larger than 4096 characters and not found with the main rule, it's probably not a description; it causes problems for the PHP script as well. Ignore it
                if len(description_text) < 4096 or found:

                    # replace all whitespace with one space, strip, and remove empty texts; then join them
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes here there is no description title because of malformed html
                # if we can find description text but not description title, title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # check if there is a result-count field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                r"\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))

        # set item count for special items (hardcoded in special_itemcount)
        if item['text'].lower() in self.special_itemcount:
            item['nr_products'] = self.special_itemcount[item['text'].lower()]

        # Extract subcategories if no product count found
        if 'nr_products' in item:
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something else than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending parent and collecting nr products: send all of these subcats as a list in meta, pass it on; when the list becomes empty, yield the parent
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
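
WalmartCaSpider tracks visited (url, parent_url, department_url) tuples in a plain list, so every membership test in parseCategory scans everything crawled so far. A set gives the same deduplication with O(1) lookups; a sketch of the substitution (a hypothetical helper, not part of the original spider):

class DedupTracker(object):
    """O(1) duplicate tracking with a set instead of a list."""

    def __init__(self):
        self.crawled = set()

    def should_crawl(self, url, parent_url, department_url):
        # test-and-add in one place, so callers can simply guard the Request
        key = (url, parent_url, department_url)
        if key in self.crawled:
            return False
        self.crawled.add(key)
        return True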
Example #10
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # output received parent element after extracting additional info
        item = response.meta['parent']

        # add department name, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # extract number of items if available
        prod_count_holder = hxs.select(
            "//span[@id='productCount']/text()").extract()
        if prod_count_holder:
            item['nr_products'] = int(prod_count_holder[0].strip())
        # extract description if available
        desc_holder = hxs.select("//div[@id='catalogCopyBlock']")
        if desc_holder:
            item['description_title'] = desc_holder.select(
                "h2/text()").extract()[0]
            description_texts = desc_holder.select("p/text()").extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and reduce(
                    lambda x, y: x or y,
                [line.strip() for line in description_texts]):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                (item['keyword_count'],
                 item['keyword_density']) = Utils.phrases_freq(
                     item['description_title'], item['description_text'])
            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        yield item

        chapters = hxs.select("//li[@class='nav_cat_item_bold']")

        for chapter in chapters:

            #TODO: still includes some special categories (like "Coming Soon" in men)
            # exclude "Brands" chapter
            chapter_name = chapter.select("span/text()").extract()
            if not chapter_name or "brands" in chapter_name[0].lower():
                continue

            subcats = chapter.select("ul/li/a")
            for subcat in subcats:
                item = CategoryItem()
                text = subcat.select('text()').extract()[0]
                # if it starts with "Shop all", ignore it
                if re.match("Shop [aA]ll.*", text):
                    continue
                else:
                    item['text'] = text
                # remove unnecessary suffix from URL
                url = subcat.select('@href').extract()[0]
                m = re.match("(.*\?id=[0-9]+)&?.*", url)
                if m:
                    item['url'] = m.group(1)
                else:
                    item['url'] = url
                item['level'] = int(response.meta['level']) - 1
                item['parent_text'] = response.meta['parent']['text']
                item['parent_url'] = response.url

                #yield item

                yield Request(item['url'], callback = self.parseCategory, meta = {'parent' : item, 'level' : item['level'], \
                    'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
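
The suffix-stripping regex above keeps everything up to the id query parameter and drops whatever tracking suffix follows. A quick illustration (the URL is made up):

import re

url = "http://example.com/category?id=123&cm_sp=tracking-code"
m = re.match(r"(.*\?id=[0-9]+)&?.*", url)
print m.group(1) if m else url  # -> http://example.com/category?id=123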
Example #11
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        # extract additional info for received parent and return it
        item = response.meta['item']

        # extract product count if available and not already extracted (in extract_itemcount_and_subcategories, from the menu on the left, without crawling the actual url)
        if 'nr_products' not in item:
            prod_count_holder = hxs.select("//h2[@class='resultCount']/span/text()").extract()
            if prod_count_holder:
                prod_count = prod_count_holder[0]
                # extract number
                m = re.match(".*\s*of\s*([0-9,]+)\s*Results\s*", prod_count)
                if m:
                    item['nr_products'] = int(re.sub(",","",m.group(1)))

        # extract description if available
        # only extracts descriptions that contain an h2. is that good?
        desc_holders = hxs.select("//div[@class='unified_widget rcmBody'][descendant::h2][last()]")
        # select the one among these with the most text
        #TODO: another idea: check if the holder has a h2 item
        if desc_holders:
            maxsize = 0
            max_desc_holder = desc_holders[0]
            for desc_holder in desc_holders:
                size = len(" ".join(desc_holder.select(".//text()").extract()))

                if size > maxsize:
                    maxsize = size
                    max_desc_holder = desc_holder
            desc_holder = max_desc_holder
            desc_title = desc_holder.select("h2/text()").extract()
            if desc_title:
                item['description_title'] = desc_title[0].strip()
            
            description_texts = desc_holder.select(".//text()[not(ancestor::h2)]").extract()

            # if the list is not empty and contains at least one non-whitespace item
            # if there is a description title or the description body is large enough
            size_threshold = 50
            if (description_texts and any(line.strip() for line in description_texts)):  # and \
            #(desc_title or len(" ".join(description_texts.select(".//text()").extract()) > size_threshold)):
                # replace all whitespace with one space, strip, and remove empty texts; then join them
                item['description_text'] = " ".join(
                    [re.sub(r"\s+", " ", description_text.strip())
                     for description_text in description_texts
                     if description_text.strip()])

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                if desc_title:
                    (item['keyword_count'], item['keyword_density']) = Utils.phrases_freq(item['description_title'], item['description_text'])
            
            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0


        # if item is found among extra_toplevel_categories_urls, and no product count was found, add info from that url
        extra_category = self.find_matching_key(item['text'], self.extra_toplevel_categories_urls)

        #yield item

        # crawl level 0 categories (only for their product count and subcategories - no descriptions...)

        if 'nr_products' not in item or item['level'] > self.LEVEL_BARRIER:
            if extra_category:
            
                # collect number of products from this alternate URL
                # this will also extract subcategories and their count
                yield Request(self.extra_toplevel_categories_urls[extra_category], callback = self.extract_nrprods_and_subcats, meta = {'item' : item})

            else:
                # extract subcategories and their count for category even if not in extra_...
                yield Request(item['url'], callback = self.extract_nrprods_and_subcats, meta = {'item' : item})
        else:
            yield item
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)

        item = response.meta['item']

        # extract number of products if available
        nrproducts_holder = hxs.select(
            "//div[@class='resultsfilterBottom']/div[@class='itemsShowresult']/strong[2]/text()"
        ).extract()
        if nrproducts_holder:
            item['nr_products'] = int(nrproducts_holder[0])

        # extract description if available
        description_holders = hxs.select("//div[@class='textBlock']")
        # if the description holder exists on the page
        if description_holders:
            description_texts = description_holders.select(
                ".//text()[not(ancestor::h2)]").extract()

            # replace all whitespace with one space, strip, and remove empty texts; then join them
            desc_text = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])
            if desc_text:
                item['description_text'] = desc_text

                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)
            else:
                item['description_wc'] = 0

            description_title = description_holders.select(
                ".//h2/text()").extract()
            if description_title:
                item['description_title'] = description_title[0].strip()

                if desc_text:

                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])
        else:
            item['description_wc'] = 0

        self.parsed_urls.append(item['url'])

        yield item

        # extract subcategories
        product_links = hxs.select(
            "//div[@class='resultsWrap listView']//h3[@class='itemName']/a/@href"
        ).extract()
        # only extract subcategories if product links not found on page
        if not product_links:

            parent = item

            # search for a link to "See All Products"
            seeall = hxs.select(
                "//span[text()='See All Products']/parent::node()/@href"
            ).extract()
            if seeall:
                # pass the page with subcategories menu to a method to parse it
                #print 'parsing seeall: from ', response.url, ' to ', Utils.add_domain(seeall[0], "http://www.tigerdirect.com")
                yield Request(url = Utils.add_domain(seeall[0], "http://www.tigerdirect.com"), callback = self.parseSubcats, \
                    meta = {'parent' : parent,\
                     'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                     'department_id' : response.meta['department_id']})
            else:
                # pass the current page (with subcategories menu on it) to a method to parse it
                #print 'parsing for subcategories ', response.url
                yield Request(url = response.url, callback = self.parseSubcats, meta = {'parent' : parent,\
                    'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'],\
                    'department_id' : response.meta['department_id']})
Example #13
    def parseCategory(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        # Add department text, url and id to item
        item['department_text'] = response.meta['department_text']
        item['department_url'] = response.meta['department_url']
        item['department_id'] = response.meta['department_id']

        # assign unique id
        item['catid'] = self.id_count
        self.id_count += 1

        # Extract subcategories breakdown if any ("classification" field)
        classification_criteria = hxs.select(
            "//form[@id='refine']//h6[@class='AdvSearchSubhead']")
        classification_dictionary = {}
        for criterion in classification_criteria:
            criterion_name = criterion.select(
                ".//text()[normalize-space()!='']").extract()[0].strip()
            # extract subcategories by this criterion:
            # find first subcategories list element following this criterion name, ignore if subcategory text starts with "See " ("See fewer", "See more")
            subcategories = criterion.select(
                "following-sibling::div[contains(@class,'accordionContainer')][1]/ul[@class='MainMenu AdvSearchMenu']/li/a[not(contains(text(), 'See '))]"
            )
            # then filter by regex only ones whose text contains at least one letter
            # (for example, customer-rating subcats have no name, only a picture with the number of stars; we don't want those)
            subcategories = filter(
                lambda x: x.select("text()").re(".*[A-Za-z]+.*"),
                subcategories)

            # if we found these, create the classification dictionary
            if criterion_name and subcategories:
                subcategories_list = []
                for subcategory in subcategories:
                    subcategory_name = subcategory.select(
                        "@title").extract()[0]
                    # replace &nbsp with space, trim
                    subcategory_name = subcategory_name.replace("&nbsp",
                                                                " ").strip()
                    # extract product count
                    subcategory_prodcount = subcategory.select(
                        "span[@class='count']/text()").extract()
                    # if there is no count field, extract prodcount from subcategory name
                    if subcategory_prodcount:
                        m = re.match("\(([0-9]+)\)",
                                     subcategory_prodcount[0].strip())
                        # eliminate parentheses surrounding number and convert to int
                        if m:
                            subcategory_prodcount = m.group(1)
                        else:
                            subcategory_prodcount = subcategory_prodcount[
                                0].strip()
                    else:
                        # if there is no product count in separate element, try to extract it from subcategory name
                        subcategory_name = subcategory.select(
                            ".//text()[normalize-space()!='']").extract(
                            )[0].replace("&nbsp", " ").replace(u"\xa0",
                                                               " ").strip()
                        m = re.match("(.*)\(([0-9]+)\)", subcategory_name)
                        if m:
                            subcategory_prodcount = m.group(2)
                            subcategory_name = m.group(1).strip()

                    if subcategory_name and subcategory_prodcount:
                        subcategory_item = {
                            "name": subcategory_name,
                            "nr_products": int(subcategory_prodcount)
                        }
                        subcategories_list.append(subcategory_item)

                classification_dictionary[criterion_name] = subcategories_list

        if classification_dictionary:
            item['classification'] = classification_dictionary

        ##########################################################################################
        #
        # Extract description title, text, wordcount, and keyword density (if any)

        ###########################################
        #TODO:
        # Exceptions:
        #   http://www.walmart.com/cp/5431?povid=cat1078944-env506746-moduleA030213-lLinkLHNRelatedCategories2Pharmacy - finds wrong title (also wrong description holder - too high level)
        #   http://www.walmart.com/cp/1102793?povid=cat1094926-env999999-moduleA030713-lLinkLHNLearnmoreAbouttheprogram - finds description, actually no description, CustomPOV... with large text inside, hard to fix
        #   http://brands.walmart.com/fishing/essential-rods-and-reels/ - finds description, actually no description. Just an element with much text
        #   http://brands.walmart.com/fishing/get-salty-with-your-bass-skills/ - finds description, actually no description. Just an element with much text
        #   http://instoresnow.walmart.com/article.aspx?Center=Pets&id=104225 - finds description, actually no description. Just an element with much text
        #   http://brands.walmart.com/fishing/turn-a-kid-on-to-flyfishing/ - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/cp/1094926?povid=cat121828-env999999-moduleA030713-lLinkGNAV1_Campaign_EmpoweringWomenTogether - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/ip/Straight-Talk-Samsung-Galaxy-S-III/23573710?povid=cat1105910-env542259-moduleA092613-lLinkLHNWhatsNewSamsungSIIIStraightTalk - finds description, actually no description. Just an element with much text
        #   http://www.walmart.com/cp/Bakery/120764 - finds description, actually no description. Just an element with much text, also title problem
        #   http://www.walmart.com/cp/1078665 - not a description, also imperfect title extraction
        #   http://www.walmart.com/cp/1101244?povid=cat1100706-env999999-module122012-LHN_HealthyLivingTips - wrong title extraction, extracts too much as a description holder
        #   http://www.walmart.com/cp/flexible-spending-account/555326 - finds description though no description, just large text (also bad title extraction)

        # Idea for excluding elements with much text that are false positives: check if element is composed of many sibling paragraphs or so
        ###########################################
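        # A minimal sketch of that idea (not wired in; the threshold of 3 sibling
        # paragraphs is an assumption). Real descriptions tend to span several <p>
        # siblings, while false positives often keep all their text in one element:
        #
        #   def looks_like_description(candidate):
        #       p_count = int(float(candidate.select("count(.//p)").extract()[0]))
        #       return p_count >= 3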

        # first search for the description id they usually use;
        # the second one is used more rarely and with some false positives, so it needs to be checked for text length as well
        # try to find div with detailedPageDescriptionCopyBlock id; move on only if not found
        description_holder = hxs.select(
            "//div[@id='detailedPageDescriptionCopyBlock']")

        # flag to tell if we found it with basic rule
        found = True

        if not description_holder:
            found = False
            description_holder = hxs.select(
                "//div[@class='CustomPOV ReminderBubbleSeeAll']//p/text()[string-length() > "
                + str(DESC_LEN) + "]/parent::*/parent::*")

        # if none was found, try to find an element with much text (> DESC_LEN (200) characters)
        # this is likely a paragraph of the description; look for its parent (containing the entire description)
        if not description_holder:
            #description_holder = hxs.select("//*[not(self::script or self::style)]/text()[string-length() > " + str(DESC_LEN) + "]/parent::*/parent::*")
            #TODO: !!does this mean string length for one paragraph is > DESC_LEN, or string length of the entire text content?
            # I think it means the entire text content. We're ok
            description_holder = hxs.select("//p/text()[string-length() > " +
                                            str(DESC_LEN) +
                                            "]/parent::*/parent::*")

        # select element among these with most text
        if description_holder:
            desc_winner = description_holder[0]
            max_text = 0
            for desc_candidate in description_holder:
                # consider only text that is under a <p> tag and that has more than DESC_PAR_LEN (30) characters - then it's likely a description paragraph
                description_texts = desc_candidate.select(
                    ".//p//text()[string-length()>" + str(DESC_PAR_LEN) +
                    "]").extract()
                text_len = len(" ".join(description_texts))
                if text_len > max_text:
                    max_text = text_len
                    desc_winner = desc_candidate
                # if text length is the same, assume one of them is parent of the other
                #  and select the one with greater depth (fewer children)
                elif text_len == max_text and text_len != 0:
                    children_old = float(
                        desc_winner.select("count(*)").extract()[0])
                    children_new = float(
                        desc_candidate.select("count(*)").extract()[0])
                    if children_new < children_old:
                        desc_winner = desc_candidate

            description_holder = desc_winner

        # try to find description title in <b> tag in the holder;
        # if it's not found, try to find it in the first <p> of the description
        # if found there, exclude it from the description body
        if description_holder:
            #TODO:
            # try this instead: ".//p//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            # to fix Money Center problem. but maybe it's not always inside p?
            description_title = description_holder.select(
                ".//b/text() | .//h1/text() | .//h3/text() | .//strong/text() "
            ).extract()
            if description_title:
                # this will implicitly get the first occurrence of either a <b> element or an <h1> element,
                # which is likely to be the title (the title usually comes first)
                item['description_title'] = description_title[0].strip()

            description_texts = description_holder.select(
                "./div[position()<2]//p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)] \
                | ./p//text()[not(ancestor::b) and not(ancestor::h1) and not(ancestor::strong)]"
            ).extract()

            # if the list is not empty and contains at least one non-whitespace item
            if description_texts and any(
                    line.strip() for line in description_texts):
                description_text = " ".join([
                    re.sub("\s+", " ", description_text.strip())
                    for description_text in description_texts
                    if description_text.strip()
                ])
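                # e.g. (illustrative): ["  Free\n shipping ", "  ", "on orders. "]
                # collapses to "Free shipping on orders."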

                # if it's longer than 4096 characters and wasn't found with the main rule, it's probably not a description; it causes problems for the PHP script as well. Ignore it
                if len(description_text) < 4096 or found:

                    # store the whitespace-normalized, joined description text
                    item['description_text'] = description_text

                    # replace line breaks with space
                    item['description_text'] = re.sub("\n+", " ",
                                                      item['description_text'])

            if 'description_text' in item:
                tokenized = Utils.normalize_text(item['description_text'])
                item['description_wc'] = len(tokenized)

                # sometimes here there is no description title because of malformed html
                # if we can find description text but not description title, title is probably malformed - get first text in div instead
                if 'description_title' not in item:
                    desc_texts = description_holder.select(
                        "./text()").extract()
                    desc_texts = [text for text in desc_texts if text.strip()]
                    if desc_texts:
                        item['description_title'] = desc_texts[0].strip()

                if 'description_title' in item:
                    (item['keyword_count'],
                     item['keyword_density']) = Utils.phrases_freq(
                         item['description_title'], item['description_text'])
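                    # an assumption (Utils.phrases_freq is not shown here): this
                    # presumably counts how often the title's phrases occur in the
                    # description text, and their density relative to its length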

            else:
                item['description_wc'] = 0

        else:
            item['description_wc'] = 0

        #
        ##################################################################################

        # Extract product count

        # check if there is a record-count ('wc') field on the page
        wc_field = hxs.select(
            "//div[@class='mrl mod-toggleItemCount']/span/text() |\
            //div[@class='SPRecordCount']/text()").extract()
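        # the two formats handled below look like "123 Results" and
        # "Items 1-20 of 123 total" (numbers are illustrative)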
        if wc_field:
            m1 = re.match("([0-9]+) Results", wc_field[0])
            if m1:
                item['nr_products'] = int(m1.group(1))
            m2 = re.match(
                "\s*Items\s*[0-9\-]+\s*of\s*([0-9]+)\s*total\s*", wc_field[0])
            if m2:
                item['nr_products'] = int(m2.group(1))
            yield item

        else:
            # look for links to subcategory pages in menu
            subcategories_links = hxs.select(
                "//div[contains(@class, 'G1001 LeftNavRM')]/div[contains(@class, 'yuimenuitemlabel browseInOuter')]/a[@class='browseIn']"
            )

            if not subcategories_links:
                # # if we haven't found them, try to find subcategories in menu on the left under a "Shop by Category" header
                #     subcategories_links = hxs.select("//div[@class='MainCopy']/div[@class='Header' and text()='\nShop by Category']/following-sibling::node()//a")

                # if we haven't found them, try to find subcategories in menu on the left - get almost anything
                subcategories_links = hxs.select(
                    "//div[@class='MainCopy']/div[@class='Header' and not(contains(text(),'Related Categories')) \
                    and not(contains(text(),'Special Offers')) and not(contains(text(),'View Top Registry Items')) and not(contains(text(),'Featured Content'))\
                    and not(contains(text(), 'Featured Brands'))]\
                    /following-sibling::node()//a")

            # if we found them, create new category for each and parse it from the beginning

            #TODO
            ########################################
            # Exceptions - doesn't find anything for:
            #   http://photos.walmart.com/walmart/welcome?povid=cat121828-env999999-moduleA072012-lLinkGNAV5_PhotoCenter
            #
            #
            ########################################

            if subcategories_links:

                # new categories are subcategories of current one - calculate and store their level
                parent_item = item
                level = parent_item['level'] - 1

                #print "URL ", response.url, " CALLING PARSEPAGE"
                for subcategory in subcategories_links:

                    # to avoid rescraping categories reached from links in menu and reaching levels of -9,
                    # if level < -3 assume we've been there and skip

                    if level < -3:
                        continue

                    item = CategoryItem()
                    item['url'] = Utils.add_domain(
                        subcategory.select("@href").extract()[0],
                        self.root_url)
                    text = subcategory.select("text()").extract()

                    if text:
                        item['text'] = text[0].strip()
                    else:
                        # usually means it's something other than what we need
                        #TODO: check
                        continue
                        #print "no text for subcategory ", item, response.url

                    # # take care of unicode
                    # item['text'] = item['text'].encode("utf-8", errors=ignore)

                    item['level'] = level

                    item['parent_text'] = parent_item['text']
                    item['parent_url'] = parent_item['url']
                    item['parent_catid'] = parent_item['catid']

                    if 'parent_text' in parent_item:
                        item['grandparent_text'] = parent_item['parent_text']
                    if 'parent_url' in parent_item:
                        item['grandparent_url'] = parent_item['parent_url']

                    # if parent's parents are missing, level must be at least 0
                    if 'parent_text' not in parent_item or 'parent_url' not in parent_item:
                        assert level >= 0

                    # send subcategory items to be parsed again
                    # if not already crawled
                    if (item['url'], item['parent_url'],
                            response.meta['department_url']
                        ) not in self.crawled:
                        yield Request(item['url'], callback = self.parseCategory, meta = {'item' : item, \
                            'department_text' : response.meta['department_text'], 'department_url' : response.meta['department_url'], 'department_id' : response.meta['department_id']})
                        self.crawled.append((item['url'], item['parent_url'],
                                             response.meta['department_url']))

                # return current item
                # idea for sending the parent and collecting nr_products: send all of these subcats as a list in meta, pass it along, and yield the parent when the list becomes empty
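                # a rough sketch of that idea (not implemented; 'pending' is a
                # hypothetical meta key):
                #   meta['pending'] = [urls of all subcategories]
                #   each child callback would remove its url from 'pending', add its
                #   nr_products to the parent, and yield the parent once the list empties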
                yield parent_item
                #yield Request(item['url'], callback = self.parsePage, meta = {'item' : item, 'parent_item' : parent_item})

            # if we can't find either products on the page or subcategory links
            else:
                #print "URL", response.url, " NO SUBCATs"
                #item['nr_products'] = 0
                yield item
Example #14
0
    def parseCategory(self, response):

        #TODO: add extraction of additional category info
        sel = Selector(response)

        #TODO: a lot of redirects. maybe for item, set 'url' to the one to which it was redirected? (response.url)
        item = response.meta['item']

        # Description extraction needs to be done first because it can be found in regular /c/ pages that are first passed to this method.
        # For other info (item count, subcategories), the spider will redirect to a different page if necessary (where the description won't be available)
        # extract description
        description_texts = sel.xpath(
            "//div[@class='subpart']/p//text()").extract()

        # second try at finding descriptions
        if not description_texts:
            description_texts = sel.xpath(
                "//div[@id='SEO_TEXT']//text()").extract()

        # replace all whitespace with one space, strip, and remove empty texts; then join them
        if description_texts:
            item['description_text'] = " ".join([
                re.sub("\s+", " ", description_text.strip())
                for description_text in description_texts
                if description_text.strip()
            ])

            tokenized = Utils.normalize_text(item['description_text'])
            item['description_wc'] = len(tokenized)

        else:
            item['description_wc'] = 0

        # try to extract item count; if alternative extraction needs to be done,
        # this item's parsing will be redirected through a different method and returned here

        # extract item count
        nr_products_node = sel.xpath("//ul[@class='results']//strong/text()")
        if nr_products_node:
            # nr of products is in the second of these nodes
            nr_products = nr_products_node.extract()[1].strip()
            item['nr_products'] = int(nr_products)

        # alternative item count: try on same page, but with /sb/ instead of /c/ in url
        if not nr_products_node:
            m = re.match("http://www\.target\.com/c/(.*)", response.url)
            if m:
                new_url = "http://www.target.com/sb/" + m.group(1)

                # retry to this same method but with new url
                #TODO: will miss descriptions. leave it to the end of the method then. but I want subcats from that one too?
                #OR extract it in secondary method and send it back to original url
                yield Request(new_url,
                              callback=self.parseCategory,
                              meta={'item': item})

            else:
                if "/sb/" not in new_url:
                    print "DOES NOT MATCH", response.url

        # alternative item count extraction 2 (dynamically generated content)
        if not nr_products_node:

            # extract dynamically loaded data by making an additional request (the one the page makes to load the data)
            # extract url and parameters from form data
            form = sel.xpath("//form[@name='dynamicAjaxFrm1']")
            if form:
                form_action = form.xpath("@action").extract()[0]
                form_inputs = form.xpath("input")
                # build string of parameters from input names and values
                param_dict = {
                    form_input.xpath("@name").extract()[0]:
                    form_input.xpath("@value").extract()[0]
                    for form_input in form_inputs
                }
                param_string = urllib.urlencode(param_dict)
                # build url to make request to
                new_url = "http://www.target.com" + form_action + "&" + param_string

                # if this url was found, redirect request to new method to extract item count as well, that method will yield the item
                # only redirect to this method if we weren't already redirected from it - to avoid redirect loop
                if 'redirected' not in response.meta or not response.meta[
                        'redirected']:
                    yield Request(new_url,
                                  callback=self.parseCategoryDyncontent,
                                  meta={'item': item})
                    return

        #TODO: add description title as category name if no title available?
        # then also add the keyword/density count

        yield item

        if 'parent_url' in item:
            self.crawled_urls.append((item['url'], item['parent_url']))

        # extract subcategories (if we haven't reached level barrier)
        if item['level'] <= self.LEVEL_BARRIER:
            return

        parent_item = item

        # "shop categories" menu
        #subcategories = sel.xpath("//h3[text() = 'shop categories']/following-sibling::ul/li/a")
        #TODO: replace the not startswith with != ?
        subcategories_menu = sel.xpath(
            "//h3[starts-with(text(), 'shop ') and not(starts-with(text(), 'shop by')) \
            and not(starts-with(text(), 'shop for')) and not(starts-with(text(), 'shop favorite')) and not(contains(text(), ' size'))]"
        )
        subcategories = subcategories_menu.xpath("following-sibling::ul/li/a")

        for subcategory in subcategories:
            subcategory_item = CategoryItem()

            subcategory_item['text'] = subcategory.xpath(
                "text()").extract()[0].strip()
            subcategory_item['url'] = self.build_url(
                subcategory.xpath("@href").extract()[0])

            # filter duplicates
            if (subcategory_item['url'],
                    parent_item['url']) in self.crawled_urls:
                # print subcategory_item['url']
                # print parent_item['url']
                continue

            # assign next available category id
            self.catid += 1
            subcategory_item['catid'] = self.catid

            subcategory_item['level'] = parent_item['level'] - 1

            subcategory_item['parent_url'] = parent_item['url']
            subcategory_item['parent_text'] = parent_item['text']
            subcategory_item['parent_catid'] = parent_item['catid']

            subcategory_item['department_text'] = parent_item[
                'department_text']
            subcategory_item['department_url'] = parent_item['department_url']
            subcategory_item['department_id'] = parent_item['department_id']

            # send this subcategory to be further parsed
            yield Request(subcategory_item['url'],
                          callback=self.parseCategory,
                          meta={'item': subcategory_item})
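
    # NOTE: parseCategoryDyncontent is referenced above but not included in this
    # snippet. A minimal sketch of what such a callback might look like, assuming
    # the AJAX response embeds the record count in the same "//ul[@class='results']"
    # markup as the static page (both the XPath and the response layout are
    # assumptions, not the original implementation):
    def parseCategoryDyncontent(self, response):
        sel = Selector(response)
        item = response.meta['item']

        # hypothetical extraction; the real response layout is unknown
        nr_products_node = sel.xpath("//ul[@class='results']//strong/text()").extract()
        if nr_products_node:
            item['nr_products'] = int(nr_products_node[-1].strip())

        yield item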