Example #1
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//div[@class='shortDescription']/a")
        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            item['product_name'] = result.select("text()").extract()[0]
            item['product_url'] = result.select("@href").extract()[0]

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
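
Every snippet in this collection populates a SearchItem, whose definition is never shown. A minimal sketch covering the fields the price-matching snippets use might look as follows (the field list is inferred from the snippets, not taken from the original project; other examples clearly come from different projects with different fields):

# Hypothetical sketch of SearchItem; fields inferred from the snippets.
import scrapy

class SearchItem(scrapy.Item):
    product_name = scrapy.Field()
    product_url = scrapy.Field()
    product_model = scrapy.Field()
    product_brand = scrapy.Field()
    product_target_price = scrapy.Field()
    product_images = scrapy.Field()
    product_videos = scrapy.Field()
    origin_url = scrapy.Field()
    origin_name = scrapy.Field()
    origin_model = scrapy.Field()
    origin_upc = scrapy.Field()
    origin_brand = scrapy.Field()
    origin_id = scrapy.Field()

Several snippets also store these items in a set via items.add(item); a plain scrapy.Item is not hashable, so the real class presumably defines __hash__ and __eq__ as well.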
Example #2
 def parse(self, response):
     sel = Selector(response)
     items = []
     for i in range((self.pages - 1) * 10 + 1, self.pages * 10 + 1):
         item = SearchItem()
         url = sel.xpath('//div[@id="%d"]/h3/a/@href' % (i)).extract()
         title = sel.xpath('//div[@id="%d"]/h3/a//text()' % (i)).extract()
         desc = sel.xpath(
             '//div[@id="%d"]//div[@class="c-abstract"]//text()' %
             (i)).extract()
         item['num'] = str(i)
         item['title'] = title if title else '未爬取到标题'  # "title not crawled"
         item['url'] = url if url else 'www.example.com'
         item['desc'] = desc if desc else '未爬取到内容'  # "content not crawled"
         print str(i) + ' ' + ''.join(item['title']) + '\n'
         yield item
     next_page = sel.xpath(
         '//div[@id="page"]/a[@class="n"]/@href').extract()
     self.pages += 1
     # middle result pages have two "n" links (prev and next): follow the second
     if len(next_page) == 2:
         yield Request(self.domain + next_page[1])
     # the first results page has a single "next" link (rsv_page=1): follow it
     elif ((len(next_page) == 1)
           and (re.match(r'.*rsv_page=1$', next_page[0]))):
         yield Request(self.domain + next_page[0])
     # otherwise only a "prev" link remains, so this is the last page
     else:
         print "Congratulations! All results have been crawled!"
         raise CloseSpider('Happy Ending')
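
This parse method depends on spider state (pages, domain) and imports that the snippet does not show. A sketch of the surrounding class, with illustrative names and values:

# Sketch of the spider state Example #2 assumes; the class name, domain and
# counter value are illustrative placeholders, not from the original code.
import re
import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider
from scrapy.selector import Selector

class BaiduSearchSpider(scrapy.Spider):
    name = 'baidu_search'             # hypothetical
    domain = 'https://www.baidu.com'  # prefix for the relative next-page links
    pages = 1                         # current results page, advanced in parse()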
Example #3
    def parse(self, response):
        self.count += 1
        print("\nPage %d, URL: %s" % (self.count, response.url))

        # Check whether the content of this page contains the keyword.
        ls = response.xpath('//text()').extract()
        mark = 0
        for s in ls:
            if 'кита' in s.lower():
                mark = 1
                break

        # If mark == 1, the keyword was found on this page.
        if mark == 1:
            item = SearchItem()
            item['site'] = response.url
            yield item

        # Only keep following links while fewer than 3 pages have been crawled (depth limit).
        if self.count < 3:
            next_pages = response.xpath('//a/@href').extract()
            if len(next_pages) > 0:
                for link in next_pages:
                    # Links starting with http are absolute (external) links: use them as-is.
                    if re.match('http', link):
                        yield scrapy.Request(link, callback=self.parse)
                    # Other links are site-relative: prepend the domain.
                    else:
                        yield scrapy.Request('https://yandex.ru' + link,
                                             callback=self.parse)
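
The mark flag loop above can be written more compactly with any(). A small equivalent helper (the name page_contains is ours, not from the original spider):

# Equivalent keyword check using any(); the helper name is hypothetical.
def page_contains(response, keyword):
    """Return True if any text node on the page contains `keyword` (case-insensitively)."""
    return any(keyword in s.lower() for s in response.xpath('//text()').extract())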
Example #4
    def extract_result_products(self, response):

        hxs = HtmlXPathSelector(response)

        results = hxs.select("//div[@class='innerWrapper']")
        items = []

        for result in results:

            item = SearchItem()
            product_name = result.select(
                ".//div[@class='shortDescription']/a/text()").extract()
            product_url = result.select(
                ".//div[@class='shortDescription']/a/@href").extract()

            # skip this result if the product name or URL is missing
            if product_name and product_url:
                item['product_url'] = "http://www1.macys.com" + product_url[0]
                item['product_name'] = product_name[0].strip()
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # extract price
            #! extracting regular price and not discount price when discounts available?
            price_holder = result.select(
                "div[@class='prices']/span/text()").extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                # match optional leading text, then a currency symbol (£ or $), then the amount
                m = re.match("([a-zA-Z\.\s]+)?(\xa3|\$)([0-9]+\.?[0-9]*)",
                             product_target_price)
                if m:
                    price = float(m.group(3))
                    currency = m.group(2)
                    item['product_target_price'] = Utils.convert_to_dollars(
                        price, currency)
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.DEBUG)

            # extract product brand (not implemented in this snippet)

            items.append(item)

        return items
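
Utils.convert_to_dollars is used here and in several later snippets, but its implementation is not included. A plausible sketch under the assumption of a fixed conversion table (the rate below is illustrative only):

# Hypothetical sketch of Utils.convert_to_dollars; a real implementation
# might query a live exchange-rate service instead of a fixed table.
FIXED_RATES = {u'$': 1.0, u'\xa3': 1.55}  # illustrative GBP->USD rate

def convert_to_dollars(price, currency):
    """Convert `price` (a float) in `currency` (a symbol) to US dollars."""
    return round(price * FIXED_RATES.get(currency, 1.0), 2)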
Example #5
 def start_requests(self):
     while self.kw:
         code = self.kw.pop()
         code = code.strip('\n')
         item = SearchItem()
         item['keyword'] = code
         sic_url = 'http://www.alexa.com/siteinfo/%s' % code
         yield Request(url=sic_url,
                       meta={"item": item['keyword']},
                       callback=self.parse0)
Example #6
    def parse_product_sony(self, response):
        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        # create item
        item = SearchItem()
        item['product_url'] = response.url
        item['origin_url'] = origin_url
        # hardcode brand to sony
        item['product_brand'] = 'sony'

        # extract product name, brand, model, etc; add to items
        product_name = hxs.select("//h2[@class='ws-product-title fn']//text()")
        if not product_name:
            self.log("Error: No product name: " + str(response.url),
                     level=log.INFO)
        else:
            item['product_name'] = product_name.extract()[0]
        product_model = hxs.select(
            "//span[@class='ws-product-item-number-value item-number']/text()")
        if product_model:
            item['product_model'] = product_model.extract()[0]

        item['product_images'] = len(
            hxs.select(
                "//a[@class='ws-alternate-views-list-link']/img").extract())
        item['product_videos'] = len(
            hxs.select("//li[@class='ws-video']//img").extract())

        items.add(item)

        # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
        product_urls = response.meta['search_results']

        if product_urls:
            request = Request(product_urls.pop(),
                              callback=self.parse_product_sony,
                              meta=response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            return request
        else:
            # otherwise, we are done, send the response back to reduceResults (no need to make a new request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
Example #7
    def parse_product_maplin(self, response):

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']
        if 'origin_upc' in response.meta:
            item['origin_upc'] = response.meta['origin_upc']
        if 'origin_brand' in response.meta:
            item['origin_brand'] = response.meta['origin_brand']


        product_name_node = hxs.select("//h1[@itemprop='name']/text()").extract()
        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
            # TODO:is this ok? I think so
            return

        item['product_name'] = product_name

        # extract product model number
        # TODO: no model?
        # TODO: no upc?
        # TODO: no brand?
        # TODO: add code extraction
        
        # extract price
        price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
        # if we can't find it like above try other things:
        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",","",product_target_price)
            try:
                product_target_price = float(product_target_price)

                # convert to dollars (assume pounds)
                product_target_price = Utils.convert_to_dollars(product_target_price, u'\xa3')
                item['product_target_price'] = product_target_price
            except Exception, ex:
                self.log("Couldn't convert product price: " + response.url + "\n", level=log.WARNING)
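
Both parse_product_* methods above hand their response back to reduceResults, which is never shown in this collection. Judging from how pending_requests, items and parsed are threaded through response.meta, a plausible reconstruction is:

    # Hypothetical reconstruction of reduceResults, inferred from how the
    # parse methods use response.meta; not the original implementation.
    def reduceResults(self, response):
        items = response.meta.get('items', set())
        pending_requests = response.meta.get('pending_requests', [])
        if pending_requests:
            # send the next queued search request, carrying the state along
            request = pending_requests.pop(0)
            request.meta['pending_requests'] = pending_requests
            request.meta['items'] = items
            for key in ('origin_url', 'origin_name', 'origin_model'):
                if key in response.meta:
                    request.meta[key] = response.meta[key]
            return request
        # no queued searches left: emit the collected candidate items
        return items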
Example #8
    def extract_result_products(self, response):

        hxs = HtmlXPathSelector(response)

        items = []
        results = hxs.select(
            "//div[@class='list-item-info']/div[@class='sku-title']/h4/a")

        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            product_name_holder = result.select("text()").extract()
            if product_name_holder:
                item['product_name'] = product_name_holder[0].strip()
            else:
                self.log("Error: No product name: " + str(response.url) +
                         " from product: " + str(response.meta.get('origin_url')),
                         level=log.ERROR)

            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_name' in response.meta:
                item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            model_holder = result.select(
                "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            price_holder = result.select(
                "../../../../div[@class='list-item-price']//div[@class='price-block']//div[@class='medium-item-price']/text()[normalize-space()]"
            ).extract()
            if price_holder:
                price = price_holder[0].strip()
                price = re.sub(",", "", price)
                price = float(price)
                item['product_target_price'] = price

            items.append(item)

        return items
Example #9
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']"
        )
        for result in results:
            item = SearchItem()
            #item['origin_site'] = site

            #TODO: usually the manufacturer is in bold, so maybe use that
            product_name = " ".join(result.select(".//text()").extract())
            # append text that is in <span> if any
            span_text = result.select("./span/text()")

            #TODO: use span text differently, as it is more important/relevant (bold) ?
            for text in span_text:
                product_name += " " + text.extract()
            item['product_name'] = product_name
            rel_url = result.select("@href").extract()[0]

            root_url = "http://www.walmart.com"
            item['product_url'] = Utils.add_domain(rel_url, root_url)

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
                assert self.by_id
            else:
                assert not self.by_id

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
Example #10
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//div[@class='hproduct']/div[@class='info-main']/h3/a")

        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            item['product_name'] = result.select("text()").extract()[0].strip()
            item['product_url'] = Utils.clean_url(
                Utils.add_domain(
                    result.select("@href").extract()[0],
                    "http://www.bestbuy.com"))

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
            # 	assert self.by_id
            # else:
            # 	assert not self.by_id

            model_holder = result.select(
                "parent::node()/parent::node()//strong[@itemprop='model']/text()"
            ).extract()
            if model_holder:
                item['product_model'] = model_holder[0]

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
Example #11
    def parse0(self, response):
        item = SearchItem()
        selector = Selector(response)
        text0 = selector.xpath(
            '//div/strong[@class="metrics-data align-vmiddle"]/text()'
        ).extract()[1].strip()
        text1 = selector.xpath(
            '//span[@class="font-4 box1-r"]/text()').extract()[0]
        item['result'] = text0
        item['keyword'] = response.meta['item']
        item['link'] = text1

        yield item


# class Spider(CrawlSpider):
#     name="search"
#     with open(r'input', 'r') as r:
#         f=r.readlines()
#     kw=set(f)
#     finished=set()
#     def start_requests(self):
#         while self.kw.__len__():
#             code=self.kw.pop()
#             self.finished.add(code)
#             code=code.strip('\n')
#             item=SearchItem()
#             item['keyword']=code
#             sic_url='https://www.google.com/search?q=%s'%code
#             yield Request(url=sic_url,meta={"item":item['keyword']},callback=self.parse0)
#
#     def parse0(self,response):
#         item = SearchItem()
#         selector = Selector(response)
#         text0 = selector.xpath('//div[@id="resultStats"]/text()').extract()
#         text0 = ''.join(text0)
#         num = re.findall(r'\d+', text0)
#         item['result'] = ''.join(num)
#         item['keyword'] = response.meta['item']
#         if item['keyword'] not in self.finished:
#             self.kw.add(item['keyword'])
#
#
#         yield item
Example #12
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//li[@class='productbox']")

        for result in results:
            product_link = result.select(".//a[@class='toplink']")
            item = SearchItem()
            #item['origin_site'] = site
            #TODO: site changed structure?
            item['product_url'] = product_link.select("@href").extract()[0]
            item['product_name'] = product_link.select(
                "div[@class='prodname']/text()").extract()[0]
            #TODO: add brand?
            #item['brand'] = result.select("div[@class='prodname']/div[@class='prodbrandname emphasis']/text()").extract()[0]

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
                assert self.by_id
            else:
                assert not self.by_id

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
Example #13
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        # toysrus
        results = hxs.select("//a[@class='prodtitle']")

        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            item['product_name'] = result.select("text()").extract()[0]
            root_url = "http://www.toysrus.com"
            item['product_url'] = root_url + result.select(
                "@href").extract()[0]

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
                assert self.by_id
            else:
                assert not self.by_id

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
Example #14
    def parseResults(self, response):

        hxs = HtmlXPathSelector(response)

        #site = response.meta['origin_site']
        origin_name = response.meta['origin_name']
        origin_model = response.meta['origin_model']

        # if this comes from a previous request, get the items collected so far and add the new results to them

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//li[@class='product']/div[@class='product-content']/a[@class='pro-thumb']"
        )
        for result in results:
            item = SearchItem()
            #item['origin_site'] = site
            item['product_name'] = result.select(
                "span[@class='pro-name']/text()").extract()[0]
            item['product_url'] = result.select("@href").extract()[0]

            if 'origin_url' in response.meta:
                item['origin_url'] = response.meta['origin_url']

            if 'origin_id' in response.meta:
                item['origin_id'] = response.meta['origin_id']
                assert self.by_id
            else:
                assert not self.by_id

            items.add(item)

        response.meta['items'] = items
        response.meta['parsed'] = items
        return self.reduceResults(response)
Example #15
    def parse_item(self, response):
        item = self.config.get('item')
        if item:
            # cls = eval(item.get('class'))()
            # loader = eval(item.get('loader'))(cls, response=response)
            loader = SearchLoader(item=SearchItem(), response=response)
            loader.add_value('tenantId', self.tenantId)
            loader.add_value('indexName', self.indexName)
            loader.add_value('dataAnnotation', self.dataAnnotation)
            # format as "2016-03-20 11:45:39"
            # loader.add_value('createDate', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
            loader.add_value(
                'createDate',
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

            # dynamically apply the attribute extraction config
            for key, value in item.get(
                    'attrs').items():  # attrs maps field name -> list of extractors
                for extractor in value:  # each extractor is a dict describing one method
                    if extractor.get('method') == 'xpath':
                        args = extractor.get('args')  # list of xpath expressions
                        if key == 'title':
                            args = ['normalize-space(' + self.title + ')']
                        elif key == 'content':
                            args = [self.content]
                        loader.add_xpath(key, *args,
                                         **{'re': extractor.get('re')})
                    if extractor.get('method') == 'css':
                        loader.add_css(key, *extractor.get('args'),
                                       **{'re': extractor.get('re')})
                    if extractor.get('method') == 'value':
                        loader.add_value(key, *extractor.get('args'),
                                         **{'re': extractor.get('re')})
                    if extractor.get('method') == 'attr':
                        loader.add_value(
                            key, getattr(response, *extractor.get('args')))
            yield loader.load_item()
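
The attrs structure this loop consumes comes from an external config that the snippet does not include. A hypothetical example of the shape it expects (field names and expressions are illustrative):

# Hypothetical config shape for parse_item above; keys and expressions
# are illustrative, not from the original project.
config = {
    'item': {
        'attrs': {
            'title':   [{'method': 'xpath', 'args': ['//h1/text()'], 're': None}],
            'content': [{'method': 'css',   'args': ['div.body ::text'], 're': None}],
            'url':     [{'method': 'attr',  'args': ['url'], 're': None}],
        }
    }
}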
Example #16
    def parse_product_ebay(self, response):
        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url

        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        # extract product name
        product_name = hxs.select("//h1[@id='itemTitle']/text()").extract()
        if not product_name:
            self.log("Error: No product name: " + str(response.url),
                     level=log.INFO)

        else:

            item['product_name'] = product_name[0]

            # extract product brand
            product_brand_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'Brand')]" + \
             "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()
            if product_brand_holder:
                item['product_brand'] = product_brand_holder[0]

            # extract product model
            product_model_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'Model')]" + \
             "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()
            if not product_model_holder:
                product_model_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'MPN')]" + \
                "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()

            if product_model_holder:
                item['product_model'] = product_model_holder[0]

            # add result to items
            items.add(item)

        # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
        product_urls = response.meta['search_results']

        if product_urls:
            request = Request(product_urls.pop(),
                              callback=self.parse_product_ebay,
                              meta=response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            return request
        else:
            # otherwise, we are done, send the response back to reduceResults (no need to make a new request)
            # add as meta newly added items
            # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
            # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
Example #17
    def parseURL(self, response):

        site = response.meta['origin_site']
        hxs = HtmlXPathSelector(response)

        product_model = ""

        product_brand = ""
        product_price = ""

        #############################################################
        # Extract product attributes (differently depending on site)

        if site == 'staples':

            product_name = hxs.select("//h1/text()").extract()[0]

            model_nodes = hxs.select(
                "//p[@class='itemModel']/text()").extract()
            if model_nodes:
                model_node = model_nodes[0]

                model_node = re.sub(r"\W", " ", model_node, flags=re.UNICODE)
                m = re.match("(.*)Model:(.*)", model_node.encode("utf-8"),
                             re.UNICODE)

                if m:
                    product_model = m.group(2).strip()

        elif site == 'walmart':
            product_name_holder = hxs.select(
                "//h1[@class='productTitle']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()

                # get integer part of product price
                product_price_big = hxs.select(
                    "//span[@class='bigPriceText1']/text()").extract()

                if not product_price_big:
                    self.log("Didn't find product price: " + response.url +
                             "\n",
                             level=log.DEBUG)
                # if there is a range of prices take their average
                if len(product_price_big) > 1:

                    # remove $ and .
                    product_price_min = re.sub("[\$\.,]", "",
                                               product_price_big[0])
                    product_price_max = re.sub("[\$\.,]", "",
                                               product_price_big[-1])

                    #TODO: check if they're ints?
                    product_price_big = (int(product_price_min) +
                                         int(product_price_max)) / 2.0

                elif product_price_big:
                    product_price_big = int(
                        re.sub("[\$\.,]", "", product_price_big[0]))

                # get fractional part of price
                #TODO - not that important

                if product_price_big:
                    product_price = product_price_big

            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                # return the item as a non-matched item
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                # remove unnecessary parameters
                m = re.match("(.*)\?enlargedSearch.*", item['origin_url'])
                if m:
                    item['origin_url'] = m.group(1)
                #item['origin_id'] = self.extract_walmart_id(item['origin_url'])
                if self.name != 'manufacturer':
                    # don't return empty matches in manufacturer spider
                    yield item
                return

            #TODO: if it contains 2 words, first could be brand - also add it in similar_names function
            product_model_holder = hxs.select(
                "//td[contains(text(),'Model')]/following-sibling::*/text()"
            ).extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        #TODO: for the sites below, complete with missing logic, for not returning empty elements in manufacturer spider
        elif site == 'newegg':
            product_name_holder = hxs.select(
                "//span[@itemprop='name']/text()").extract()
            if product_name_holder:
                product_name = product_name_holder[0].strip()
            else:
                sys.stderr.write(
                    "Broken product page link (can't find item title): " +
                    response.url + "\n")
                item = SearchItem()
                #item['origin_site'] = site
                item['origin_url'] = response.url
                yield item
                return
            product_model_holder = hxs.select(
                "//dt[text()='Model']/following-sibling::*/text()").extract()
            if product_model_holder:
                product_model = product_model_holder[0]

        else:
            raise CloseSpider("Unsupported site: " + site)

        if site == 'staples':
            zipcode = "12345"
            cookies = {"zipcode": zipcode}
        else:
            cookies = {}

        #######################################################################
        # Create search queries to the second site, based on product attributes

        request = None

        #TODO: search by alternative model numbers?

        #TODO: search by model number extracted from product name? Don't I do that implicitly? no, but in combinations

        # if there is no product model, try to extract it
        if not product_model:
            product_model = ProcessText.extract_model_from_name(product_name)

            # for logging purposes, set this back to the empty string if it wasn't found (so was None)
            if not product_model:
                product_model = ""

            # product_model_index = ProcessText.extract_model_nr_index(product_name)
            # if product_model_index >= 0:
            # 	product_model = product_name[product_model_index]

            ## print "MODEL EXTRACTED: ", product_model, " FROM NAME ", product_name

        # if there is no product brand, get first word in name, assume it's the brand
        product_brand_extracted = ""
        #product_name_tokenized = ProcessText.normalize(product_name)
        product_name_tokenized = [
            word.lower() for word in product_name.split(" ")
        ]
        #TODO: maybe extract brand as word after 'by', if 'by' is somewhere in the product name
        if len(product_name_tokenized) > 0 and re.match(
                "[a-z]+", product_name_tokenized[0]):
            product_brand_extracted = product_name_tokenized[0].lower()

        # if we are in manufacturer spider, set target_site to manufacturer site

        # for manufacturer spider set target_site of request to brand extracted from name for this particular product
        if self.name == 'manufacturer':

            #TODO: restore commented code; if brand not found, try to search for it on every manufacturer site (build queries fo every supported site)
            # hardcode target site to sony
            #self.target_site = 'sony'
            #self.target_site = product_brand_extracted

            #target_site = product_brand_extracted

            # can only go on if site is supported
            # (use dummy query)
            #if target_site not in self.build_search_pages("").keys():
            if product_brand_extracted not in self.build_search_pages(
                    "").keys():

                product_brands_extracted = set(
                    self.build_search_pages("").keys()).intersection(
                        set(product_name_tokenized))

                if product_brands_extracted:
                    product_brand_extracted = product_brands_extracted.pop()
                    #target_site = product_brand_extracted
                else:
                    # give up and return item without match
                    self.log(
                        "Manufacturer site not supported (" +
                        product_brand_extracted +
                        ") or not able to extract brand from product name (" +
                        product_name + ")\n",
                        level=log.ERROR)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

            # if specific site is not set, search on manufacturer site as extracted from name
            if not self.manufacturer_site:
                target_site = product_brand_extracted
            else:
                # if it's set, continue only if it matches extracted brand
                if self.manufacturer_site != product_brand_extracted:
                    self.log(
                        "Will abort matching for product, extracted brand does not match specified manufacturer option ("
                        + product_brand_extracted + ")\n",
                        level=log.INFO)

                    ## comment lines below to: don't return anything if you can't search on manufacturer site
                    # item = SearchItem()
                    # item['origin_url'] = response.url
                    # item['origin_name'] = product_name
                    # if product_model:
                    # 	item['origin_model'] = product_model
                    # yield item
                    return

                else:
                    target_site = product_brand_extracted

                    # # try to match it without specific site (manufacturer spider will try to search on all manufacturer sites)
                    # target_site = None

        # for other (site specific) spiders, set target_site of request to class variable self.target_site set in class "constructor" (init_sub)
        else:
            target_site = self.target_site

        # 1) Search by model number
        if product_model:

            #TODO: model was extracted with ProcessText.extract_model_from_name(), without lowercasing, should I lowercase before adding it to query?
            query1 = self.build_search_query(product_model)
            search_pages1 = self.build_search_pages(query1)
            #page1 = search_pages1[self.target_site]
            page1 = search_pages1[target_site]

            request1 = Request(page1, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request1.cookies = self.amazon_cookies
                request1.headers['Cookies'] = self.amazon_cookie_header
                #request1.meta['dont_merge_cookies'] = True
                ## print "SET AMAZON COOKIES"

            request1.meta['query'] = query1
            request1.meta['target_site'] = target_site

            request = request1

        # 2) Search by product full name
        query2 = self.build_search_query(product_name)
        search_pages2 = self.build_search_pages(query2)
        #page2 = search_pages2[self.target_site]
        page2 = search_pages2[target_site]
        request2 = Request(page2, callback=self.parseResults)

        # set cookies for amazon
        if (self.target_site == 'amazon' and self.cookies_file):
            request2.cookies = self.amazon_cookies
            request2.headers['Cookies'] = self.amazon_cookie_header
            #request2.meta['dont_merge_cookies'] = True

        request2.meta['query'] = query2
        request2.meta['target_site'] = target_site

        pending_requests = []

        if not request:
            request = request2
        else:
            pending_requests.append(request2)

        # 3) Search by combinations of words in product's name
        # create queries

        for words in ProcessText.words_combinations(product_name,
                                                    fast=self.fast):
            query3 = self.build_search_query(" ".join(words))
            search_pages3 = self.build_search_pages(query3)
            #page3 = search_pages3[self.target_site]
            page3 = search_pages3[target_site]
            request3 = Request(page3, callback=self.parseResults)

            # set amazon cookies
            if (self.target_site == 'amazon' and self.cookies_file):
                request3.cookies = self.amazon_cookies
                request3.headers['Cookies'] = self.amazon_cookie_header
                #request3.meta['dont_merge_cookies'] = True

            request3.meta['query'] = query3
            request3.meta['target_site'] = target_site

            pending_requests.append(request3)

        request.meta['pending_requests'] = pending_requests
        #request.meta['origin_site'] =
        # product page from source site
        #TODO: clean this URL? for walmart it added something with ?enlargedsearch=True
        request.meta['origin_url'] = response.url

        request.meta['origin_name'] = product_name
        request.meta['origin_model'] = product_model
        if product_price:
            request.meta['origin_price'] = product_price

        # origin product brand as extracted from name (basically the first word in the name)
        request.meta['origin_brand_extracted'] = product_brand_extracted

        # if self.by_id:
        # 	request.meta['origin_id'] = self.extract_walmart_id(response.url)

        #self.target_site = product_brand_extracted
        #TODO: should this be here??
        target_site = product_brand_extracted

        # print "SENDING REQUEST FOR ", product_name, response.url

        yield request
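
build_search_query and build_search_pages drive all three query strategies above but are not included in the collection. Plausible sketches (Python 2 urllib; the URL templates are illustrative placeholders, not the original site list):

    # Hypothetical sketches of the query helpers used by parseURL.
    def build_search_query(self, text):
        # URL-encode the search text (spaces become '+')
        return urllib.quote_plus(text.strip().encode('utf-8'))

    def build_search_pages(self, query):
        # map each supported target site to its search-results URL
        return {
            'amazon': "http://www.amazon.com/s?field-keywords=" + query,
            'bestbuy': "http://www.bestbuy.com/site/searchpage.jsp?st=" + query,
        }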
Example #18
 def parse(self, response):
     data = json.loads(response.text)
     if 1 == data['ok']:
         card_list = data['data']['cards']
         for item in card_list:
             if "card_type_name" in item and "微博" == item["card_type_name"]:  # a weibo post card
                 _data = self.__get_blog(item)
                 data_item = ItemLoader(item=SearchItem(),
                                        response=response)
                 data_item.add_value("user_name", _data['user_name'])
                 data_item.add_value("user_id", _data['user_id'])
                 data_item.add_value("user_verified_reason",
                                     _data['user_verified_reason'])
                 data_item.add_value("user_description",
                                     _data['user_description'])
                 data_item.add_value("user_followers_count",
                                     _data['user_followers_count'])
                 data_item.add_value("user_statuses_count",
                                     _data['user_statuses_count'])
                 data_item.add_value("reposts_count",
                                     _data['reposts_count'])
                 data_item.add_value("comments_count",
                                     _data['comments_count'])
                 data_item.add_value("attitudes_count",
                                     _data['attitudes_count'])
                 data_item.add_value("user_content", _data['user_content'])
                 data_item.add_value("created_at", _data['created_at'])
                 data_item.add_value("source", _data['source'])
                 data_item.add_value("mid", _data['mid'])
                 data_item.add_value("idstr", _data['idstr'])
                 data_item.add_value("user_pics", _data['user_pics'])
                 yield data_item.load_item()
                 # optionally also crawl the replies
                 if self.is_get_reply:
                     reply_first_url = get_reply_url(id_str=_data['idstr'])
                     meta = {
                         "idstr": _data["idstr"],
                         "url": reply_first_url
                     }
                     yield Request(url=reply_first_url,
                                   callback=self.parse_reply,
                                   meta=meta)
             else:
                 if "card_group" in item:
                     for it in item["card_group"]:
                         if "mblog" in it:
                             _data = self.__get_blog(it)
                             data_item = ItemLoader(item=SearchItem(),
                                                    response=response)
                             data_item.add_value("user_name",
                                                 _data['user_name'])
                             data_item.add_value("user_id",
                                                 _data['user_id'])
                             data_item.add_value(
                                 "user_verified_reason",
                                 _data['user_verified_reason'])
                             data_item.add_value("user_description",
                                                 _data['user_description'])
                             data_item.add_value(
                                 "user_followers_count",
                                 _data['user_followers_count'])
                             data_item.add_value(
                                 "user_statuses_count",
                                 _data['user_statuses_count'])
                             data_item.add_value("reposts_count",
                                                 _data['reposts_count'])
                             data_item.add_value("comments_count",
                                                 _data['comments_count'])
                             data_item.add_value("attitudes_count",
                                                 _data['attitudes_count'])
                             data_item.add_value("user_content",
                                                 _data['user_content'])
                             data_item.add_value("created_at",
                                                 _data['created_at'])
                             data_item.add_value("source", _data['source'])
                             data_item.add_value("mid", _data['mid'])
                             data_item.add_value("idstr", _data['idstr'])
                             data_item.add_value("user_pics",
                                                 _data['user_pics'])
                             yield data_item.load_item()
                             # optionally also crawl the replies
                             if self.is_get_reply:
                                 reply_first_url = get_reply_url(
                                     id_str=_data['idstr'])
                                 meta = {
                                     "idstr": _data["idstr"],
                                     "url": reply_first_url
                                 }
                                 yield Request(url=reply_first_url,
                                               callback=self.parse_reply,
                                               meta=meta)
                         else:
                             # print("no_1_data")
                             pass
                 else:
                     # print("no_data")
                     pass
     else:
         print("content interface error, ", "\n", data["msg"])
     # fetch the next results page while under the (hardcoded) limit of 1 page
     if 1 > self.page:
         self.page += 1
         next_url = get_search_url(keyword=self.keyword, page=self.page)
         yield Request(next_url, callback=self.parse)
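
The two ItemLoader blocks in this parse method are identical field for field; a small helper would remove the duplication (a sketch reusing the same field names, not part of the original spider):

 # Sketch of a de-duplication helper for the parse method above.
 BLOG_FIELDS = [
     "user_name", "user_id", "user_verified_reason", "user_description",
     "user_followers_count", "user_statuses_count", "reposts_count",
     "comments_count", "attitudes_count", "user_content", "created_at",
     "source", "mid", "idstr", "user_pics",
 ]

 def _load_blog_item(self, _data, response):
     data_item = ItemLoader(item=SearchItem(), response=response)
     for field in self.BLOG_FIELDS:
         data_item.add_value(field, _data[field])
     return data_item.load_item()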
Example #19
    def parse_product_amazon(self, response):

        # print "PARSE AMAZON PRODUCT FOR", response.meta['origin_url'], response.url

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # if 'origin_id' in response.meta:
        # 	item['origin_id'] = response.meta['origin_id']
        # 	assert self.by_id
        # else:
        # 	assert not self.by_id

        # extract product name
        #TODO: id='title' doesn't work for all, should I use a 'contains' or something?
        # extract titles that are not empty (ignoring whitespace)
        # eliminate "Amazon Prime Free Trial"

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name = filter(
            lambda x: not x.startswith("Amazon Prime"),
            hxs.select("//h1//text()[normalize-space()!='']").extract())
        if not product_name:
            # print "NO PRODUCT NAME FOR", response.url
            self.log("Error: No product name: " + str(response.url) +
                     " for walmart product " + origin_url,
                     level=log.ERROR)

            # assume there is a captcha to crack
            # check if there is a form on the page - that means it's probably the captcha form
            forms = hxs.select("//form")
            if forms:

                # solve captcha
                captcha_text = None
                image = hxs.select(".//img/@src").extract()
                if image:
                    captcha_text = self.CB.solve_captcha(image[0])

                # value to use if there was an exception
                if not captcha_text:
                    captcha_text = ''

                # create a FormRequest to this same URL, with everything needed in meta
                # items, cookies and search_urls not changed from previous response so no need to set them again

                # redo the entire request (no items will be lost)
                return [
                    FormRequest.from_response(
                        response,
                        callback=self.parse_product_amazon,
                        formdata={'field-keywords': captcha_text},
                        meta=response.meta)
                ]

        else:
            item['product_name'] = product_name[0].strip()

            # extract product model number
            model_number_holder = hxs.select(
                "//tr[@class='item-model-number']/td[@class='value']/text() | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text()"
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
             "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            # add result to items
            items.add(item)

        # print "STILL IN parse_product FOR", response.url

        product_urls = response.meta['search_results']

        # try to send request to parse next product, try until url for next product url is valid (response not 404)
        # this is needed because if next product url is not valid, this request will not be sent and all info about this match (stored in request meta) will be lost

        # find first valid next product url
        next_product_url = None
        if product_urls:
            next_product_url = product_urls.pop()
        while (product_urls and not self.is_valid_url(next_product_url)):
            # print "404 FROM", next_product_url
            next_product_url = product_urls.pop()

        # handle corner case of bad next product url
        if not product_urls and next_product_url and not self.is_valid_url(
                next_product_url):
            next_product_url = None

        # if a next product url was found, send new request back to parse_product_url
        if next_product_url:
            request = Request(next_product_url,
                              callback=self.parse_product_amazon,
                              meta=response.meta)
            if self.cookies_file:
                request.cookies = self.amazon_cookies
                request.headers['Cookies'] = self.amazon_cookie_header
                #request.meta['dont_merge_cookies'] = True
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO parse_product FOR", response.meta['origin_url'], response.url, "NEXT IS", next_product_url
            # note: the result of this blocking urlopen call is never used
            respcode = urllib.urlopen(next_product_url)

            return request

        # if no next valid product url was found
        else:
            # we are done, send the response back to reduceResults (no need to make a new request)
            # add as meta newly added items
            # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
            # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            # print "RETURNING FROM PARSE AMAZON PRODUCT TO reduce_results FOR", response.meta['origin_url'], response.url

            return self.reduceResults(response)
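
is_valid_url filters out 404s before chaining to the next product page, but it is not shown. A minimal sketch using a HEAD request with Python 2's urllib2 (our assumption; the original may well work differently):

# Hypothetical sketch of is_valid_url using a HEAD request (Python 2 urllib2).
import urllib2

class HeadRequest(urllib2.Request):
    def get_method(self):
        return "HEAD"

def is_valid_url(url):
    try:
        urllib2.urlopen(HeadRequest(url), timeout=10)
        return True
    except urllib2.URLError:
        return False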
Example #20
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select(
            "//ul[@class='products']//div[@class='product ']//h3//a")
        for result in results:
            item = SearchItem()

            product_url = result.select("@href").extract()[0] if result.select(
                "@href") else None
            product_name = result.select(
                "@title").extract()[0] if result.select("@title") else None

            # assert name is not abbreviated
            # empirically, this only seems to produce false positives, so removed
            # assert '...' not in product_name

            # skip this result if there is no product name or URL
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(product_url,
                                                       self.base_url)

                item['product_name'] = product_name
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
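
ProcessText.extract_model_from_name appears throughout these snippets as a fallback when no explicit model number is found on the page. A rough sketch of the heuristic it plausibly implements (the regex and length threshold are illustrative):

# Rough sketch of ProcessText.extract_model_from_name; the real heuristic is
# not shown. Looks for a token mixing letters and digits, e.g. "KDL40EX640".
import re

def extract_model_from_name(product_name):
    for token in product_name.split():
        token = token.strip("()[],;")
        if re.search(r"\d", token) and re.search(r"[A-Za-z]", token) \
                and re.match(r"^[A-Za-z0-9\-]{4,}$", token):
            return token
    return None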
Example #21
    def parse_product_currys(self, response):

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']
        if 'origin_upc' in response.meta:
            item['origin_upc'] = response.meta['origin_upc']
        if 'origin_brand' in response.meta:
            item['origin_brand'] = response.meta['origin_brand']


        product_name_node = hxs.select("//span[@itemprop='name']/text()").extract()
        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            self.log("Error: No product name: " + str(response.url) + " for source product " + origin_url, level=log.ERROR)
            # TODO:is this ok? I think so
            return

        item['product_name'] = product_name

        # extract product model number
        # TODO: no model?
        # TODO: no upc?
        
        # brand as schema.org microdata; itemprop='brand' is an assumption,
        # the page may not expose a separate brand field at all
        brand_holder = hxs.select("//span[@itemprop='brand']/text()").extract()
        if brand_holder:
            item['product_brand'] = brand_holder[0]

        # extract price
        price_holder = hxs.select("//span[@class='currentPrice']/ins/text()").extract()
        # if the price node was found, parse out the currency and amount
        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",","",product_target_price)
            m = re.match("(\xa3)([0-9]+\.?[0-9]*)", product_target_price)
            if m:
                item['product_target_price'] = float(m.group(2))
                currency = m.group(1)
                item['product_target_price'] = Utils.convert_to_dollars(item['product_target_price'], currency)
            else:
                self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n", level=log.WARNING)

        else:
            self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)


        # add result to items
        items.add(item)


        product_urls = response.meta['search_results']

        # try to send request to parse next product, try until url for next product url is valid (response not 404)
        # this is needed because if next product url is not valid, this request will not be sent and all info about this match (stored in request meta) will be lost

        # find first valid next product url
        next_product_url = None
        if product_urls:
            next_product_url = product_urls.pop()

        # if a next product url was found, send new request back to parse_product_url
        if next_product_url:
            request = Request(next_product_url, callback = self.parse_product_currys, meta = response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            return request

        # if no next valid product url was found
        else:
            # we are done, send the response back to reduceResults (no need to make a new request)
            # add the newly parsed items as meta
            # also add a 'parsed' field to indicate that parsing of all products was completed and they can be further used
            # (more precisely, that the call came from this method rather than being the initial one, so reduceResults has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
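The price branch above matches strings like u"\xa31,299.99" (pound sign first, then the amount). Utils.convert_to_dollars is project code, so the sketch below only illustrates the regex step; the 1.28 rate is a made-up placeholder for whatever the real conversion uses:

    # illustration of the Currys price parsing above; the rate is invented
    import re

    raw = u"\xa31,299.99".strip()
    raw = re.sub(",", "", raw)
    m = re.match(u"(\xa3)([0-9]+\.?[0-9]*)", raw)
    if m:
        amount, currency = float(m.group(2)), m.group(1)
        dollars = amount * 1.28 if currency == u"\xa3" else amount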
Example #22
    def parse_product_samsung(self, response):

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        # create item
        item = SearchItem()
        item['product_url'] = response.url
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']
        # hardcode brand to samsung
        item['product_brand'] = 'samsung'

        # extract product name, brand, model, etc; add to items
        product_info = hxs.select("//ul[@class='product-info']")
        #TODO: for some products name is not extracted correctly
        product_name = product_info.select("meta[@itemprop='name']/@content")
        if not product_name:
            self.log("Error: No product name: " + str(response.url),
                     level=log.INFO)
        else:
            item['product_name'] = product_name.extract()[0]
            product_model = product_info.select(
                "meta[@itemprop='model']/@content")
            if product_model:
                item['product_model'] = product_model.extract()[0]

            #TODO
            # item['product_images'] =
            # #TODO: to check
            # item['product_videos'] = l

            items.add(item)

        # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
        product_urls = response.meta['search_results']

        if product_urls:
            request = Request(product_urls.pop(),
                              callback=self.parse_product_samsung,
                              meta=response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)
            request.meta['search_results'] = product_urls

            return request
        else:
            # otherwise, we are done, send the response back to reduceResults (no need to make a new request)

            # # we are finished so we should also close the driver
            # if self.driver:
            #     self.driver.close()

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
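The Samsung parser reads schema.org microdata (meta itemprop tags inside ul.product-info). The same XPaths can be exercised against a static snippet with Scrapy's Selector; the HTML below is invented, not a real Samsung page:

    # invented snippet exercising the itemprop XPaths used above
    from scrapy.selector import Selector

    html = """<ul class="product-info">
        <meta itemprop="name" content="Galaxy S5">
        <meta itemprop="model" content="SM-G900">
    </ul>"""
    info = Selector(text=html).xpath("//ul[@class='product-info']")
    name = info.xpath("meta[@itemprop='name']/@content").extract()[0]
    model = info.xpath("meta[@itemprop='model']/@content").extract()[0]
    # name == "Galaxy S5", model == "SM-G900"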
Example #23
    def parse_product_target(self, response):

        hxs = HtmlXPathSelector(response)

        items = response.meta['items']

        #site = response.meta['origin_site']
        origin_url = response.meta['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        #item['origin_site'] = site
        item['origin_url'] = origin_url
        item['origin_name'] = response.meta['origin_name']

        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product name

        #TODO: is this general enough?
        product_name = hxs.select(
            "//h2[@class='product-name item']/span[@itemprop='name']/text()"
        ).extract()

        # if you can't find product name in product page, use the one extracted from results page
        if not product_name:
            item['product_name'] = response.meta['product_name']
            self.log(
                "Product name not found on product page, using the one extracted from the results page: "
                + item['product_name'] + " " + origin_url,
                level=log.INFO)
        else:
            item['product_name'] = product_name[0].strip()

        if not item['product_name']:
            self.log("Error: No product name: " + str(response.url) +
                     " from product: " + origin_url,
                     level=log.INFO)

        else:
            #TODO: no model number field?
            model_number_holder = None
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                #print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            #TODO: no brand field?

            # extract price
            #! extracting list price and not discount price when discounts available?
            #TODO: complete this with other types of pages
            price_holder = hxs.select(
                "//span[@class='offerPrice']/text()").extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    sys.stderr.write("Didn't match product price: " +
                                     product_target_price + " " +
                                     response.url + "\n")

            else:
                sys.stderr.write("Didn't find product price: " + response.url +
                                 "\n")

            # add result to items
            items.add(item)

        # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
        product_urls_and_names = response.meta['search_results']

        if product_urls_and_names:
            product_url_and_name = product_urls_and_names.pop()
            request = Request(product_url_and_name[0],
                              callback=self.parse_product_target,
                              meta=response.meta)
            request.meta['items'] = items
            # eliminate next product from pending list (this will be the new list with the first item popped)

            # send product name with request as well
            request.meta['product_name'] = product_url_and_name[1]
            request.meta['search_results'] = product_urls_and_names

            return request
        else:
            # otherwise, we are done, send the response back to reduceResults (no need to make a new request)
            # add the newly parsed items as meta
            # also add a 'parsed' field to indicate that parsing of all products was completed and they can be further used
            # (more precisely, that the call came from this method rather than being the initial one, so reduceResults has to move on to the next request)

            response.meta['parsed'] = True
            response.meta['items'] = items

            return self.reduceResults(response)
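Several of these examples fall back on ProcessText.extract_model_from_name when the page exposes no explicit model field. That helper is project code; purely as an assumption about its behavior, a minimal stand-in that pulls a model-looking token (letters mixed with digits) out of the name could be:

    # hypothetical stand-in for ProcessText.extract_model_from_name;
    # the real implementation is project-specific
    import re

    def extract_model_from_name(name):
        # candidate model number: 4+ characters mixing letters and digits
        m = re.search(r"\b(?=\w*[0-9])(?=\w*[A-Za-z])[A-Za-z0-9-]{4,}\b", name)
        return m.group(0) if m else None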
Example #24
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        #results = hxs.select("//ul[@class='productsListView']/li")
        results = hxs.select("//li[contains(@class,'tile standard')]")
        for result in results:
            item = SearchItem()
            product_title_holder = result.select(
                ".//div[@class='tileInfo']/a[contains(@class,'productTitle')]")
            product_url = product_title_holder.select("@href").extract()
            product_name = product_title_holder.select("@title").extract()

            #print "ITEM", product_name

            # skip this result if there is no product name
            if product_name and product_url:
                # clean url: strip the '#prodSlot...' fragment if present
                m = re.match("(.*)#prodSlot", product_url[0])
                if m:
                    item['product_url'] = m.group(1)
                else:
                    item['product_url'] = product_url[0]
                item['product_name'] = product_name[0]
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            # extract price
            #! extracting regular price and not discount price when discounts available?
            price_holder = result.select(
                ".//p[@class='regularprice-label']//text()[contains(.,'$')]"
            ).extract()

            # second attempt at finding price
            if not price_holder:
                price_holder = result.select(
                    ".//*[contains(@class, 'price price-label')]/text()[contains(.,'$')]"
                ).extract()

            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                # if more than one match, it will get the first one
                m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(1))
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.DEBUG)

            # extract product brand
            brand_holder = product_title_holder.select(
                "parent::node()//a[contains(@class,'productBrand')]/a/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
                self.log("Extracted brand: " + item['product_brand'] +
                         " from results page: " + str(response.url),
                         level=log.DEBUG)

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # also add a 'parsed' field to indicate that the call came from this method (it was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
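Target result links carry a "#prodSlot..." tracking fragment; the cleanup above keeps only the part before it. A quick illustration with an invented URL:

    # illustration of the '#prodSlot' cleanup; the URL is invented
    import re

    url = "http://www.target.com/p/some-item/-/A-12345#prodSlot=medium_1_1"
    m = re.match("(.*)#prodSlot", url)
    clean = m.group(1) if m else url
    # clean == "http://www.target.com/p/some-item/-/A-12345"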
Example #25
    def parse_product(self, response):

        # redirect pages, if handled, can return empty bodies
        # especially for kohls
        if not response.body:
            self.log("Retrying empty page: " + response.url, level=log.WARNING)
            # dont_filter=True so the duplicate filter doesn't drop the retried URL
            return Request(response.url, callback=self.parse_product, meta=response.meta, dont_filter=True)

        # try to avoid mobile versions
        # especially for kohls
        if response.url.startswith("http://m."):
            meta = response.meta
            meta['dont_redirect'] = True
            url = re.sub("/m\.","/www.",response.url)
            self.log("Retrying: redirecting mobile page to www page", level=log.WARNING)
            return Request(url, callback=self.parse_product, meta=meta)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']
        origin_url = self.results[origin_product_id]['origin_product']['origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        for field in self.results[origin_product_id]['origin_product'].keys():
            item[field] = self.results[origin_product_id]['origin_product'][field]
        

        # all product items from all queries
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        item = self.extract_product_data(response, item)

        # add result to items (if it was successful)
        if item:
            self.results[origin_product_id]['search_requests'][current_query]['product_items'].append(item)

        # try to send request to parse next product, try until url for next product url is valid (response not 404)
        # this is needed because if next product url is not valid, this request will not be sent and all info about this match (stored in request meta) will be lost

        # find first valid next product url
        next_product_url = None
        if product_urls:
            next_product_url = product_urls.pop()

        # if a next product url was found, send new request back to parse_product_url
        if next_product_url:
            request = Request(next_product_url, callback = self.parse_product, meta = response.meta)
            # eliminate next product from pending list (this will be the new list with the first item popped)
            self.remove_result_from_queue(origin_product_id, next_product_url)

            return request

        # if no next valid product url was found
        else:
            # we are done, send the response back to reduceResults (no need to make a new request)
            # add a 'parsed' field to indicate that parsing of all products was completed and the results can be further used
            # (more precisely, that the call came from this method rather than being the initial one, so reduceResults has to move on to the next request)

            response.meta['parsed'] = True

            return self.reduceResults(response)
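Unlike the earlier examples, parse_product keeps its state in self.results, keyed by origin product id, rather than threading everything through request meta. The shape below is inferred from the accesses in the method; it is an assumption, not documented anywhere in the source:

    # inferred shape of self.results (an assumption based on the code above)
    results = {
        "some-origin-id": {
            "origin_product": {"origin_url": "http://origin.example/p/1",
                               "origin_name": "Some Product"},
            "search_requests": {
                "some query": {
                    "search_results": [],   # pending product-page URLs
                    "product_items": [],    # SearchItems parsed so far
                },
            },
        },
    }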
Example #26
    def parseResults(self, response):
        hxs = HtmlXPathSelector(response)

        if 'items' in response.meta:
            items = response.meta['items']
        else:
            items = set()

        results = hxs.select("//h3[@class='productTitle']/a")
        for result in results:
            item = SearchItem()
            product_url = result.select("@href").extract()[0]
            # extract all text in <a> (contains product name inside <strong>, and size(ml) directly in text())

            # node containing full product name if the displayed one is abbreviated. use this one if exists, and displayed one if it doesn't
            product_name_node = result.select("strong/abbr/@title")
            product_name = (product_name_node.extract()[0] if product_name_node
                            else result.select("strong/text()").extract()[0])
            # assert name is not abbreviated
            assert '...' not in product_name
            # add product quantity
            product_quantity_node = result.select(
                "text()[normalize-space()!='']")
            product_quantity = (product_quantity_node.extract()[0].strip()
                                if product_quantity_node else "")
            product_name_full = product_name + " " + product_quantity

            #print "ITEM", product_name

            # skip this result if there is no product name
            if product_name and product_url:
                # clean url
                item['product_url'] = Utils.add_domain(
                    Utils.clean_url(product_url), self.base_url)

                item['product_name'] = product_name_full
            else:
                self.log("No product name: " + str(response.url) +
                         " from product: " + response.meta['origin_url'],
                         level=log.ERROR)
                continue

            # add url, name and model of product to be matched (from origin site)
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            if 'origin_model' in response.meta:
                item['origin_model'] = response.meta['origin_model']

            # extract product model from name
            product_model_extracted = ProcessText.extract_model_from_name(
                item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted

            #TODO: extract: price, brand?

            # add result to items
            items.add(item)

        # extract product info from product pages (send request to parse first URL in list)
        # add as meta all that was received as meta, will pass it on to reduceResults function in the end
        # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

        # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
        # if there are, reduceResults will send the next one back here, if not it will return the final result

        response.meta['items'] = items

        # also add a 'parsed' field to indicate that the call came from this method (it was not the initial one)
        #TODO: do we still need this?
        response.meta['parsed'] = True
        # only send the response we have as an argument, no need to make a new request
        return self.reduceResults(response)
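When the displayed name is truncated, the full name lives in the abbr element's title attribute; the parser above prefers it and falls back to the visible text, then appends the quantity from the surrounding text node. A standalone illustration on an invented snippet:

    # invented snippet exercising the abbr/@title fallback used above
    from scrapy.selector import Selector

    html = ('<h3 class="productTitle"><a href="/p/1"><strong>'
            '<abbr title="Very Long Product Name">Very Long Pro...</abbr>'
            '</strong> 250ml</a></h3>')
    a = Selector(text=html).xpath("//h3[@class='productTitle']/a")
    full = a.xpath("strong/abbr/@title").extract()
    name = full[0] if full else a.xpath("strong/text()").extract()[0]
    quantity = a.xpath("text()[normalize-space()!='']").extract()[0].strip()
    name_full = name + " " + quantity   # "Very Long Product Name 250ml"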
Example #27
    def parse_product_amazon(self, response):

        hxs = HtmlXPathSelector(response)

        origin_product_id = response.meta['origin_product_id']
        current_query = response.meta['query']
        origin_url = self.results[origin_product_id]['origin_product'][
            'origin_url']

        item = SearchItem()
        item['product_url'] = response.url
        for field in self.results[origin_product_id]['origin_product'].keys():
            item[field] = self.results[origin_product_id]['origin_product'][
                field]

        # all product urls from all queries
        items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
            self.results[origin_product_id]['search_requests']), [])
        # all product urls from all queries
        product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
            self.results[origin_product_id]['search_requests']), [])
        product_urls = set(product_urls)

        #TODO: to test this
        #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
        product_name_node = hxs.select(
            '//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
        product_name = None
        if not product_name_node:
            product_name_node = hxs.select(
                '//h1[@id="aiv-content-title"]//text()').extract()
        if not product_name_node:
            product_name_node = hxs.select(
                '//div[@id="title_feature_div"]/h1//text()').extract()

        if product_name_node:
            product_name = product_name_node[0].strip()
        else:
            # needs special treatment
            product_name_node = hxs.select(
                '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()'
            ).extract()
            if product_name_node:
                product_name = " ".join(product_name_node).strip()

        if not product_name:

            # log this error:
            # if the number of retries is not exhausted, it might just be a captcha page rather than an insurmountable error
            if 'captcha_retries' in response.meta and response.meta[
                    'captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.WARNING)
            else:
                # if it comes from a solved captcha page, then it's an error if it's still not found
                self.log("Error: No product name: " + str(response.url) +
                         " for walmart product " + origin_url,
                         level=log.ERROR)

                # note: don't remove captcha_retries from meta; deleting it may cause infinite loops
                # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
                # del response.meta['captcha_retries']
            # if we have reached maximum number of retries, do nothing (item just won't be added to the "items" list)

            # if we haven't reached maximum retries, try again
            if 'captcha_retries' not in response.meta \
                or 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:

                # assume there is a captcha to crack
                # check if there is a form on the page - that means it's probably the captcha form
                forms = hxs.select("//form")
                if forms:

                    # solve captcha
                    captcha_text = None
                    image = hxs.select(".//img/@src").extract()
                    if image:
                        captcha_text = self.CB.solve_captcha(image[0])

                    # value to use if there was an exception
                    if not captcha_text:
                        captcha_text = ''

                    # create a FormRequest to this same URL, with everything needed in meta
                    # items, cookies and search_urls not changed from previous response so no need to set them again

                    # redo the entire request (no items will be lost)
                    meta = response.meta
                    # flag indicating how many times we already retried to solve captcha
                    if 'captcha_retries' in meta:
                        meta['captcha_retries'] += 1
                    else:
                        meta['captcha_retries'] = 1
                    return [
                        FormRequest.from_response(
                            response,
                            callback=self.parse_product_amazon,
                            formdata={'field-keywords': captcha_text},
                            meta=meta)
                    ]

        else:
            item['product_name'] = product_name

            # extract product model number
            model_number_holder = hxs.select(
                """//tr[@class='item-model-number']/td[@class='value']/text() |
             //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() |
             //span/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/span[2]/text()"""
            ).extract()
            if model_number_holder:
                item['product_model'] = model_number_holder[0].strip()
            # if no product model explicitly on the page, try to extract it from name
            else:
                product_model_extracted = ProcessText.extract_model_from_name(
                    item['product_name'])
                if product_model_extracted:
                    item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

            upc_node = hxs.select(
                "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()"
            ).extract()
            if upc_node:
                upc = upc_node[0].strip().split()
                item['product_upc'] = upc

            manufacturer_code_node = hxs.select(
                "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()"
            ).extract()
            if manufacturer_code_node:
                manufacturer_code = manufacturer_code_node[0].strip()
                item['manufacturer_code'] = manufacturer_code

            try:
                # for lowest level category:
                # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
                # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
                # "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

                # for top-level category:
                bestsellers_rank = hxs.select(
                    "//tr[@id='SalesRank']/td[@class='value']/text()" +
                    " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
                item['bestsellers_rank'] = int(
                    re.sub(",", "", "".join(bestsellers_rank[1:])))
            except Exception, e:
                if self.output == 6 or self.bestsellers_link:
                    self.log("Didn't find product rank: " + str(e) + " " +
                             response.url + "\n",
                             level=log.INFO)

            asin_node = hxs.select(
                "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()"
            ).extract()
            if asin_node:
                item['product_asin'] = asin_node[0].strip()

            brand_holder = hxs.select(
                "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()"
            ).extract()
            if brand_holder:
                item['product_brand'] = brand_holder[0]
            else:
                pass
                #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

            # extract price
            #! extracting list price and not discount price when discounts available?
            price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
                "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

            # if we can't find it like above try other things:
            if not price_holder:
                # prefer new prices to used ones
                # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
                price_holder = hxs.select(
                    "//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]"
                ).extract()
            if price_holder:
                product_target_price = price_holder[0].strip()
                # remove commas separating orders of magnitude (ex 2,000)
                product_target_price = re.sub(",", "", product_target_price)
                m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)",
                             product_target_price)
                if m:
                    item['product_target_price'] = float(m.group(2))
                    currency = m.group(1)
                    if currency != "$":
                        item['product_target_price'] = Utils.convert_to_dollars(
                            item['product_target_price'], currency)
                else:
                    self.log("Didn't match product price: " +
                             product_target_price + " " + response.url + "\n",
                             level=log.WARNING)

            else:
                self.log("Didn't find product price: " + response.url + "\n",
                         level=log.INFO)

            try:
                item['product_category_tree'] = \
                    filter(None, map(lambda c: c.strip(), hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
            except:
                pass

            try:
                item['product_keywords'] = hxs.select(
                    "//meta[@name='keywords']/@content").extract()[0]
            except:
                pass

            try:
                product_image = hxs.select(
                    "//img[@id='landingImage']/@src").extract()[0]
                item['product_image_url'] = product_image
                item['product_image_encoded'] = ProcessText.encode_image(
                    product_image)
            except:
                pass

            # add result to items
            self.results[origin_product_id]['search_requests'][current_query][
                'product_items'].append(item)
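The captcha handling above is the subtlest part of this example: on a suspected captcha page the spider re-submits the same request as a form post carrying the solved text, counting attempts in meta so a failing solver cannot loop forever. Distilled to just the retry accounting (captcha_text comes from the project's own self.CB.solve_captcha):

    # distilled sketch of the bounded captcha retry used above
    from scrapy.http import FormRequest

    def retry_captcha(self, response, captcha_text):
        meta = response.meta
        meta['captcha_retries'] = meta.get('captcha_retries', 0) + 1
        if meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
            return None   # give up; the item simply isn't added
        return FormRequest.from_response(
            response,
            callback=self.parse_product_amazon,
            formdata={'field-keywords': captcha_text},
            meta=meta)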
Example #28
    def reduceResults(self, response):

        # print "IN REDUCE RESULTS"

        items = response.meta['items']
        #site = response.meta['origin_site']

        #TODO: do we still need this?
        if 'parsed' not in response.meta:

            # pass to the site-specific parseResults function (in derived class)
            return self.parseResults(response)

        else:
            del response.meta['parsed']

        ## print stuff
        self.log("PRODUCT: " + response.meta['origin_name'].encode("utf-8") +
                 " MODEL: " + response.meta['origin_model'].encode("utf-8"),
                 level=log.DEBUG)
        self.log("QUERY: " + response.meta['query'], level=log.DEBUG)
        self.log("MATCHES: ", level=log.DEBUG)
        for item in items:
            self.log(item['product_name'].encode("utf-8"), level=log.DEBUG)
        self.log('\n', level=log.DEBUG)

        # if there is a pending request (current request used product model, and pending request is to use product name),
        # continue with that one and send current results to it as metadata
        if 'pending_requests' in response.meta:
            # yield first request in queue and send the other ones as metadata
            pending_requests = response.meta['pending_requests']

            if pending_requests:
                # print "PENDING REQUESTS FOR", response.meta['origin_url'], response.meta['origin_name']
                request = pending_requests[0]

                # update pending requests
                request.meta['pending_requests'] = pending_requests[1:]

                request.meta['items'] = items

                #request.meta['origin_site'] = response.meta['origin_site']
                # product page from source site
                request.meta['origin_url'] = response.meta['origin_url']
                request.meta['origin_name'] = response.meta['origin_name']
                request.meta['origin_model'] = response.meta['origin_model']
                if 'origin_price' in response.meta:
                    request.meta['origin_price'] = response.meta[
                        'origin_price']
                request.meta['origin_brand_extracted'] = response.meta[
                    'origin_brand_extracted']
                if 'threshold' in response.meta:
                    request.meta['threshold'] = response.meta['threshold']

                # if 'origin_id' in response.meta:
                # 	request.meta['origin_id'] = response.meta['origin_id']
                # 	assert self.by_id
                # else:
                # 	assert not self.by_id

                # used for result product URLs
                if 'search_results' in response.meta:
                    request.meta['search_results'] = response.meta[
                        'search_results']

                return request

            # if there are no more pending requests, use cumulated items to find best match and send it as a result
            else:

                # print "DONE FOR ", response.meta['origin_url'], response.meta['origin_name']

                best_match = None

                if items:

                    # from all results, select the product whose name is most similar with the original product's name
                    # if there was a specific threshold set in request, use that, otherwise, use the class variable
                    if 'threshold' in response.meta:
                        threshold = response.meta['threshold']
                    else:
                        threshold = self.threshold

                    if 'origin_price' in response.meta:
                        product_price = response.meta['origin_price']
                        ## print "PRICE:", product_price
                    else:
                        product_price = None
                        ## print "NO PRICE"
                    best_match = ProcessText.similar(
                        response.meta['origin_name'],
                        response.meta['origin_model'], product_price, items,
                        threshold)

                    # #self.log( "ALL MATCHES: ", level=log.WARNING)
                    # for item in items:
                    # 	## print item['product_name'].encode("utf-8")
                    # ## print '\n'

                self.log("FINAL: " + str(best_match), level=log.WARNING)
                self.log("\n----------------------------------------------\n",
                         level=log.WARNING)

                if not best_match:
                    # if there are no results but the option was to include original product URL, create an item with just that
                    # output item if match not found for either output type
                    #if self.output == 2:
                    item = SearchItem()
                    #item['origin_site'] = site

                    item['origin_url'] = response.meta['origin_url']
                    item['origin_name'] = response.meta['origin_name']
                    if 'origin_model' in response.meta:
                        item['origin_model'] = response.meta['origin_model']

                    # if 'origin_id' in response.meta:
                    # 	item['origin_id'] = response.meta['origin_id']
                    # 	assert self.by_id
                    # else:
                    # 	assert not self.by_id
                    return [item]

                return best_match

        else:
            # output item if match not found
            item = SearchItem()
            #item['origin_site'] = site

            # print "DONE FOR ", response.meta['origin_name']

            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            # if 'origin_id' in response.meta:
            # 	item['origin_id'] = response.meta['origin_id']
            # 	assert self.by_id
            # else:
            # 	assert not self.by_id

            #TODO: uncomment below - it should not have been in if/else branch!

            self.log("FINAL: " + str(item), level=log.WARNING)
            self.log("\n----------------------------------------------\n",
                     level=log.WARNING)

            return [item]
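ProcessText.similar, which picks the best match from the accumulated items, is project code not shown in these examples. Purely as an assumption about its contract (the call site above also passes the model and price), a minimal name-overlap scorer with a threshold might look like:

    # hypothetical stand-in for ProcessText.similar; the real scorer also
    # weighs model numbers and price, which this sketch ignores
    def similar(origin_name, origin_model, origin_price, items, threshold):
        def score(item):
            a = set(origin_name.lower().split())
            b = set(item['product_name'].lower().split())
            return len(a & b) / float(len(a | b))   # Jaccard word overlap
        if not items:
            return None
        best = max(items, key=score)
        return best if score(best) >= threshold else None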