def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns with CSS selectors, yields one
        ``ElectronicItem`` per result row and finally schedules the next
        result page, if one is linked, with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(
            selector.css("a.symbol.product-symbol::text"))
        manufacturer_part_number = cleansplit(
            selector.css(".manufacturer>a:nth-of-type(2)>b::text"))
        manufacturer_name = cleansplit(
            selector.css(".manufacturer>a:nth-of-type(1)>b::text"))
        description = cleansplit(selector.css(".product>div>span::text"))
        # The stock column needs JavaScript execution, which Scrapy does not
        # handle, so nothing is scraped for it here.
        quantity_available = []
        if self.debug:
            print(quantity_available)
        image_url = cleansplit(
            selector.css(".product_image>a>img::attr(src)"))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            item['image_url'] = image
            item['product_url'] = response.url
            # Drop non-breaking spaces before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
            yield item

        next_url = response.css(
            'form>div.nawigator>a:last-of-type::attr(href)').extract_first()
        if self.debug:
            print("Next URL -> %s" % (response.urljoin(next_url)))
        # The last navigator link is a javascript no-op on the final page.
        if next_url and "javascript:void(0);" not in next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns with XPath queries, yields one
        ``ElectronicItem`` per result row and finally schedules the next
        result page, if one is linked, with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(
            selector.xpath("//p[@class='mfr-results']//a//text()"))
        # manufacturer part number always same with part number in
        # futureelectronics.com
        manufacturer_part_number = part_number
        manufacturer_name = cleansplit(
            selector.xpath("//div[@class='desc']//h5//text()"))
        # NOTE(review): this is the same query as part_number -- confirm that
        # the description is really meant to mirror the part number.
        description = cleansplit(
            selector.xpath("//p[@class='mfr-results']//a//text()"))
        quantity_available = cleansplit(
            selector.xpath("//span[@class='prices-in-stock-value']//text()"))
        image_url = cleansplit(
            selector.xpath("//img[@class='productThumbnail']").xpath('@src'))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            item['image_url'] = image
            item['product_url'] = response.url
            # Only non-breaking spaces are stripped; the quantity is stored
            # as scraped text.
            item['stock_qty'] = qty.replace(u'\xa0', u'')
            yield item

        next_url = response.xpath(
            '//a[@id="ctl00_PlaceHolderMain_results_pagingFooter_ctl08_HyperLink6"]//@href'
        ).extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
    def parse_search_result(self, response):
        '''
        Search Result Page Parser. Calls itself back for the next page when
        the result list is paginated.

        :param response: the downloaded search-result page.
        :return: generator yielding one ``ElectronicItem`` per result row,
                 followed by a ``Request`` for the next page when one is
                 linked.
        '''
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(selector.css("li.ttipartnumber a::text"))
        manufacturer_part_number = cleansplit(
            selector.css("li.mfrpartnumber a::text"))
        manufacturer_name = cleansplit(selector.css("li.manufacturer::text"))
        description = cleansplit(selector.css("td.description::text"))
        quantity_available = cleansplit(
            selector.css("td.availability::text"))
        image_url = cleansplit(
            selector.xpath("//img[@class='large-photo']").xpath('@src'))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            item['image_url'] = image
            item['product_url'] = response.url
            # Drop non-breaking spaces before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
            yield item

        # The next page is the link that follows the "current" pagination
        # entry.
        next_url = response.xpath("/html/body[@id='search_results']"
                                  "/div[@id='pageContent']/div[@id='content-wrapper']"
                                  "/div[@id='content-box']/form[@id='SearchAgainForm']"
                                  "/div[2]/div[@id='search-results']/div[@class='action-row']"
                                  "/div[@class='pagination']/strong/a[@class='current']"
                                  "/following-sibling::a/@href").extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns with XPath queries, yields one
        ``ElectronicItem`` per result row and finally schedules the next
        result page, if one is linked, with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(selector.xpath(
            "//span[@class='SearchResults-productName']/span//text()"))
        # manufacturer part number always same with part number in arrow.com
        manufacturer_part_number = part_number
        manufacturer_name = cleansplit(selector.xpath(
            "//a[@class='SearchResults-productManufacturer']//text()"))
        description = cleansplit(selector.xpath(
            "//td[@class='SearchResults-column SearchResults-column--description']"
            "//span//text()"))
        quantity_available = cleansplit(selector.xpath(
            "//span[@class='SearchResults-stock']//span//following-sibling::text()"))
        image_url = cleansplit(
            selector.xpath("//img[contains(@class, 'SearchResults-image')]")
            .xpath('@src'))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            # Every scraped src gets an "http:" scheme prefix.
            item['image_url'] = "{0}{1}".format("http:", image)
            item['product_url'] = response.url
            item['stock_qty'] = cleanqty(qty)
            yield item

        next_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns with XPath queries, yields one
        ``ElectronicItem`` per result row and finally schedules the next
        result page, if one is linked, with this same method as callback.
        ``self.pn`` is incremented each time a next page is followed.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(selector.xpath(
            "//a[@title='Click to view additional information on this product.']//text()"
        ))
        manufacturer_part_number = cleansplit(
            selector.xpath("//div[@class='mfrDiv']/a/text()"))
        manufacturer_name = cleansplit(
            selector.xpath("//a[contains(@id, 'lnkSupplier')]/text()"))
        # The description is the text of the cell(s) following the supplier
        # link's parent.
        description = cleansplit(selector.xpath(
            "//a[contains(@id, 'lnkSupplier')]/../following-sibling::td/text()"
        ))
        quantity_available = cleansplit(
            selector.xpath("//span[contains(@id,'lnkAvailability')]/text()"))
        if self.debug:
            print(quantity_available)
        image_url = cleansplit(selector.xpath(
            "//tr[@class='SearchResultsRowOdd']/td/a/img/@src"))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            # The scraped src is appended to the spider's first start URL.
            item['image_url'] = "{0}{1}".format(self.start_urls[0], image)
            item['product_url'] = response.url
            # Strip the "In Stock" label before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace('In Stock', ''))
            yield item

        next_url = response.xpath(
            "//a[@id='ctl00_ContentMain_PagerTop_lnkNext']/@href"
        ).extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            self.pn += 1
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
# Example #6
# 0
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns, yields one ``ElectronicItem`` per
        result row and, when a "next" link is present, requests the following
        page by page number with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(selector.xpath(
            "//div[@class='partColContent']/ul[@class='viewDescList']/li[1]/a[@class='primarySearchLink']/text()"
        ))
        manufacturer_part_number = cleansplit(selector.xpath(
            "//ul[@class='viewDescList']/li[3]/span[@class='defaultSearchText']/text()"
        ))
        manufacturer_name = cleansplit(selector.xpath(
            "//ul[@class='viewDescList']/li[2]/a[@class='secondarySearchLink']/text()"
        ))
        description = cleansplit(selector.xpath(
            "//div[@class='srDescDiv']/a[@class='primarySearchLink'][1]/text()"
        ))
        # quantity is not found in rscomponents
        quantity_available = cleansplit(
            selector.css("span.inStockBold::text"))
        image_url = cleansplit(
            selector.xpath("//div[@class='viewsImage']/a/img/@src"))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            # Every scraped src gets an "http:" scheme prefix.
            item['image_url'] = "{0}{1}".format("http:", image)
            item['product_url'] = response.url
            # Drop non-breaking spaces before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
            yield item

        next_url = response.xpath(
            "//a[@class='rightLink nextLink approverMessageTitle']/@href"
        ).extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            # The scraped href only signals that another page exists; the
            # request itself is rebuilt from the query and the page counter.
            self.pn += 1
            yield Request(
                'http://uk.rs-online.com/web/c/?sra=oss&r=t&searchTerm=%s&pn=%s&rpp=2'
                % (self.query, self.pn),
                callback=self.parse_search_result,
                dont_filter=True)
# Example #7
# 0
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns with XPath queries, yields one
        ``ElectronicItem`` per result row and finally schedules the next
        result page, if one is linked, with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(
            selector.xpath("//meta[@itemprop='sku']/@content"))
        manufacturer_part_number = cleansplit(
            selector.xpath("//meta[@itemprop='mpn']/@content"))
        manufacturer_name = cleansplit(
            selector.xpath("//td[@class='oc_row']/div/img/@title"))
        description = cleansplit(
            selector.xpath("//span[@itemprop='description']/text()"))
        quantity_available = cleansplit(
            selector.xpath("//table[1]/tbody[1]/tr/td[5]//text()"))
        image_url = cleansplit(
            selector.xpath("//table[1]/tbody[1]/tr/td[2]/img[1]/@src"))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            item['image_url'] = image
            item['product_url'] = response.url
            # Drop non-breaking spaces before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
            yield item

        # NOTE(review): the next link is located purely by position (fourth
        # entry of the first nav list) -- confirm this holds on every page.
        next_url = response.xpath(
            "//nav[1]/ul[1]/li[4]/a[1]/@href").extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)
# Example #8
# 0
    def parse_search_result(self, response):
        """Parse a search-result page and follow its pagination.

        Extracts the per-product columns, yields one ``ElectronicItem`` per
        result row and finally schedules the next result page, if one is
        linked, with this same method as callback.

        Args:
            response: the downloaded search-result page.

        Yields:
            ElectronicItem: a freshly created item for every result row.
            Request: a request for the next result page, when there is one.
        """
        # Parse the body once; every column below queries this one selector.
        selector = Selector(text=response.body)

        part_number = cleansplit(selector.css("p.sku a::text"))
        manufacturer_part_number = cleansplit(
            selector.css("td.productImage.mftrPart a::text"))
        manufacturer_name = cleansplit(
            selector.css("td.description a p:first-of-type::text"))
        description = cleansplit(
            selector.css("td.description a p:nth-of-type(2)::text"))
        quantity_available = cleansplit(
            selector.css("span.inStockBold::text"))
        image_url = cleansplit(
            selector.xpath("//img[@class='productThumbnail']").xpath('@src'))

        # Keep the columns balanced so zip() below still produces rows when an
        # optional column came back empty.  listbalancer lives in
        # customfunction.py (not visible here); presumably it returns a
        # placeholder list sized after its argument -- TODO confirm.  The
        # reference list is therefore part_number, never the empty column.
        if not quantity_available:
            quantity_available = listbalancer(part_number)
        if not image_url:
            image_url = listbalancer(part_number)
        if not description:
            description = listbalancer(part_number)

        rows = zip(part_number, manufacturer_part_number, manufacturer_name,
                   description, quantity_available, image_url)
        for sku, mpn, maker, text, qty, image in rows:
            # A fresh item per row: re-yielding one shared instance would let
            # later rows overwrite items that were already handed out.
            item = ElectronicItem()
            item['manufacturer'] = maker
            item['manufacturer_part_number'] = mpn
            item['supplier'] = self.spider_name
            item['supplier_part_number'] = sku
            item['description'] = text
            item['image_url'] = image
            item['product_url'] = response.url
            # Drop non-breaking spaces before the quantity is cleaned.
            item['stock_qty'] = cleanqty(qty.replace(u'\xa0', u''))
            yield item

        # The next page is the link in the span following the "current" one.
        next_url = response.xpath(
            "//span[@class='current']"
            "/following-sibling::span/a/@href").extract_first()
        if self.debug:
            print("Next URL -> %s" % (next_url))
        if next_url:
            yield Request(response.urljoin(next_url),
                          callback=self.parse_search_result,
                          dont_filter=True)