Example No. 1
 def list_page(self, response):
     urls = response.xpath('//div[@class="house-title"]/a/@href').extract()
     for url in urls:
         yield WebdriverRequest(url, callback=self.parse)
     next_page = response.xpath('//a[@class="aNxt"]/@href').extract_first()
     if next_page:
         yield WebdriverRequest(next_page, callback=self.list_page)
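These callbacks rely on the usual scrapy-webdriver scaffolding around them. A minimal sketch of that context, assuming the scrapy_webdriver package layout and that its download handler is enabled in settings.py (the spider name and start URL below are placeholders):

    import scrapy
    from scrapy_webdriver.http import WebdriverRequest

    class HouseListSpider(scrapy.Spider):
        name = 'house_list'  # placeholder

        def start_requests(self):
            # WebdriverRequest renders the page in a real browser first,
            # so JavaScript-built listings exist by the time list_page runs
            yield WebdriverRequest('http://example.com/houses',
                                   callback=self.list_page)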
Example No. 2
 def parse_smzdm_list_page(self, response):
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)
         item_url_sel_list = sel.select(
             "/html/body/section//div[@class='listTitle']/h3[@class='itemName']/a/@href"
         )
         for item_url_sel in item_url_sel_list:
             item_url = item_url_sel.extract()
             if item_url not in self.urls_seen:
                 yield WebdriverRequest(item_url,
                                        meta={'category': category},
                                        callback=self.parse_smzdm_item_page)
             # else:
             #     raise StopIteration
         next_page_xpath = "//li[@class='pagedown']/a/@href"
         next_page_url_sel_list = sel.select(next_page_xpath)
         for next_page_url_sel in next_page_url_sel_list:
             next_page_url = next_page_url_sel.extract()
             yield WebdriverRequest(next_page_url,
                                    meta={'category': category},
                                    callback=self.parse_smzdm_list_page)
     except Exception:
         log.msg("Smzdm list page parse failed:\t[%s]" % (response.url),
                 level=log.ERROR,
                 spider=SmzdmSpider)
         return
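A note on the raise StopIteration pattern that recurs in these Python 2-era examples: under PEP 479 (enforced since Python 3.7), raising StopIteration inside a generator is converted to a RuntimeError instead of silently ending iteration, so the aborts above use a plain return. A runnable demonstration of the difference on Python 3.7+:

    # 'return' ends a generator cleanly on every Python version;
    # 'raise StopIteration' crashes under PEP 479 (Python 3.7+)
    def gen_with_return():
        yield 1
        return

    def gen_with_raise():
        yield 1
        raise StopIteration

    print(list(gen_with_return()))   # [1]
    try:
        list(gen_with_raise())
    except RuntimeError as exc:
        print('PEP 479:', exc)       # generator raised StopIteration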
Example No. 3
 def list_page(self, response):
     urls = response.xpath('//ul//div[@class="title"]/a/@href').extract()
     for url in urls:
         yield WebdriverRequest(url=response.urljoin(url),
                                callback=self.parse)
     next_page = response.xpath(
         '//div[@class="page-box fr"]//a[@class="on"]/following-sibling::*[1]/@href'
     ).extract_first()
     if next_page and int(next_page[14:-5]) < 20:  # crawl at most 20 pages; page number sliced from the href
         yield WebdriverRequest(url=response.urljoin(next_page),
                                callback=self.list_page)
Example No. 4
    def parse_feed(self, response):
        """
        Parses a json datafeed page on The Warehouse

        Designed for: http://www.thewarehouse.co.nz/red/catalog/gifting/gifts-for-him?JsonFlag=true
        """
        jsonresponse = json.loads(response.body_as_unicode())

        for product in jsonresponse['products']:
            l = ScraperProductLoader(item=ScraperProduct(), response=response)
            l.add_value('url', product['productUrl'])
            l.add_value('sku', product['productSku'])
            l.add_value('name', product['productName'])
            l.add_value('in_stock', product['derivedInStock'])
            l.add_value('description', product['productDescription'])
            l.add_value('price', product['price'])
            l.add_value("image_urls", [product['productImageUrl']])

            if product['productPriceInfo']:
                l.add_value('attributes', product['productPriceInfo'])

            yield l.load_item()

        # If the feed continues on another page, follow it
        if jsonresponse.get('nextPageUrl'):
            yield WebdriverRequest(jsonresponse.get('nextPageUrl'),
                                   callback=self.parse_feed)
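body_as_unicode() comes from older Scrapy releases. On Scrapy 2.2+, the same feed can be decoded with response.json(); a sketch of the equivalent entry point, keeping the field names from the example above:

    def parse_feed(self, response):
        # Scrapy >= 2.2: replaces json.loads(response.body_as_unicode())
        data = response.json()
        for product in data['products']:
            yield {
                'url': product['productUrl'],
                'sku': product['productSku'],
                'name': product['productName'],
            }
        # If the feed continues on another page, follow it
        if data.get('nextPageUrl'):
            yield WebdriverRequest(data['nextPageUrl'],
                                   callback=self.parse_feed)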
Example No. 5
    def parse_start_url(self, response):
        """
        Handles any special parsing from start_urls.

        However, we mostly use it to handle pagination.

        This method is misleading as it actually cascades...
        """
        if self.is_product_page(response):
            self.rules = ()
            self._rules = []
            return self.parse_product(response)

        if response.url in self.visited:
            return []

        sel = Selector(response)
        pages = sel.css('.pagePaginatorLabel').re_first(r'Page \d+ of (\d+)')

        if not pages or int(pages) <= 1:
            return []

        urls = []
        pages = int(pages)
        for page in xrange(pages):
            url = '{base}#pageId={page}'.format(base=response.url, page=page)
            self.visited.append(url)
            urls.append(WebdriverRequest(url))

        return urls
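Two details worth noting: the URLs differ only in the '#pageId=' fragment, which is never sent to a server over plain HTTP and only works because WebdriverRequest drives a real browser; and self.visited is a list, so membership checks are O(n). A sketch of the same loop with a set, assuming the spider initializes self.visited = set() (paginate is a hypothetical helper name):

    def paginate(self, response, pages):
        requests = []
        for page in range(pages):  # xrange on Python 2
            # fragment-only pagination: meaningful to the browser, not the server
            url = '{base}#pageId={page}'.format(base=response.url, page=page)
            if url not in self.visited:  # O(1) with a set
                self.visited.add(url)
                requests.append(WebdriverRequest(url))
        return requests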
Example No. 6
    def parse_product(self, response):
        """
        Parses a product page on Gap.com.

        @url http://www.gap.com/browse/product.do?cid=64526&vid=1&pid=960079012
        @returns items 1 1
        @returns requests 0 0
        @scrapes url sku name price in_stock description image_urls attributes
        """
        sel = Selector(response)

        url_sel = sel.css('link[rel="canonical"]::attr(href)')
        their_id = url_sel.re(r"/P(\d+).jsp")[0]
        l = ScraperProductLoader(item=ScraperProduct(), response=response)
        l.add_value('url', url_sel.extract()[0])
        l.add_value('sku', their_id)
        l.add_css('name', '.productName::text')

        # Presence of product name determines product availability
        l.add_value('in_stock', bool(l.get_output_value('name')))

        l.add_css('description', '#tabWindow')

        attributes = {}
        sale_price = sel.css('#priceText .salePrice::text').extract_first()

        if not sale_price:
            l.add_css('price', '#priceText::text')
        else:
            l.add_css('price', '#priceText strike::text')
            attributes['sales_price'] = sale_price

        try:
            category_name = sel.css('#lnc-division::attr(alt)').extract()[0]
        except IndexError:
            category_name = sel.css('ul.category li.category')[0].css(
                'a::text').extract()[0]
        # record the category alongside the other attributes
        attributes['category_name'] = category_name
        l.add_value('attributes', attributes)

        # image urls are stored in the "productData" page
        url = self.product_data_url.format(their_id)
        request = WebdriverRequest(url, callback=self.images)
        # request contains other data in meta['item']
        request.meta['item'] = l.load_item()
        yield request  # crawl the other url
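The partially-built item travels to the image callback in request.meta['item']. A minimal sketch of the receiving side (the CSS selector is a placeholder; the real productData markup is not shown in this example):

    def images(self, response):
        # recover the partially-loaded item passed along in meta
        item = response.meta['item']
        # placeholder selector -- substitute the real productData image markup
        item['image_urls'] = response.css('img::attr(src)').extract()
        yield item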
Example No. 7
 def parse_recommended_products(self, response):
     # Scrape similar products
     sel = Selector(response)
     url_paths = sel.css(
         'article.top-products .content>a::attr(href)').extract()
     for url_path in url_paths:
         request = WebdriverRequest(url_path, callback=self.parse_product)
         self.prep_product_tagging(request, response.meta.get('item'))
         yield request
Example No. 8
    def parse_sub2(self, response):
        product_links = response.xpath(
            '//*[@class="item-list-block"]/a/@href').extract()

        for product_link in product_links:
            # skip products already recorded in the history database
            if not self.dbutils.check_history_exist(product_link):
                yield WebdriverRequest(product_link,
                                       callback=self.parse_product)
Example No. 9
    def parse_category_full_page(self, response):
        # Follow each slide in the "slick" carousel; using a relative XPath
        # per slide avoids the index misalignment the absolute query risked
        # when a slide had no link
        for slide in response.xpath('//*[@class="slick-track"]/div'):
            sub2_link = slide.xpath('./a/@href').extract_first()
            if sub2_link:
                yield WebdriverRequest(sub2_link, callback=self.parse_sub2)

        # Follow each sale banner the same way
        for banner in response.xpath(
                '//*[@role="saleBannersList"]/div/*[@role="saleBannerContainer"]'):
            sub2_link = banner.xpath('./a/@href').extract_first()
            if sub2_link:
                yield WebdriverRequest(sub2_link, callback=self.parse_sub2)
Example No. 10
    def parse_recipe(self, response):
        if not self.is_recipe_page(response):
            self.logger.warning(u"Unexpectedly not a recipe page: {}".format(response.request.url))
            return

        recipe_id = re.match(r'(?:http://|https://)?www\.surlatable\.com/product/REC-(\d+)(/.*)?', response.url).group(1)
        sel = Selector(response)

        l = ScraperContentLoader(item=ScraperImage(), response=response)
        l.add_value('force_skip_tiles', self.skip_tiles)
        l.add_value('original_url', unicode(response.request.url))
        l.add_value('source', 'Sur La Table')
        l.add_css('name', 'h1.name::text')
        l.add_css('description', '#recipedetail .story')
        item = l.load_item()

        self.handle_product_tagging(response, item, content_id=recipe_id)

        if self.skip_images:
            yield item
        else:
            # Continue to XML data to get recipe image
            magic_values = sel.css('.fluid-display::attr(id)').extract_first().split(':')
            xml_path = u'/images/customers/c{1}/{2}/{2}_{3}/pview_{2}_{3}.xml'.format(*magic_values)
            request = WebdriverRequest(self.root_url + xml_path, callback=self.parse_one_image)

            request.meta['item'] = item

            yield request

        # Scrape tagged products
        url_paths = sel.css('.productinfo .itemwrapper>a::attr(href)').extract()
        for url_path in url_paths:
            request = WebdriverRequest(self.root_url + url_path, callback=self.parse_product)
            self.prep_product_tagging(request, item)
            yield request
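The XML path is built by splitting the id attribute on ':' and unpacking it into positional str.format fields, skipping index 0. A worked example with made-up values:

    # hypothetical id attribute value
    magic_values = 'pview:123:456:789'.split(':')
    xml_path = u'/images/customers/c{1}/{2}/{2}_{3}/pview_{2}_{3}.xml'.format(*magic_values)
    print(xml_path)  # /images/customers/c123/456/456_789/pview_456_789.xml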
Example No. 11
    def start_requests(self):
        urls = [
            'http://www.ewtn.com/daily-readings/?date=2016-12-24',
            'http://www.ewtn.com/daily-readings/?date=2016-12-25',
            'http://www.ewtn.com/daily-readings/?date=2016-12-26',
            'http://www.ewtn.com/daily-readings/?date=2016-12-27',
            'http://www.ewtn.com/daily-readings/?date=2016-12-28',
            'http://www.ewtn.com/daily-readings/?date=2016-12-29',
            'http://www.ewtn.com/daily-readings/?date=2016-12-30',
            'http://www.ewtn.com/daily-readings/?date=2016-12-31',
        ]
        for url in urls:
            yield WebdriverRequest(url=url, callback=self.parse)
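The hand-written URL list can be generated from the date range instead; a sketch using datetime, with the query format taken from the URLs above:

    from datetime import date, timedelta

    def start_requests(self):
        start = date(2016, 12, 24)
        for offset in range(8):  # 2016-12-24 through 2016-12-31
            day = start + timedelta(days=offset)
            url = 'http://www.ewtn.com/daily-readings/?date={:%Y-%m-%d}'.format(day)
            yield WebdriverRequest(url=url, callback=self.parse)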
Example No. 12
 def parse(self, response):
     """Entry point: re-issue the page as a WebDriver request so JS runs."""
     yield WebdriverRequest(response.url,
                            callback=self._spider_page,
                            method="GET")
Example No. 13
 def parse_smzdm_item_page(self, response):
     try:
         category = response.meta["category"]
         sel = WebdriverXPathSelector(response)
         title_sel_list = sel.select('/html/body/section/div[1]/article/h1')
         attachment_sel_list = sel.select(
             '/html/body/section/div[1]/article/h1/span')
         if len(title_sel_list):
             title = self.normalize_text(title_sel_list[0].extract())
             item_name = title
         else:
             log.msg("Smzdm title parse failed:\t[%s]" % (response.url),
                     level=log.ERROR,
                     spider=SmzdmSpider)
             return
         all_attachment = ''
         for attachment_sel in attachment_sel_list:
             attachment = attachment_sel.extract()
             item_name = item_name.replace(attachment, '')
             all_attachment += attachment
         price, currency = self.parse_price(all_attachment)
         item_shopping_url_sel_list = sel.select(
             "/html/body/section/div[1]/article/div[2]/div/div/a/@href")
         if len(item_shopping_url_sel_list):
             item_shopping_url = item_shopping_url_sel_list[0].extract()
             yield WebdriverRequest(item_shopping_url,
                                    meta={'referer': response.url},
                                    callback=self.parse_shopping_item_page)
         description_sel_list = sel.select(
             '/html/body/section/div[1]/article/div[2]/p[@itemprop="description"]'
         )
         description = ''
         img_src_list = []
         for description_sel in description_sel_list:
             description += self.normalize_text(description_sel.extract())
             img_src_sel_list = description_sel.select(".//img/@src")
             for img_src_sel in img_src_sel_list:
                 img_src_list.append(img_src_sel.extract())
         # parse vote/favorite/comment counters, defaulting to 0 on failure
         try:
             worthy_vote = int(
                 self.get_text_by_xpath(
                     sel, "//span[@id='rating_worthy_num']/text()"))
         except (TypeError, ValueError):
             worthy_vote = 0
         try:
             unworthy_vote = int(
                 self.get_text_by_xpath(
                     sel, "//span[@id='rating_unworthy_num']/text()"))
         except (TypeError, ValueError):
             unworthy_vote = 0
         try:
             favorite_count = int(
                 self.get_text_by_xpath(sel, "//a[@class='fav']/em/text()"))
         except (TypeError, ValueError):
             favorite_count = 0
         try:
             comment_count = int(
                 self.get_text_by_xpath(sel,
                                        "//a[@class='comment']/em/text()"))
         except (TypeError, ValueError):
             comment_count = 0
         yield items.SmzdmItem(title=item_name, price=price, url=response.url, description=description, \
                               image_urls=img_src_list, worthy_vote=worthy_vote, unworthy_vote=unworthy_vote, \
                               favorite_count=favorite_count, comment_count=comment_count, category=category, currency=currency)
     except Exception:
         log.msg("Smzdm item page parse failed:\t[%s]" % (response.url),
                 level=log.ERROR,
                 spider=SmzdmSpider)
         return
Example No. 14
 def parse_action(self, response):
     yield WebdriverRequest('%s&wr=%d' % (response.url, 0),
                            callback=self.parse_nothing)
Example No. 15
 def start_requests(self):
     for title, url in self.url:
         yield WebdriverRequest(url, callback=self.parse)
Example No. 16
 def login(self):
     return WebdriverRequest(
         'http://associates.amazon.cn/gp/associates/network/main.html',
         callback=self.submit_login_info)
Example No. 17
 def start_requests(self):
     for i in xrange(2):
         yield WebdriverRequest('http://testdomain/path?wr=%d' % i)
         yield Request('http://testdomain/path?r=%d' % i)
Example No. 18
 def parse(self, response):
     for url in response.css('#MainContent_tStock h3 a::attr("href")'):
         url = url.extract()
         yield WebdriverRequest(urljoin(response.url, url),
                                callback=self.parse_company)
Example No. 19
 def start_requests(self):
     for url in self.start_urls:
         yield WebdriverRequest(url=url, callback=self.list_page)
Example No. 20
    def parse_shopping_item_page(self, response):
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            target_price = response.meta["target_price"]
            jd_jump_url_sel = sel.select(
                "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                       meta={'referer': referer},
                                       callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                comment_list = []
                description = ""
                title = ""
                vote_count = ""
                vote_score = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                log.msg("Real shopping url: %s" %
                        (response.webdriver.current_url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                url = response.webdriver.current_url
                hostname = urlparse(url).hostname
                if hostname != "www.amazon.cn":
                    log.msg("Shopping robot does not support this site",
                            level=log.INFO,
                            spider=SmzdmSpider)
                    return
                for url_pattern, (title_xpath, price_xpath,
                                  price_redudant_pattern, description_xpath,
                                  description_img_xpath, currency,
                                  title_img_xpath_list, comment_xpath,
                                  vote_count_xpath, vote_score_xpath
                                  ) in self.__url_pattern_xpath_dict.items():
                    if url_pattern.match(url):
                        log.msg("Shopping url pattern is found",
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        title_sel_list = sel.select(title_xpath)
                        if len(title_sel_list):
                            title = self.normalize_text(
                                title_sel_list[0].extract())
                        else:
                            log.msg("Shopping page error:\ttitle is not found",
                                    level=log.ERROR,
                                    spider=SmzdmSpider)
                            # the 'continue' here was unreachable after the
                            # raise; abort parsing this page instead
                            return
                        price_sel_list = sel.select(price_xpath)
                        if len(price_sel_list):
                            price_text = price_sel_list[0].extract()
                            price_text = price_redudant_pattern.sub(
                                '', price_text)
                            try:
                                price = float(price_text)
                                if url.startswith("http://www.kiddies24.de"):
                                    price /= 100
                                if (price -
                                        target_price) / target_price > 0.05:
                                    log.msg(
                                        "Price is not ideal. (current price: %f, target price: %f)"
                                        % (price, target_price),
                                        level=log.INFO,
                                        spider=SmzdmSpider)
                                    return
                            except Exception:
                                traceback.print_exc()
                                log.msg(
                                    "Shopping page error:\tThis item is sold out, the price is %s"
                                    % (price),
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        else:
                            log.msg("Shopping page error:\tprice is not found",
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        title_img_sel_list = []
                        for title_img_xpath in title_img_xpath_list:
                            title_img_sel_list += sel.select(title_img_xpath)
                        title_img_src = ""
                        for title_img_sel in title_img_sel_list:
                            title_img_src = title_img_sel.extract()
                            if title_img_src:
                                img_src_list.append(title_img_src)
                                break
                        if hostname == "item.jd.com":
                            # sel.select_script("arguments[0].scrollIntoView(true);", sel.webdriver.find_element_by_xpath("//div[@id='comment-0']"))
                            # sel.select_script("arguments[0].scrollIntoView(true);", sel.webdriver.find_element_by_xpath("//div[@id='comment-2']"))
                            # sel.webdriver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                            sel.webdriver.find_element_by_xpath(
                                "//li[@id='detail-tab-comm']/a").click()
                            time.sleep(2)
                        for comment_sel in sel.select(comment_xpath):
                            comment_list.append(comment_sel.extract())
                        vote_count_sel_list = sel.select(vote_count_xpath)
                        if len(vote_count_sel_list):
                            vote_count = vote_count_sel_list[0].extract()
                        else:
                            log.msg(
                                "Shopping page error:\tvote count is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        vote_score_sel_list = sel.select(vote_score_xpath)
                        if len(vote_score_sel_list):
                            vote_score = vote_score_sel_list[0].extract()
                        else:
                            log.msg(
                                "Shopping page error:\tvote score is not found",
                                level=log.ERROR,
                                spider=SmzdmSpider)
                        log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                                (title, description, price, url, referer),
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        yield items.ShoppingItem(title=title, price=price, url=url, referer=referer, image_urls=img_src_list, \
                                title_image_url=title_img_src, description=description, currency=currency, \
                                comment_list=comment_list, vote_count=vote_count, vote_score=vote_score)
                        log.msg("Place the order!",
                                level=log.INFO,
                                spider=SmzdmSpider)

                        sel = WebdriverXPathSelector(response)
                        one_click_button_list = sel.select(
                            '//*[@id="one-click-button"]')
                        if not one_click_button_list:
                            log.msg("Need to enable one click order!",
                                    level=log.DEBUG,
                                    spider=SmzdmSpider)
                            referer = response.meta["referer"]
                            enable_one_click_url_sel = response.webdriver.find_elements_by_xpath(
                                '//*[@id="oneClickSignIn"]')
                            if enable_one_click_url_sel:
                                # enable_one_click_url = enable_one_click_url_sel[0].extract()
                                log.msg("Enable one click order",
                                        level=log.DEBUG,
                                        spider=SmzdmSpider)
                                # enable_one_click_url_sel[0].click()
                                yield WebdriverActionRequest(response, \
                                        actions=ActionChains(response.webdriver).click(enable_one_click_url_sel[0]), \
                                        meta={'referer': referer}, \
                                        callback=self.parse_shopping_item_page)
                        else:
                            log.msg("One click order!",
                                    level=log.INFO,
                                    spider=SmzdmSpider)
                            one_click_button_list[0].click()

                        # self.order_item(response)
                        # time.sleep(1)
        except Exception:
            traceback.print_exc()
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                    level=log.ERROR,
                    spider=SmzdmSpider)
            return
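The JD branch above follows an interstitial "jump" link by re-entering the same callback with the resolved URL. Stripped of site specifics, the skeleton looks like this (the XPath is a placeholder; meta is forwarded so the original referer survives the hop):

    def parse_shopping_item_page(self, response):
        # placeholder XPath for a site-specific interstitial 'continue' link
        jump = response.xpath('//a[@class="jump"]/@href').extract_first()
        if jump:
            # follow the redirect back into this same callback
            yield WebdriverRequest(jump,
                                   meta=response.meta,
                                   callback=self.parse_shopping_item_page)
            return
        # ...otherwise fall through to the actual item extraction...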
Example No. 21
    def parse_product(self, response):
        if not self.is_product_page(response):
            self.logger.warning(u"Unexpectedly not a product page: {}".format(response.request.url))
            return
        
        attributes = {}

        sel = Selector(response)
        l = ScraperProductLoader(item=ScraperProduct(), response=response)
        l.add_value('force_skip_tiles', self.skip_tiles)
        l.add_css('name', 'h1#product-title::text')
        l.add_css('description', '#product-description div::text')
        l.add_css('details', '#product-moreInfo-features li')

        # If the page doesn't have a sku, the product doesn't exist
        sku = ''
        try:
            # Try to find the SKU directly, does not work for products with multiple sizes
            sku = sel.css('#product-sku span[itemprop="sku"]::text').extract()[0].strip()
        except (IndexError, AttributeError):
            pass
        if not sku:
            try:
                # could be a color option
                sku = sel.css('#product #product-options a[data-sku]::attr(data-sku)').extract()[0]
            except (IndexError, AttributeError):
                pass
        if not sku:
            try:
                # Product ID usually of form: 'PRO-1220433'
                prod_id = sel.css('#productId::attr(value)').extract()[0]
                sku = re.search(r'\d+', prod_id).group()
            except (IndexError, AttributeError):
                # An item with a missing sku will not validate
                pass
        l.add_value('sku', unicode(sku))

        # prices are sometimes in the forms:
        #    $9.95
        #    $9.95 - $48.96
        #    $99.96  Sugg. $1,860.00 | You save 46%
        price_range = sel.css('meta[property="eb:pricerange"]::attr(content)').extract_first()
        if price_range:
            attributes['price_range'] = price_range

        try:
            price = sel.css('.product-priceMain span.hide::text').extract_first().split('-')[0]
            sugg_price = sel.css('.product-priceInfo #product-priceList span::text').extract_first()
            
            if sugg_price:
                reg_price = sugg_price.split('-')[0] # Sometimes "$9.95 - $48.96"
                sale_price = price
            else:
                reg_price = price
                sale_price = None
            
        except (IndexError, AttributeError):  # extract_first() may return None
            reg_price = u'$0.00'
            sale_price = None

        l.add_value('in_stock', bool(not self.is_sold_out(response)))
        l.add_value('price', unicode(reg_price))
        l.add_value('sale_price', unicode(sale_price) if sale_price else None)
        l.add_value('attributes', attributes)
        l.add_value('url', unicode(response.request.url))
        
        item = l.load_item()

        # If this is a similar_product and tagged_product, handle it
        self.handle_product_tagging(response, item)

        if self.skip_images:
            yield item
        else:
            # Full-sized Sur La Table image URLs found in a magical XML file.
            try:
                magic_values = sel.css('.fluid-display::attr(id)').extract_first().split(':')
                xml_path = u"/images/customers/c{1}/{2}/{2}_{3}/pview_{2}_{3}.xml".format(*magic_values)
                request = WebdriverRequest(self.root_url + xml_path, callback=self.parse_product_images)
                request.meta['item'] = item
                yield request
            except (IndexError, AttributeError):  # extract_first() may return None
                yield item
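The three-stage SKU fallback can also be written with extract_first(), which returns a default instead of raising IndexError; a sketch of the same cascade with the selectors copied from the code above:

    sku = (sel.css('#product-sku span[itemprop="sku"]::text')
              .extract_first(default='').strip()
           or sel.css('#product #product-options a[data-sku]::attr(data-sku)')
              .extract_first(default=''))
    if not sku:
        # Product ID usually of form: 'PRO-1220433'
        prod_id = sel.css('#productId::attr(value)').extract_first(default='')
        match = re.search(r'\d+', prod_id)
        sku = match.group() if match else ''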
Example No. 22
    def start_requests(self):
        # logger = logging.getLogger(self.__smzdm_log_file)
        log.start(logfile=self.__smzdm_log_file,
                  loglevel='INFO',
                  logstdout=False)
        smzdm_config = ConfigParser.RawConfigParser()
        smzdm_config.read("configure/smzdm.ini")
        self.price_pattern = re.compile(
            smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
        self.usd_price_pattern = re.compile(
            smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
        self.jpy_price_pattern = re.compile(
            smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
        self.eur_price_pattern = re.compile(
            smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
        self.head_separator = smzdm_config.get(
            "item_page", "head_separator_pattern").decode("utf-8")
        self.attachment_pattern = re.compile(
            smzdm_config.get("item_page",
                             "attachment_pattern").decode("utf-8"))

        config_file_name = "configure/shopping_page.ini"
        shopping_config = ConfigParser.RawConfigParser()
        shopping_config.read(config_file_name)

        for section_name in shopping_config.sections():
            log.msg("Supported url pattern:\t%s" % (shopping_config.get(
                section_name, "url_pattern").decode('utf8')),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            url_pattern = re.compile(
                shopping_config.get(section_name,
                                    "url_pattern").decode('utf8'))
            title_xpath = shopping_config.get(section_name, "title_xpath")
            price_xpath = shopping_config.get(section_name, "price_xpath")
            price_redudant_pattern = re.compile(
                shopping_config.get(section_name,
                                    "price_redudant_pattern").decode('utf8'))
            description_xpath = shopping_config.get(section_name,
                                                    "description_xpath")
            description_img_xpath = shopping_config.get(
                section_name, "description_img_xpath")
            currency = shopping_config.get(section_name, "currency")
            title_img_xpath_list = shopping_config.get(
                section_name, "title_img_xpath").split(",")
            comment_xpath = shopping_config.get(section_name, "comment_xpath")
            vote_count_xpath = shopping_config.get(section_name,
                                                   "vote_count_xpath")
            vote_score_xpath = shopping_config.get(section_name,
                                                   "vote_score_xpath")
            self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
                    price_xpath, price_redudant_pattern, description_xpath, \
                    description_img_xpath, currency, title_img_xpath_list, \
                    comment_xpath, vote_count_xpath, vote_score_xpath)

        log.msg("Start requests", level=log.INFO, spider=SmzdmSpider)
        # CrawlSpider.start_requests(self)

        log.msg("Login Amazon", level=log.INFO, spider=SmzdmSpider)
        # the request must be yielded to be scheduled; a bare expression
        # creates it and silently discards it
        yield WebdriverRequest(
            'https://www.amazon.cn/ap/signin?_encoding=UTF8&openid.assoc_handle=cnflex&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.ns.pape=http%3A%2F%2Fspecs.openid.net%2Fextensions%2Fpape%2F1.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.cn%2F%3Fie%3DUTF8%26ref_%3Dnav_ya_signin',
            callback=self.submit_login_info)
        log.msg("Login Amazon success", level=log.INFO, spider=SmzdmSpider)
        time.sleep(10)

        for (category, urls) in smzdm_config.items("category"):
            if category == "all_post":
                for url in urls.split(","):
                    for page_num in range(1700, 1800):
                        list_url = "%s/p%d" % (url, page_num)
                        yield WebdriverRequest(
                            list_url,
                            meta={'category': category},
                            callback=self.parse_smzdm_post_list_page)
            else:
                for url in urls.split(","):
                    yield WebdriverRequest(url,
                                           meta={'category': category},
                                           callback=self.parse_smzdm_list_page)
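The per-site scraping rules live in an INI file keyed by URL pattern. For reference, a configure/shopping_page.ini section would need exactly the keys read above; the values below are illustrative placeholders, not the project's real configuration (the key spelling price_redudant_pattern matches the code):

    [amazon_cn]
    url_pattern = ^https?://www\.amazon\.cn/.*
    title_xpath = //span[@id='productTitle']/text()
    price_xpath = //span[@id='priceblock_ourprice']/text()
    price_redudant_pattern = [^0-9.]
    description_xpath = //div[@id='productDescription']
    description_img_xpath = //div[@id='productDescription']//img
    currency = CNY
    title_img_xpath = //img[@id='landingImage']/@src
    comment_xpath = //div[@class='review-text']
    vote_count_xpath = //span[@id='acrCustomerReviewText']/text()
    vote_score_xpath = //span[@class='a-icon-alt']/text()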
Example No. 23
    def parse_shopping_item_page(self, response):
        try:
            sel = WebdriverXPathSelector(response)
            referer = response.meta["referer"]
            jd_jump_url_sel = sel.select(
                "/html/body/div[5]/div/div/div[1]/div[2]/div[3]/a/@href")
            if jd_jump_url_sel:
                log.msg("JD jump url:\t[%s]" % (jd_jump_url_sel[0].extract()),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                yield WebdriverRequest(jd_jump_url_sel[0].extract(),
                                       meta={'referer': referer},
                                       callback=self.parse_shopping_item_page)
            else:
                img_src_list = []
                description = ""
                title = ""
                price = -1.0
                log.msg("Shopping url: %s" % (response.url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                log.msg("Real shopping url: %s" %
                        (response.webdriver.current_url),
                        level=log.DEBUG,
                        spider=SmzdmSpider)
                url = response.webdriver.current_url
                for url_pattern, (title_xpath, price_xpath,
                                  price_redudant_pattern, description_xpath,
                                  description_img_xpath, currency,
                                  title_img_xpath_list
                                  ) in self.__url_pattern_xpath_dict.items():
                    if url_pattern.match(url):
                        log.msg("Shopping url pattern is found",
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        title_sel_list = sel.select(title_xpath)
                        if len(title_sel_list):
                            title = self.normalize_text(
                                title_sel_list[0].extract())
                        else:
                            log.msg("Shopping page error:\ttitle is not found",
                                    level=log.ERROR,
                                    spider=SmzdmSpider)
                            # the 'continue' here was unreachable after the
                            # raise; abort parsing this page instead
                            return
                        price_sel_list = sel.select(price_xpath)
                        if len(price_sel_list):
                            price_text = price_sel_list[0].extract()
                            price_text = price_redudant_pattern.sub(
                                '', price_text)
                            try:
                                price = float(price_text)
                                if url.startswith("http://www.kiddies24.de"):
                                    price /= 100
                            except (TypeError, ValueError):
                                log.msg(
                                    "Shopping page error:\tThis item is sold out, the price is %s"
                                    % (price),
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        else:
                            log.msg("Shopping page error:\tprice is not found",
                                    level=log.WARNING,
                                    spider=SmzdmSpider)
                        title_img_sel_list = []
                        for title_img_xpath in title_img_xpath_list:
                            title_img_sel_list += sel.select(title_img_xpath)
                        title_img_src = ""
                        for title_img_sel in title_img_sel_list:
                            title_img_src = title_img_sel.extract()
                            if title_img_src:
                                img_src_list.append(title_img_src)
                                break
                        # if url_pattern.match('http://www.amazon.'):
                        #     try:
                        #         WebDriverWait(response.webdriver, 10) \
                        #             .until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, '//iframe[@id="product-description-iframe"]')))
                        #     except:
                        #         log.msg("Shopping page error:\tFrame in Amazon is not found", level=log.ERROR, spider=SmzdmSpider)
                        #
                        # description_sel_list = sel.select(description_xpath + "/*")
                        # for description_sel in description_sel_list:
                        #     description_part = self.normalize_text(description_sel.extract())
                        #     if description_part:
                        #         description += description_part + '\t'

                        # description_img_sel_list = sel.select(description_img_xpath)
                        # """ Run func with the given timeout. If func didn't finish running
                        #     within the timeout, raise TimeLimitExpired
                        # """
                        # import threading
                        # class GetImgSrcThread(threading.Thread):
                        #     def __init__(self, driver, sel_list):
                        #         threading.Thread.__init__(self)
                        #         self.__driver = driver
                        #         self.__sel_list = sel_list
                        #     def run(self):
                        #         for sel in self.__sel_list:
                        #             try:
                        #                 self.__driver.execute_script("arguments[0].scrollIntoView(true);", sel.element)
                        #                 time.sleep(1)
                        #             except:
                        #                 log.msg("Shopping page error:\tscrollIntoView failed", level=log.ERROR, spider=SmzdmSpider)
                        #                 img_src_sel_list = sel.select("./@src")
                        #                 for img_src_sel in img_src_sel_list:
                        #                     log.msg("Shopping page error:\timage %s is not found" % (img_src_sel.extract()), level=log.ERROR, spider=SmzdmSpider)
                        #                 continue
                        # it = GetImgSrcThread(response.webdriver, description_img_sel_list)
                        # it.start()
                        # it.join(60)
                        # if it.isAlive():
                        #     break
                        # description_img_sel_list = sel.select(description_img_xpath + "/@src")
                        # log.msg("Shopping description img list: %s[%d]" % (description_img_sel_list, len(description_img_sel_list)) , level=log.DEBUG, spider=SmzdmSpider)
                        # for description_img_sel in description_img_sel_list:
                        #     img_src = description_img_sel.extract()
                        #     if img_src:
                        #         img_src_list.append(img_src)
                        log.msg("Shopping item: [%s] [%s] [%s] [%s] [%s]" %
                                (title, description, price, url, referer),
                                level=log.DEBUG,
                                spider=SmzdmSpider)
                        yield items.ShoppingItem(title=title,
                                                 price=price,
                                                 url=url,
                                                 referer=referer,
                                                 image_urls=img_src_list,
                                                 title_image_url=title_img_src,
                                                 description=description,
                                                 currency=currency)
        except Exception:
            log.msg("Shopping item page parse failed:\t[%s]" % (response.url),
                    level=log.ERROR,
                    spider=SmzdmSpider)
            return
Example No. 24
 def parse(self, response):
     yield WebdriverRequest(response.url,
                            callback=self.parse_search_with_js)
Example No. 25
    def start_requests(self):
        log.start(logfile=self.__smzdm_log_file,
                  loglevel='INFO',
                  logstdout=False)
        smzdm_config = ConfigParser.RawConfigParser()
        smzdm_config.read("configure/smzdm.ini")
        self.price_pattern = re.compile(
            smzdm_config.get("item_page", "price_pattern").decode("utf-8"))
        self.usd_price_pattern = re.compile(
            smzdm_config.get("item_page", "usd_price_pattern").decode("utf-8"))
        self.jpy_price_pattern = re.compile(
            smzdm_config.get("item_page", "jpy_price_pattern").decode("utf-8"))
        self.eur_price_pattern = re.compile(
            smzdm_config.get("item_page", "eur_price_pattern").decode("utf-8"))
        self.head_separator = smzdm_config.get(
            "item_page", "head_separator_pattern").decode("utf-8")
        self.attachment_pattern = re.compile(
            smzdm_config.get("item_page",
                             "attachment_pattern").decode("utf-8"))

        config_file_name = "configure/shopping_page.ini"
        shopping_config = ConfigParser.RawConfigParser()
        shopping_config.read(config_file_name)

        for section_name in shopping_config.sections():
            log.msg("Supported url pattern:\t%s" % shopping_config.get(
                section_name, "url_pattern").decode('utf8'),
                    level=log.DEBUG,
                    spider=SmzdmSpider)
            url_pattern = re.compile(
                shopping_config.get(section_name,
                                    "url_pattern").decode('utf8'))
            title_xpath = shopping_config.get(section_name, "title_xpath")
            price_xpath = shopping_config.get(section_name, "price_xpath")
            price_redudant_pattern = re.compile(
                shopping_config.get(section_name,
                                    "price_redudant_pattern").decode('utf8'))
            description_xpath = shopping_config.get(section_name,
                                                    "description_xpath")
            description_img_xpath = shopping_config.get(
                section_name, "description_img_xpath")
            currency = shopping_config.get(section_name, "currency")
            title_img_xpath_list = shopping_config.get(
                section_name, "title_img_xpath").split(",")
            self.__url_pattern_xpath_dict[url_pattern] = (title_xpath, \
                    price_xpath, price_redudant_pattern, description_xpath, description_img_xpath, currency, title_img_xpath_list)
        # CrawlSpider.start_requests() returns a generator; its requests are
        # only scheduled if they are re-yielded here
        for request in CrawlSpider.start_requests(self):
            yield request
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/yingertuiche/youhui/p1',
            meta={'category': 'stroller'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1',
            meta={'category': 'car_seat'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/youhui/p1',
                               meta={'category': 'lego'},
                               callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1',
            meta={'category': 'backpack'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/yingertuiche/haitao/p1',
            meta={'category': 'stroller'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1',
            meta={'category': 'car_seat'},
            callback=self.parse_smzdm_list_page)
        yield WebdriverRequest('http://www.smzdm.com/fenlei/lego/haitao/p1',
                               meta={'category': 'lego'},
                               callback=self.parse_smzdm_list_page)
        yield WebdriverRequest(
            'http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1',
            meta={'category': 'backpack'},
            callback=self.parse_smzdm_list_page)
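The eight near-identical yield blocks differ only in URL and category, so they can be driven from a table; a sketch with the pairs copied verbatim from above:

    CATEGORY_URLS = [
        ('stroller', 'http://www.smzdm.com/fenlei/yingertuiche/youhui/p1'),
        ('car_seat', 'http://www.smzdm.com/fenlei/anquanzuoyi/youhui/p1'),
        ('lego', 'http://www.smzdm.com/fenlei/lego/youhui/p1'),
        ('backpack', 'http://www.smzdm.com/fenlei/huwaibeibao/youhui/p1'),
        ('stroller', 'http://www.smzdm.com/fenlei/yingertuiche/haitao/p1'),
        ('car_seat', 'http://www.smzdm.com/fenlei/anquanzuoyi/haitao/p1'),
        ('lego', 'http://www.smzdm.com/fenlei/lego/haitao/p1'),
        ('backpack', 'http://www.smzdm.com/fenlei/huwaibeibao/haitao/p1'),
    ]

    for category, url in CATEGORY_URLS:
        yield WebdriverRequest(url,
                               meta={'category': category},
                               callback=self.parse_smzdm_list_page)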
Example No. 26
    def parse(self, response):
        _url = response.url

        yield WebdriverRequest(_url, callback=self.parse_category_full_page)