示例#1
0
    def parse(self, response):
        """Yield one CategoryItem per category leaf, then crawl each group.

        The page is loaded through a SeleniumBrowser session.  Every
        ``grouplist__group`` list provides a title (the middle segment of
        ``category_path``) and leaf entries; a ``CategoryItem`` is yielded for
        each leaf.  Once a group's leaves are emitted, ``parse_categories`` is
        run for that group's last leaf and its items are yielded too.

        :param response: response of the category index page.
        """
        import logging

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)

            groups = selector.xpath(
                '//ul[contains(@class, "grouplist__group")]')
            for group in groups:
                path = self.extract(
                    group.xpath(
                        './li[@class="grouplist__group__title"]/text()'))
                leaves = group.xpath(
                    './li[@class="grouplist__group__item"]')

                # Reset per group: a group without leaves must not reuse the
                # category left over from the previous group.
                category = None
                for leaf in leaves:
                    category = CategoryItem()
                    category['category_leaf'] = self.extract(
                        leaf.xpath('.//text()'))
                    category['category_path'] = (
                        'Home > ' + path + ' > ' + category['category_leaf'])
                    category['category_url'] = get_full_url(
                        response.url, self.extract(leaf.xpath('./a/@href')))
                    yield category

                if category is None:
                    continue

                # NOTE(review): only the last leaf of each group is passed to
                # parse_categories -- confirm this is intended rather than
                # crawling every leaf.
                try:
                    for item in self.parse_categories(
                            browser, category['category_url'], category):
                        yield item
                except Exception:
                    # Best effort: a failing group must not stop the remaining
                    # groups.  Catching Exception (not a bare except) keeps
                    # GeneratorExit and KeyboardInterrupt propagating.
                    logging.getLogger(__name__).exception(
                        'parse_categories failed for %s',
                        category['category_url'])
示例#2
0
    def level_4(self, response):
        """Expand the listing with Selenium and request every product link.

        The ``load-more-results`` button is clicked until ``click_link``
        raises; the response attached to the last successful click (or the
        incoming response when no click succeeded) is then scanned for product
        links.  One ``Request`` with ``level_5`` as callback is yielded per
        link.

        :param response: response of the listing page.
        """
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No wait condition is configured for this level.
            wait_for = None
            browser.get(response.url)
            while True:
                try:
                    selector = browser.click_link(
                        "//button[@id='load-more-results']", wait_for)
                    response = selector.response
                except Exception:
                    # A failing click is taken as "nothing more to load".
                    # Exception (not a bare except) lets KeyboardInterrupt and
                    # SystemExit propagate.
                    break

        urls_xpath = "//div[@class='product-info']/a/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            yield Request(get_full_url(original_url, single_url),
                          callback=self.level_5)
示例#3
0
    def parse(self, response):
        """Emit category, product and review items for a product page.

        Category and product items are built from fixed XPaths, the product
        picture URL is made absolute, and the result of ``get_rm_kidval`` is
        yielded after them.  The page is then opened in a SeleniumBrowser,
        the rating tab is clicked and every item produced by
        ``_parse_reviews`` is yielded.

        :param response: response of the product page.
        """
        category = self.init_item_by_xpaths(response, "category", {
            "category_leaf": "(//ul[@class='ulNavigationBreadcrumb']/li/a)[last()]/text()",
            "category_path": "(//ul[@class='ulNavigationBreadcrumb']/li/a)[last()]/text()",
        })
        product = self.init_item_by_xpaths(response, "product", {
            "PicURL": "(//ul[@class='thumbsBox']/li/a)[1]/@href",
            "ProductName": "//h1[@itemprop='name']/text()",
        })

        product["OriginalCategoryName"] = category["category_path"]
        product["PicURL"] = get_full_url(response.url, product["PicURL"])

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            # Scroll manually first: the automatic scroll done by click does
            # not work for some pages.
            browser.scroll(200)
            reviews_selector = browser.click("//a[@id='tabRating']")

            for review in self._parse_reviews(reviews_selector, browser,
                                              product):
                yield review
示例#4
0
    def parse(self, response):
        """Build a ProductItem with BeautifulSoup, then collect its reviews.

        Product fields are read from the static HTML.  The internal id is the
        second-to-last ``/``-separated segment of the URL.  After the product
        and the ``get_rm_kidval`` result are yielded, the page is opened in a
        SeleniumBrowser; if the static response contains a "see all reviews"
        link it is clicked before ``_parse_reviews`` runs.

        :param response: response of the product page.
        """
        all_review_button_xpath = "//a[contains(@class,'seeAllReviews')]"
        soup = BeautifulSoup(response.body, "lxml")

        def itemprop_text(prop):
            # Stripped text of the first <span itemprop=prop>.
            return soup.find('span', {'itemprop': prop}).text.strip()

        product = ProductItem()
        product['source_internal_id'] = response.url.split('/')[-2].strip()
        product['ProductName'] = itemprop_text('name')
        product['ProductManufacturer'] = itemprop_text('manufacturer')

        breadcrumb = soup.find('ul', {'class': 'Breadcrumb-list'})
        crumbs = breadcrumb.find_all('span', {'itemprop': 'title'})
        product['OriginalCategoryName'] = ' > '.join(
            [crumb.text.strip() for crumb in crumbs])

        preview = soup.find('img',
                            {'class': 'js-ProductVisuals-imagePreview'})
        product['PicURL'] = preview['src'].strip()
        product['TestUrl'] = response.url

        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            if response.xpath(all_review_button_xpath):
                selector = browser.click(all_review_button_xpath)
            for review in self._parse_reviews(selector, product, browser):
                yield review
示例#5
0
    def parse_product(self, response):
        """Yield the product (plus an MPN id when available) and its reviews.

        The product name is the manufacturer followed by the MPN when one is
        found, otherwise by the ``itemprop="name"`` heading.  An MPN product
        id item is yielded only in the first case.  Reviews are read from the
        "read-reviews" link through a SeleniumBrowser; the browser and product
        are handed to ``parse_reviews`` via ``response.meta``.

        :param response: product page response; ``meta['category']`` must
            hold the category item.
        """
        def first(xpath):
            # Shortcut for extracting a single value from the response.
            return self.extract(response.xpath(xpath))

        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        product["ProductManufacturer"] = first('//a[@class="brand"]/text()')
        product['PicURL'] = first('//meta[@property="og:image"]/@content')
        product['source_internal_id'] = first(
            '//div[@id="pdpFRdivMain"]/@data-productid')

        mpn = first(
            '//dt[@data-cerberus="txt_pdp_sizetitle"]/parent::dl/dd[not(contains(text(),"Taille"))]/text()')
        if not mpn:
            name = first('//h2[@itemprop="name"]/text()')
            product['ProductName'] = product["ProductManufacturer"] + ' ' + name
        else:
            product['ProductName'] = product["ProductManufacturer"] + ' ' + mpn
            mpn_id = self.product_id(product)
            mpn_id['ID_kind'] = "MPN"
            mpn_id['ID_value'] = mpn
            yield mpn_id
        yield product

        review_url = get_full_url(
            response, first('//a[@class="read-reviews"]/@href'))
        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(review_url, timeout=10)

            # Context consumed by parse_reviews.
            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector,
                                             incremental=True):
                yield review
示例#6
0
    def parse(self, response):
        """Request every review not yet stored, then follow pagination.

        The listing lives inside the ``mainframe`` iframe, so the page is
        loaded in a SeleniumBrowser and the frame is selected first.  Each
        review URL must match ``magazine/<n>/<n>/<n>/<id>/``; URLs that do not
        match are printed and skipped.  A ``Request`` for ``parse_review`` is
        yielded for every id that ``is_product_in_db_by_sii`` reports as
        missing.  The next page is requested only when at least one such new
        review was found on this page.

        :param response: response of the review listing page.
        """
        iframe_xpath = "//iframe[@id='mainframe']"
        review_url_xpath = "//div[@class='title']/a/@href"
        next_page_xpath = "//a[@class='next_page']/@href"
        # Raw string so the \d escapes reach the regex engine untouched.
        review_re = re.compile(r'magazine/\d+/\d+/\d+/(\d+)/')
        continue_next_page = False

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.switch_to_frame(iframe_xpath)

            review_urls = self.extract_list(selector.xpath(review_url_xpath))

            for review_url in review_urls:
                match = review_re.search(review_url)
                if not match:
                    # Function-call form works on both Python 2 and 3.
                    print(review_url)
                    continue
                source_internal_id = match.group(1)
                if not is_product_in_db_by_sii(self.mysql_manager,
                                               self.spider_conf["source_id"],
                                               source_internal_id):
                    continue_next_page = True
                    review_url = get_full_url(response.url, review_url)
                    request = Request(review_url, callback=self.parse_review)
                    request.meta['source_internal_id'] = source_internal_id
                    yield request

            if continue_next_page:
                next_page = self.extract(selector.xpath(next_page_xpath))
                next_page = get_full_url(response.url, next_page)
                if next_page:
                    yield Request(next_page, callback=self.parse)
示例#7
0
    def parse(self, response):
        """Scroll the page to its end and request every entry-title link.

        The page is opened in a SeleniumBrowser and scrolled with
        ``scroll_until_the_end``; the response attached to the returned
        selector replaces the incoming one.  One ``Request`` with ``level_2``
        as callback is yielded per link found.

        :param response: response of the listing page.
        """
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No wait condition is configured for this level.
            wait_for = None
            browser.get(response.url)
            selector = browser.scroll_until_the_end(2000, wait_for)
            response = selector.response

        urls_xpath = "//h3[contains(@class,'entry-title')]/a/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            yield Request(get_full_url(original_url, single_url),
                          callback=self.level_2)
示例#8
0
    def parse(self, response):
        """Load all articles via "more-articles" and request each of them.

        The ``more-articles`` link is clicked until ``click_link`` raises; the
        response attached to the last successful click (or the incoming
        response when no click succeeded) is scanned for article links.  One
        ``Request`` with ``level_2`` as callback is yielded per link.

        :param response: response of the article stream page.
        """
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No wait condition is configured for this level.
            wait_for = None
            browser.get(response.url)
            while True:
                try:
                    selector = browser.click_link(
                        "//a[@class='more-articles']", wait_for)
                    response = selector.response
                except Exception:
                    # A failing click is taken as "nothing more to load".
                    # Exception (not a bare except) lets KeyboardInterrupt and
                    # SystemExit propagate.
                    break

        urls_xpath = "//div[@class='article-stream-container']//a[img[@class=' article_wrap']]/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            yield Request(get_full_url(original_url, single_url),
                          callback=self.level_2)
示例#9
0
 def parse_sub_category(self, response):
     """Open every level-3 category in its own browser and parse it.

     For each ``level3`` link a new SeleniumBrowser session loads the
     absolute category URL with the ``||reviews*299|reviews*298`` suffix
     appended, and the items produced by ``parse_category`` are yielded.

     :param response: response of the sub-category page.
     """
     level3_links = response.xpath('//li[@class="level3"]/a/@href')
     for href in self.extract_list(level3_links):
         absolute_url = get_full_url(response, href)
         with SeleniumBrowser(self, response) as browser:
             listing = browser.get(
                 absolute_url + '||reviews*299|reviews*298')
             for item in self.parse_category(browser, listing):
                 yield item
示例#10
0
    def parse(self, response):
        """Request every product linked from the Selenium-rendered page.

        Product URLs are taken from the ``ng-href`` attribute of the links in
        each ``media-body`` block and requested with ``parse_product`` as
        callback.

        :param response: response of the listing page.
        """
        with SeleniumBrowser(self, response) as browser:
            rendered = browser.get(response.url)
            hrefs = rendered.xpath('//div[@class="media-body"]/a/@ng-href')

            for href in self.extract_list(hrefs):
                yield Request(url=href, callback=self.parse_product)
示例#11
0
 def parse(self, response):
     """Delegate to ``_parse``, reusing a browser from ``meta`` if present.

     When ``response.meta`` has a ``'browser'`` key that browser is passed
     on as is; otherwise a SeleniumBrowser session is opened for the
     duration of the iteration.

     :param response: response whose ``meta`` may carry a browser.
     """
     if 'browser' not in response.meta:
         with SeleniumBrowser(self, response) as own_browser:
             for request in self._parse(response, own_browser):
                 yield request
         return

     shared_browser = response.meta['browser']
     for request in self._parse(response, shared_browser):
         yield request
示例#12
0
    def parse(self, response):
        """Yield a product and a review for every blog post, page by page.

        The page is opened in a SeleniumBrowser and scrolled to its end.
        Every ``BlogPosting`` article is re-parsed as a standalone selector
        and turned into a ``ProductItem`` and a ``ReviewItem``.  The product
        name is the part of the title before " review" (case-insensitive), or
        the whole title when that pattern does not match.

        While a ``load-more-posts`` element exists it is clicked.  Crawling
        stops when the element is gone or when the headline of the second
        article is the same before and after the click.

        :param response: response of the blog listing page.
        """
        article_xpath = '//article[@itemtype="http://schema.org/BlogPosting"]'
        load_more_xpath = '//div[@id="load-more-posts"]'
        second_headline_xpath = '//article[@itemtype="http://schema.org/BlogPosting" and position()=2]//h2[@itemprop="headline"]/a/text()'

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)

            while True:
                selector = browser.scroll_until_the_end(5000)

                for review_text in selector.xpath(article_xpath).extract():
                    # Fresh selector so the absolute XPaths below only see
                    # this one article.
                    review_section = Selector(text=review_text)
                    product = ProductItem()
                    review = ReviewItem()

                    product['OriginalCategoryName'] = "Miscellaneous"
                    review['DBaseCategoryName'] = "PRO"

                    title = self.extract(review_section.xpath(
                        '//h2[@itemprop="headline"]/a/text()'))
                    review['TestTitle'] = title

                    review['TestUrl'] = self.extract(review_section.xpath(
                        '//h2[@itemprop="headline"]/a/@href'))
                    product['TestUrl'] = review['TestUrl']

                    review['Author'] = self.extract(review_section.xpath(
                        '//span[@itemprop="author"]/a/text()'))

                    if title:
                        matches = re.search("^(.*?) review", title,
                                            re.IGNORECASE)
                        product_name = matches.group(1) if matches else title
                        review['ProductName'] = product_name
                        product['ProductName'] = product_name

                    review["TestDateText"] = self.extract(
                        review_section.xpath('//time/@datetime'))

                    review['TestSummary'] = self.extract_all(
                        review_section.xpath(
                            '//div[@class="block-excerpt"]/div[@itemprop="articleBody"]/*/text()'),
                        separator=" ")

                    product['PicURL'] = self.extract(review_section.xpath(
                        '//div[@class="block-image"]/a/img/@src'))

                    yield product
                    yield review

                if not self.extract(selector.xpath(load_more_xpath)):
                    break

                pre_click_headline = self.extract(
                    selector.xpath(second_headline_xpath))
                # Compare against the selector returned by the click: reading
                # the pre-click selector again would always give the same
                # headline and end the crawl after the first page.
                clicked_selector = browser.click(load_more_xpath)
                post_click_headline = self.extract(
                    clicked_selector.xpath(second_headline_xpath))
                if pre_click_headline == post_click_headline:
                    break
示例#13
0
    def parse(self, response):
        """Emit product id, category, product and review items for a page.

        Must be used on product pages only.  The category path is built from
        the breadcrumb links after the "home" link, excluding the last one,
        joined with ``' | '``.  After the static items are yielded, the page
        is opened in a SeleniumBrowser, the rating link is clicked and the
        items from ``parse_reviews`` are yielded; the browser, product and
        product id are passed to it through ``response.meta``.

        :param response: response of the product page.
        """
        category_xpaths = {
            "category_leaf":
            "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a[last()-1]/text()"
        }
        category_path_xpath = "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a/text()"

        product_xpaths = {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "source_internal_id": "//form[@id='productSheet']/@data-product",
            "ProductName": "//div[@itemprop='name']/h1/text()",
            "ProductManufacturer": "//*[@class='nameBrand']/text()"
        }
        # Drop the last breadcrumb entry from the path.
        category_path_selector = response.xpath(category_path_xpath)[:-1]

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        category["category_path"] = self.extract_all(category_path_selector,
                                                     separator=' | ')
        # Debug output; the function-call form works on Python 2 and 3.
        print(category)

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product["OriginalCategoryName"] = category["category_path"]

        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "conforama_fr_id"
        product_id['ID_value'] = product["source_internal_id"]
        yield product_id

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        reviews_xpath = "//a[@id='rating']"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.click(reviews_xpath)

            # Context consumed by parse_reviews.
            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector=selector):
                yield review
示例#14
0
    def parse(self, response):
        """Request every report category, carrying login cookies along.

        Cookies come from ``response.meta['cookies']``; when absent or empty
        a SeleniumBrowser (images and CSS enabled) is opened and
        ``login_selenium`` provides them.  Each category footer link is
        requested with ``parse_category_leafs`` as callback and the cookies
        stored in the request meta.

        :param response: response of the category overview page.
        """
        cookies = response.meta.get('cookies')
        if not cookies:
            with SeleniumBrowser(self, response, no_images=False,
                                 no_css=False) as browser:
                cookies = self.login_selenium(browser)

        footer_link_xpath = "//footer[@class='report-category__footer']/a/@href"
        for href in self.extract_list_xpath(response, footer_link_xpath):
            request = Request(get_full_url(response, href),
                              callback=self.parse_category_leafs)
            request.meta['cookies'] = cookies
            yield request
示例#15
0
    def parse(self, response):
        """Emit category, product, MPN/EAN id and review items for a page.

        Must be used on product pages only.  The internal id is the first
        group of ``self.source_internal_id_re`` matched against the URL and is
        only assigned when the pattern matches.  An MPN and an EAN
        ``ProductIdItem`` are yielded when the corresponding span has a
        value.  Finally the page is opened in a SeleniumBrowser and the items
        from ``_parse_reviews`` are yielded.

        :param response: response of the product page.
        """
        category_xpaths = {
            "category_leaf": "//*[@id='moreFrom-catLink']/a/text()",
            "category_path": "//*[@id='moreFrom-catLink']/a/text()"
        }

        product_xpaths = {
            "PicURL": "(//li[@class='productImageItem'])[1]//img/@src",
            "ProductName": "//h1[@class='productHeading']//text()",
            "ProductManufacturer": "//h1[@class='productHeading']/text()"
        }

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        match = re.search(self.source_internal_id_re, response.url)
        if match:
            product['source_internal_id'] = match.group(1)
        product["OriginalCategoryName"] = category["category_path"]
        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        # The id is only assigned above when the URL matched, so indexing the
        # item directly could raise KeyError; .get() yields None instead.
        # NOTE(review): assumes the product item offers a mapping-style
        # .get(), and that id items without an internal id are acceptable
        # downstream -- confirm.
        source_internal_id = product.get('source_internal_id')

        id_sources = (
            ("MPN", "//span[@id='productMPN']/text()"),
            ("EAN", "//span[@id='productEAN']/text()"),
        )
        for id_kind, id_xpath in id_sources:
            id_value = self.extract(response.xpath(id_xpath))
            if not id_value:
                continue
            product_id = ProductIdItem()
            product_id['source_internal_id'] = source_internal_id
            product_id['ProductName'] = product["ProductName"]
            product_id['ID_kind'] = id_kind
            product_id['ID_value'] = id_value
            yield product_id

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            for review in self._parse_reviews(selector, browser, product):
                yield review
示例#16
0
    def parse(self, response):
        """Click "more" until ``stop_scraping`` says stop, then request news.

        Starting from a selector over the plain response, the ``more-button``
        element is clicked in a SeleniumBrowser until ``stop_scraping``
        returns a true value for the current selector.  Every news-item title
        link of the final selector is requested with ``parse_review`` as
        callback.

        :param response: response of the news listing page.
        """
        selector = Selector(response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            while True:
                if self.stop_scraping(selector):
                    break
                selector = browser.click("//*[@class='more-button']")

        title_links = selector.xpath('//h3[@class="news-item-title"]/a/@href')
        for href in self.extract_list(title_links):
            yield Request(get_full_url(response, href),
                          callback=self.parse_review)
示例#17
0
    def parse_product(self, response):
        """Yield the product, its id items and its reviews.

        The product name is the first run of characters of the heading that
        contains no parenthesis.  An MPN id item is yielded when a "Mfg" value
        exists; a ``cdw_id`` id item is always yielded.  Reviews are read
        through a SeleniumBrowser, with the browser, product and product id
        handed to ``parse_reviews`` via ``response.meta``.

        :param response: product page response; ``meta['category']`` must
            hold the category item.
        """
        def first(xpath):
            # Shortcut for extracting a single value from the response.
            return self.extract(response.xpath(xpath))

        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        heading = first('//h1/span/text()')
        product['ProductName'] = re.findall(r'[^()]+', heading)[0]
        picture = first('//div[@class="main-image"]/img/@src')
        if picture:
            product['PicURL'] = get_full_url(response, picture)
        product['ProductManufacturer'] = first('//span[@class="brand"]/text()')
        product['source_internal_id'] = first(
            '//body[@id="MasterPageBodyTag"]/@data-productcode')
        yield product

        mpn = first('//span[contains(text(),"Mfg")]/span/text()')
        if mpn:
            mpn_id = self.product_id(product)
            mpn_id['ID_kind'] = "MPN"
            mpn_id['ID_value'] = mpn
            yield mpn_id

        product_id = self.product_id(product)
        product_id['ID_kind'] = "cdw_id"
        product_id['ID_value'] = product['source_internal_id']
        yield product_id

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url, timeout=10)

            # Context consumed by parse_reviews.
            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            reviews = self.parse_reviews(response, selector, incremental=True)
            for review in reviews:
                yield review
示例#18
0
    def parse(self, response):
        """Yield the fixed "Cell Phones" category and one request per phone.

        Product links are read from the page rendered in a SeleniumBrowser
        (images and CSS enabled); links whose href contains ``movil/tarjeta``
        are excluded by the XPath.  Each request is built with
        ``selenium_request``, uses ``parse_product`` as callback and carries
        the category in its meta.

        :param response: response of the phone listing page.
        """
        category = CategoryItem()
        category['category_path'] = "Cell Phones"
        yield category

        product_page_xpath = "//li[contains(@class, 'parentTerminal')]//a[contains(@class, 'btn') and not(contains(@href, 'movil/tarjeta'))]/@href"

        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            rendered = browser.get(response.url)
            for href in self.extract_list_xpath(rendered, product_page_xpath):
                request = self.selenium_request(
                    get_full_url(response.url, href),
                    callback=self.parse_product)
                request.meta['category'] = category
                yield request
示例#19
0
    def parse(self, response):
        """Emit category, product and review items for a product page.

        Must be used on product pages only.  The category path joins all
        breadcrumb texts with ``' | '``.  ``source_internal_id`` defaults to
        ``None`` and is replaced by the first group of
        ``self.source_internal_id_re`` when it matches the ``identifier``
        meta content.  Reviews are collected after clicking the
        "customer reviews" link in a SeleniumBrowser.

        :param response: response of the product page.
        """
        breadcrumb_text_xpath = "(//div[@class='breadcrumb']//a/span)//text()"

        category = self.init_item_by_xpaths(response, "category", {
            "category_leaf":
            "(//div[@class='breadcrumb']//a/span)[last()]//text()",
        })
        category["category_path"] = self.extract_all(
            response.xpath(breadcrumb_text_xpath), separator=' | ')

        product = self.init_item_by_xpaths(response, "product", {
            "PicURL":
            "(//*[@property='og:image'])[1]/@content",
            "ProductName":
            "//h1[contains(@class, 'page-title')]/span//text()",
            "ProductManufacturer":
            "//h1[contains(@class,'page-title')]/span[@itemprop='brand']/text()",
        })
        product["OriginalCategoryName"] = category["category_path"]
        product["source_internal_id"] = None

        identifier = self.extract(
            response.xpath("//meta[@itemprop='identifier']/@content"))
        id_match = re.match(self.source_internal_id_re, identifier)
        if id_match:
            product["source_internal_id"] = id_match.group(1)

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            reviews_selector = browser.click(
                "//a[contains(text(),' customer reviews')]")

            for review in self._parse_reviews(reviews_selector, browser,
                                              product):
                yield review
示例#20
0
    def parse_product(self, response):
        """Yield items for every colour/size combination of a product.

        In one SeleniumBrowser session (images and CSS enabled) each colour
        URL is loaded, each size input found on that page is selected by
        clicking its label, and the items produced by ``_parse_product`` for
        the resulting selector are yielded.

        :param response: response of the product page.
        """
        color_url_xpath = "//fieldset[contains(@class, 'color-picker')]/ul/li/a/@href"
        size_id_xpath = "//fieldset[contains(@class, 'size-picker')]//input/@id"
        single_size_xpath = "//fieldset[contains(@class, 'size-picker')]//input[@id='%s']/following-sibling::label[1]"

        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            landing = browser.get(response.url)

            for color_href in self.extract_list_xpath(landing,
                                                      color_url_xpath):
                color_page = browser.get(
                    get_full_url(response.url, color_href))

                for size_id in self.extract_list_xpath(color_page,
                                                       size_id_xpath):
                    size_selector = browser.click(single_size_xpath % size_id)
                    variant_items = self._parse_product(response, browser,
                                                        size_selector)
                    for item in variant_items:
                        yield item
示例#21
0
    def parse_category(self, response):
        """Follow the "all products" link, or parse the product list itself.

        When the page has an ``INTRO_Link`` anchor, a ``selenium_request`` to
        it is yielded with this same callback and the current category, and
        nothing else is done.  Otherwise a SeleniumBrowser (CSS and images
        enabled) is opened, ``login_selenium`` is run and the items from
        ``parse_product_list_div`` are yielded.

        :param response: category page response; ``meta['category']`` is read
            when the "all products" link is followed.
        """
        all_products_xpath = "//a[@data-selector='INTRO_Link']/@href"
        all_products_url = self.extract(response.xpath(all_products_xpath))
        all_products_url = get_full_url(response.url, all_products_url)

        if all_products_url:
            request = self.selenium_request(all_products_url,
                                            callback=self.parse_category)
            request.meta['category'] = response.meta["category"]
            yield request
            return

        with SeleniumBrowser(self, response, no_css=False,
                             no_images=False) as browser:
            self.login_selenium(browser)
            for item in self.parse_product_list_div(response, browser):
                yield item
示例#22
0
    def parse_product(self, response):
        """Yield product items, expanding select options and colour variants.

        * No variant select and no colour links: ``_parse_product`` is run on
          the plain response.
        * Colour links only: ``parse_color_variants`` is run with a
          SeleniumBrowser (images and CSS enabled).
        * Variant select present: every option value is selected in the
          browser; the resulting page is handed to ``parse_color_variants``
          when it lists colour links, otherwise to ``_parse_product``.  An
          empty option value ends the whole generator.

        :param response: response of the product page.
        """
        select_xpath = "//*[contains(@class, 'product-stage')]" \
                 "//select[contains(@class, 'select_to')]"
        select_values_xpath = "//*[contains(@class, 'product-stage')]" \
                        "//select[contains(@class, 'select_to')]/option/@value"
        color_variant_url_xpath = "//li[contains(@class, 'product-stage__colors__color')]/a/@href"

        color_variants = self.extract_list_xpath(response,
                                                 color_variant_url_xpath)
        select = response.xpath(select_xpath)

        # Plain product: no browser needed.
        if not (select or color_variants):
            for item in self._parse_product(response):
                yield item
            return

        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            browser.get(response.url)

            # Colour links without a variant select.
            if not select:
                for item in self.parse_color_variants(
                        response, color_variants, browser):
                    yield item
                return

            option_values = self.extract_list_xpath(response,
                                                    select_values_xpath)
            for value in option_values:
                if not value:
                    return
                selector = browser.select_by_value(select_xpath, value)
                color_variants = self.extract_list_xpath(
                    selector, color_variant_url_xpath)
                if color_variants:
                    variant_items = self.parse_color_variants(
                        response, color_variants, browser=browser)
                else:
                    variant_items = self._parse_product(response, selector)
                for item in variant_items:
                    yield item
示例#23
0
    def parse(self, response):
        """Scroll the landing feed to its end and request every linked page.

        The page is opened in a SeleniumBrowser and scrolled with
        ``scroll_until_the_end``; the response attached to the returned
        selector replaces the incoming one.  One ``Request`` with ``level_2``
        as callback is yielded per link, carrying the ``product`` and
        ``review`` values from the incoming meta (empty dicts when absent).

        :param response: response of the landing page.
        """
        original_url = response.url
        product = response.meta.get("product", {})
        review = response.meta.get("review", {})

        with SeleniumBrowser(self, response) as browser:
            # No wait condition is configured for this level.
            wait_for = None
            browser.get(response.url)
            selector = browser.scroll_until_the_end(2000, wait_for)
            response = selector.response

        urls_xpath = "//div[@class='landing-feed--special-content']/a/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            request = Request(get_full_url(original_url, single_url),
                              callback=self.level_2)
            # Plain dict assignments; no exception handling is needed here.
            request.meta["product"] = product
            request.meta["review"] = review
            yield request
示例#24
0
    def parse(self, response):
        """Emit product id, product and review items for a product page.

        Must be used on product pages only.  Nothing is yielded unless
        ``self.source_internal_id_re`` matches the path of the URL; its first
        group becomes the internal id.  Reviews are collected after clicking
        the review link in a SeleniumBrowser, with the browser, product and
        product id handed to ``parse_reviews`` via ``response.meta``.

        :param response: response of the product page.
        """
        product = self.init_item_by_xpaths(response, "product", {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "ProductName": "//h1/span[@itemprop='name']/text()",
            "ProductManufacturer": "(//h1/span[@itemprop='name']/text())[1]",
        })

        url_path = urlparse(response.url).path
        id_match = re.search(self.source_internal_id_re, url_path)
        if not id_match:
            return

        product["source_internal_id"] = id_match.group(1)

        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "richersounds_id"
        product_id['ID_value'] = product["source_internal_id"]

        yield product_id
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            reviews_selector = browser.click(
                "//h4/a[contains(@href,'review')]")

            # Context consumed by parse_reviews.
            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response,
                                             selector=reviews_selector):
                yield review
示例#25
0
    def parse(self, response):
        """Emit category, product and review items for a product page.

        Must be used on product pages only.  The category path joins all
        breadcrumb link texts with ``" | "``.  The picture URL is made
        absolute and the product name is prefixed with the manufacturer.
        Reviews are collected after clicking the ``reviewLinks`` anchor in a
        SeleniumBrowser.

        :param response: response of the product page.
        """
        breadcrumb_text_xpath = "//ul[@id='breadcrumb']/li/a/text()"

        category = self.init_item_by_xpaths(response, "category", {
            "category_leaf": "(//ul[@id='breadcrumb']/li/a)[last()]/text()",
        })
        category["category_path"] = self.extract_all(
            response.xpath(breadcrumb_text_xpath), separator=" | ")

        product = self.init_item_by_xpaths(response, "product", {
            "PicURL":
            "//div[@id='productImage']/img/@src",
            "ProductName":
            "//div[@typeof='v:Product']/h1/text()",
            "ProductManufacturer":
            "//div[@typeof='v:Product']/h1/span[@property='v:brand']/text()",
        })
        product["PicURL"] = get_full_url(response, product["PicURL"])
        product["OriginalCategoryName"] = category["category_path"]
        name_parts = (product['ProductManufacturer'], product["ProductName"])
        product["ProductName"] = "%s %s" % name_parts

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            reviews_selector = browser.click("//a[@class='reviewLinks']")

            for review in self._parse_reviews(reviews_selector, browser,
                                              product):
                yield review
示例#26
0
    def parse(self, response):
        """Emit the category, product and reviews found on a product page.

        Yields, in order: the category item, the product item, the result of
        ``self.get_rm_kidval`` and then every item produced by
        ``self._parse_reviews``. The reviews are reached by clicking two
        elements in a Selenium browser, one after the other.
        """
        breadcrumb_text_xpath = "//a[@class='klickpfad']/text()"

        category = self.init_item_by_xpaths(response, "category", {
            "category_leaf": "(//a[@class='klickpfad'])[last()]//text()"
        })
        breadcrumb_nodes = response.xpath(breadcrumb_text_xpath)
        category["category_path"] = self.extract_all(breadcrumb_nodes,
                                                     separator=' | ')

        product = self.init_item_by_xpaths(response, "product", {
            "PicURL": "//div[@id='big_handy_img']/img/@src",
            "ProductName": "//h1/span[@itemprop='name']/text()",
            "ProductManufacturer": "//h1/span[@itemprop='brand']//text()"
        })
        product["OriginalCategoryName"] = category["category_path"]
        # The final product name is "<manufacturer> <name>".
        name_parts = (product["ProductManufacturer"], product["ProductName"])
        product["ProductName"] = "%s %s" % name_parts

        for item in (category, product):
            yield item
        yield self.get_rm_kidval(product, response)

        # Clicked in this order; only the selector returned by the second
        # click is parsed.
        open_reviews_xpath = "//a[@id='ekomi_button']"
        show_all_reviews_xpath = "//span[@id='lade_bewertungen']"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            browser.click(open_reviews_xpath)
            reviews_selector = browser.click(show_all_reviews_xpath)

            for review in self._parse_reviews(reviews_selector, browser,
                                              product):
                yield review
示例#27
0
    def parse(self, response):
        """Request every review linked from a paginated review listing.

        For the current page each link matched by the listing XPath is
        resolved against the URL of the initial response and yielded as a
        ``Request`` handled by ``self.level_2``. The "next" pager link is
        then clicked in a Selenium browser and the loop repeats on the
        response returned by that click.

        Pagination is best-effort: any ordinary exception raised while
        extracting links or clicking the pager ends the loop silently.
        """
        original_url = response.url
        urls_xpath = "//ul[@class='review-listing']//h3/a/@href"
        next_page_xpath = ("//ul[contains(@class,'pager')]"
                           "//a[contains(text(),'next')]")
        # No wait condition is configured for this listing.
        wait_for = None

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            while True:
                try:
                    urls = self.extract_list(response.xpath(urls_xpath))
                    for single_url in urls:
                        full_url = get_full_url(original_url, single_url)
                        yield Request(full_url, callback=self.level_2)

                    selector = browser.click_link(next_page_xpath, wait_for)
                    response = selector.response
                except Exception:
                    # Typically the pager link is missing on the last page.
                    # ``Exception`` (not a bare ``except``) lets
                    # KeyboardInterrupt, SystemExit and GeneratorExit
                    # propagate instead of being swallowed here.
                    break
示例#28
0
    def level_5(self, response):
        """Yield a ``ReviewItem`` for every review container on the page.

        The containers of the initial response are processed first. The
        "load more" button is then clicked in a Selenium browser and the
        response returned by each click is processed the same way, until
        the click fails.

        Every review gets ``TestUrl`` set to the URL of the initial
        response, ``DBaseCategoryName`` set to ``"USER"`` and
        ``SourceTestScale`` set to ``"5"``. A non-empty ``TestDateText`` is
        passed through ``date_format``.
        """
        original_url = response.url
        load_more_xpath = "//button[contains(@class, 'loadmore')]"
        containers_xpath = ".//div[@itemprop='review']"
        # (item field, XPath evaluated on the review container), in the
        # order the fields are filled. The ProductName XPath is absolute,
        # so it is not limited to the container's subtree.
        field_xpaths = (
            ('ProductName',
             "//a[contains(@class, 'breadcrumb-link')]/@title"),
            ('SourceTestRating',
             ".//meta[@itemprop='ratingValue']/@content"),
            ('TestDateText',
             ".//span[contains(@class, 'review-date')]//text()"),
            ('TestSummary', ".//p[@itemprop='description']/text()"),
            ('Author', ".//span[contains(@class, 'user-nickname')]/text()"),
            ('TestTitle', ".//h4[@itemprop='name']/text()"),
        )

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)

            while True:
                # NOTE(review): every pass re-reads all containers of the
                # current response; if "load more" appends to the list,
                # earlier reviews are emitted again -- confirm duplicates
                # are handled downstream.
                for review_container in response.xpath(containers_xpath):
                    review = ReviewItem()
                    for field, xpath in field_xpaths:
                        review[field] = self.extract(
                            review_container.xpath(xpath))
                    review['TestUrl'] = original_url

                    # NOTE(review): ``product`` is not defined anywhere in
                    # this method. Unless a module-level ``product`` exists
                    # this lookup raises NameError and the override is a
                    # no-op -- confirm whether it should come from
                    # ``response.meta``.
                    try:
                        review['ProductName'] = product['ProductName']
                        review['source_internal_id'] = product[
                            'source_internal_id']
                    except Exception:
                        pass

                    review["DBaseCategoryName"] = "USER"
                    review["SourceTestScale"] = "5"

                    if review["TestDateText"]:
                        review["TestDateText"] = date_format(
                            review["TestDateText"], "%d-%m-%Y", ["en"])

                    yield review

                try:
                    selector = browser.click_link(load_more_xpath, None)
                    response = selector.response
                except Exception:
                    # The click failed (presumably no further "load more"
                    # button), so pagination ends. ``Exception`` keeps
                    # KeyboardInterrupt and SystemExit propagating.
                    break
示例#29
0
 def __init__(self, *a, **kw):
     """Initialise the base spider and attach a Selenium browser to it."""
     # All positional and keyword arguments are forwarded unchanged.
     AlaCrawlSpider.__init__(self, *a, **kw)
     # The browser is constructed with this spider as its only argument
     # and kept on the instance as ``self.browser``.
     self.browser = SeleniumBrowser(self)
示例#30
0
class SnapdealSpider(AlaCrawlSpider):
    """Crawl snapdeal.com category listings for products and their reviews.

    ``rules`` extracts category links from the sitemap in ``start_urls``
    and sends them to ``parse_category``, which scrolls the listing in a
    Selenium browser and requests every product URL. ``parse_product``
    then emits the reviews of the product followed by the product itself.
    """
    name = 'snapdeal'
    download_delay = 2
    start_urls = ['http://www.snapdeal.com/page/sitemap']

    def __init__(self, *a, **kw):
        """Initialise the base spider and create the Selenium browser."""
        AlaCrawlSpider.__init__(self, *a, **kw)
        # One browser per spider instance, used by the callbacks below.
        self.browser = SeleniumBrowser(self)

    def process_category_link(value):
        """Return ``value`` with the ``?sort=plrty&`` query appended.

        Takes no ``self`` on purpose: it is referenced as a plain function
        from the class body (``process_value`` in ``rules`` below) and is
        not meant to be called on an instance.
        """
        return value + "?sort=plrty&"

    rules = [
        Rule(
            LxmlLinkExtractor(
                unique=True,
                allow=['/products/mobiles-mobile-phones'],
                # Further category patterns, currently disabled:
                #       '/products/mobiles-tablets',
                #       '/products/cameras-digital-cameras',
                #       '/products/cameras-digital-slrs'
                restrict_xpaths=['//*[@class="ht180"]//li//*'],
                process_value=process_category_link),
            callback="parse_category")
    ]

    def parse_category(self, response):
        """Emit the category item and one request per listed product.

        Categories whose breadcrumb path is in ``self.skip_categories``
        produce nothing. Otherwise the listing is opened in the browser,
        scrolled to the end, and every product link found is requested
        with ``parse_product`` as callback and the category in
        ``request.meta['category']``.
        """
        category_path_xpath = '//*[@class="containerBreadcrumb"]//span/text()'
        category_leaf_xpath = '//*[@class="active-bread"]/text()'

        clickable_element = '//*[contains(@class,"list-view-lang")]'
        loading_icon_xpath = '//*[@id="ajax-loader-icon" and @class="mar_20per_left"]'

        product_list_xpath = '//*[@id="prodDetails"]/@href'

        # NOTE(review): here extract/extract_all receive (response, xpath),
        # while parse_product and parse_review pass the result of
        # ``response.xpath(...)`` -- confirm both call shapes are supported
        # by the base class.
        category_xpath = self.extract_all(response, category_path_xpath, '|')

        if category_xpath not in self.skip_categories:
            category = CategoryItem()
            category["category_path"] = category_xpath
            category["category_leaf"] = self.extract(response,
                                                     category_leaf_xpath)
            category["category_url"] = response.url
            yield category

            # Load the listing and wait until the list-view element is
            # clickable; the selector returned here is not needed.
            wait_for = EC.element_to_be_clickable(
                (By.XPATH, clickable_element))
            self.browser.get(response, wait_for)

            # Scroll to the end, waiting for the loading icon to become
            # invisible; the resulting selector holds the product links.
            wait_for = EC.invisibility_of_element_located(
                (By.XPATH, loading_icon_xpath))
            selector = self.browser.scroll_until_the_end(2000, wait_for)

            products = selector.xpath(product_list_xpath)

            for product in products:
                product_url = product.extract()
                request = Request(product_url, callback=self.parse_product)
                request.meta['category'] = category
                yield request

    def parse_product(self, response):
        """Build the product item and emit its reviews, then the product.

        The category stored in ``response.meta['category']`` supplies
        ``OriginalCategoryName``. The product is yielded after all of its
        reviews.
        """
        category = response.meta['category']
        product_name_xpath = '//*[@class="productTitle"]//*[@itemprop="name"]/text()'
        brand_xpath = '//*[@itemprop="brand"]//*[@itemprop="name"]/text()'
        pic_url_xpath = '//*[@class="mainImageSlider"]//*[@itemprop="image"]/@src'
        source_internal_id_xpath = '//*[@id="pppid"]/text()'

        product = ProductItem()
        product["source_internal_id"] = self.extract(
            response.xpath(source_internal_id_xpath))
        product["ProductName"] = self.extract(
            response.xpath(product_name_xpath))
        product["OriginalCategoryName"] = category['category_path']
        product["PicURL"] = self.extract(response.xpath(pic_url_xpath))
        product["ProductManufacturer"] = self.extract(
            response.xpath(brand_xpath))
        product["TestUrl"] = response.url

        for review in self.parse_reviews(response, product):
            yield review

        yield product

    def parse_reviews(self, response, product):
        """Yield a review item for every review on this and later pages.

        ``response`` only needs an ``xpath`` method for the extraction
        itself: on recursion it is the selector returned by the browser
        click on the "next page" link.
        """
        reviews_xpath = '//*[@class="pr-review-wrap"]'
        next_page_xpath = '//*[@class="pr-pagination-top"]//*[@class="pr-page-next"]/a'
        review_elements = response.xpath(reviews_xpath)

        for review_element in review_elements:
            yield self.parse_review(review_element, product)

        next_page = self.extract(response.xpath(next_page_xpath))
        if next_page:
            # Load the page in the browser and wait for the "next" link to
            # be clickable before clicking it.
            ec_condition = EC.element_to_be_clickable(
                (By.XPATH, next_page_xpath))
            self.browser.get(response, ec_condition)

            ec_condition = EC.presence_of_all_elements_located(
                (By.XPATH, '//*[@class="pr-contents-wrapper"]'))
            selector = self.browser.click(next_page_xpath, ec_condition)
            for review in self.parse_reviews(selector, product):
                yield review

    @staticmethod
    def _format_review_date(raw_date):
        """Convert a ``dd/mm/YYYY`` string to ``YYYY-mm-dd``.

        Returns an empty string when ``raw_date`` is empty or cannot be
        parsed, so that one review with a missing or malformed date no
        longer aborts the whole review generator (and with it the product
        that ``parse_product`` yields after the reviews).
        """
        if not raw_date:
            return ''
        try:
            parsed = datetime.strptime(raw_date.strip(), "%d/%m/%Y")
            return parsed.strftime('%Y-%m-%d')
        except (TypeError, ValueError):
            return ''

    def parse_review(self, response, product):
        """Build one ``ReviewItem`` from a single review element.

        ``response`` is the selector of one review; all XPaths are relative
        to it. Product id, product name and ``TestUrl`` are copied from
        ``product``.
        """
        author_xpath = './/*[@class="prReviewAuthorProfileLnk"]/span/text()'
        rating_xpath = './/*[@class="pr-rating pr-rounded"]/text()'
        title_xpath = './/*[@class="pr-review-rating-headline"]/text()'
        verdict_xpath = './/*[@class="pr-review-bottom-line-wrapper"]/text()'
        date_xpath = './/*[contains(@class,"pr-review-author-date")]/text()'
        summary_xpath = './/*[@class="pr-comments"]'

        review = ReviewItem()
        review["source_internal_id"] = product["source_internal_id"]
        review["ProductName"] = product["ProductName"]
        review["SourceTestRating"] = self.extract(response.xpath(rating_xpath))
        extracted_date = self.extract(response.xpath(date_xpath))
        # Tolerant conversion: empty when the date is missing or malformed.
        review["TestDateText"] = self._format_review_date(extracted_date)
        review["TestSummary"] = self.extract(response.xpath(summary_xpath))
        review["TestVerdict"] = self.extract(response.xpath(verdict_xpath))
        review["Author"] = self.extract(response.xpath(author_xpath))
        review["DBaseCategoryName"] = "USER"
        review["TestTitle"] = self.extract(response.xpath(title_xpath))
        review["TestUrl"] = product["TestUrl"]
        return review