Exemplo n.º 1
0
    def parse(self, response):
        """Extract category items from the grouped category list and crawl
        each leaf category's product listing.

        Yields CategoryItem objects plus whatever parse_categories produces.
        """
        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)

            category_path = selector.xpath(
                '//ul[contains(@class, "grouplist__group")]')
            for categories in category_path:
                path = self.extract(
                    categories.xpath(
                        './li[@class="grouplist__group__title"]/text()'))
                leaves = categories.xpath(
                    './li[@class="grouplist__group__item"]')
                # Reset per group; the original leaked `category` from the
                # previous group (or raised NameError on the first empty one).
                category = None
                for leaf in leaves:
                    category = CategoryItem()
                    category['category_leaf'] = self.extract(
                        leaf.xpath('.//text()'))
                    category['category_path'] = (
                        'Home > ' + path + ' > ' + category['category_leaf'])
                    category['category_url'] = get_full_url(
                        response.url, self.extract(leaf.xpath('./a/@href')))
                    yield category

                if category is None:
                    # Group without leaf items: nothing to crawl.
                    continue
                try:
                    for item in self.parse_categories(browser,
                                                      category['category_url'],
                                                      category):
                        yield item
                except Exception:
                    # Best-effort: a failing category page must not abort the
                    # whole crawl (was a bare `except: pass`).
                    pass
Exemplo n.º 2
0
    def level_4(self, response):
        """Click the "load more results" button until it disappears, then
        request every product page found on the fully expanded listing.
        """
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No explicit wait condition is configured for this page
            # (the original built `wait_for` from an always-empty xpath).
            wait_for = None
            browser.get(response.url)
            while True:
                try:
                    selector = browser.click_link(
                        "//button[@id='load-more-results']", wait_for)
                    response = selector.response
                except Exception:
                    # Button gone or click failed: all results are loaded.
                    break

        urls_xpath = "//div[@class='product-info']/a/@href"
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            # The original had a dead `if "":` regex branch here; removed.
            single_url = get_full_url(original_url, single_url)
            yield Request(single_url, callback=self.level_5)
Exemplo n.º 3
0
    def parse(self, response):
        """Build category/product items from the product page, then open the
        rating tab in a real browser and yield the reviews found there."""
        breadcrumb = "(//ul[@class='ulNavigationBreadcrumb']/li/a)[last()]/text()"
        category_xpaths = {
            "category_leaf": breadcrumb,
            "category_path": breadcrumb,
        }
        product_xpaths = {
            "PicURL": "(//ul[@class='thumbsBox']/li/a)[1]/@href",
            "ProductName": "//h1[@itemprop='name']/text()",
        }

        category = self.init_item_by_xpaths(response, "category", category_xpaths)
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product["OriginalCategoryName"] = category["category_path"]
        product["PicURL"] = get_full_url(response.url, product["PicURL"])

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            # click auto scroll does not work for some; scroll manually first.
            browser.scroll(200)
            selector = browser.click("//a[@id='tabRating']")

            for review in self._parse_reviews(selector, browser, product):
                yield review
Exemplo n.º 4
0
    def parse(self, response):
        """Scrape product data with BeautifulSoup, then use Selenium to expand
        and parse the review section."""
        soup = BeautifulSoup(response.body, "lxml")

        product = ProductItem()
        product['source_internal_id'] = response.url.split('/')[-2].strip()
        product['ProductName'] = soup.find(
            'span', {'itemprop': 'name'}).text.strip()
        product['ProductManufacturer'] = soup.find(
            'span', {'itemprop': 'manufacturer'}).text.strip()

        # Breadcrumb entries joined into "A > B > C".
        breadcrumb = soup.find('ul', {'class': 'Breadcrumb-list'})
        crumbs = [span.text.strip()
                  for span in breadcrumb.find_all('span', {'itemprop': 'title'})]
        product['OriginalCategoryName'] = ' > '.join(crumbs)

        product['PicURL'] = soup.find(
            'img', {'class': 'js-ProductVisuals-imagePreview'})['src'].strip()
        product['TestUrl'] = response.url

        yield product
        yield self.get_rm_kidval(product, response)

        all_review_button_xpath = "//a[contains(@class,'seeAllReviews')]"
        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            if response.xpath(all_review_button_xpath):
                selector = browser.click(
                    "//a[contains(@class,'seeAllReviews')]")
            for review in self._parse_reviews(selector, product, browser):
                yield review
Exemplo n.º 5
0
    def parse_product(self, response):
        """Build a ProductItem (MPN-based name when available) and then fetch
        the review page through Selenium to parse reviews incrementally."""
        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        product["ProductManufacturer"] = self.extract(
            response.xpath('//a[@class="brand"]/text()'))
        product['PicURL'] = self.extract(
            response.xpath('//meta[@property="og:image"]/@content'))
        product['source_internal_id'] = self.extract(
            response.xpath('//div[@id="pdpFRdivMain"]/@data-productid'))

        mpn = self.extract(response.xpath(
            '//dt[@data-cerberus="txt_pdp_sizetitle"]/parent::dl/dd[not(contains(text(),"Taille"))]/text()'))
        if not mpn:
            # No manufacturer part number: fall back to the page title.
            name = self.extract(response.xpath('//h2[@itemprop="name"]/text()'))
            product['ProductName'] = product["ProductManufacturer"] + ' ' + name
        else:
            product['ProductName'] = product["ProductManufacturer"] + ' ' + mpn
            product_id = self.product_id(product)
            product_id['ID_kind'] = "MPN"
            product_id['ID_value'] = mpn
            yield product_id
        yield product

        review_url = self.extract(
            response.xpath('//a[@class="read-reviews"]/@href'))
        review_url = get_full_url(response, review_url)
        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(review_url, timeout=10)

            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector, incremental=True):
                yield review
Exemplo n.º 6
0
    def parse(self, response):
        """Crawl review links out of the magazine iframe; follow pagination
        only while new (not-yet-stored) reviews keep appearing."""
        iframe_xpath = "//iframe[@id='mainframe']"
        review_url_xpath = "//div[@class='title']/a/@href"
        review_re = r'magazine/\d+/\d+/\d+/(\d+)/'
        continue_next_page = False
        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.switch_to_frame(iframe_xpath)

            next_page_xpath = "//a[@class='next_page']/@href"
            review_urls = self.extract_list(selector.xpath(review_url_xpath))

            for review_url in review_urls:
                match = re.search(review_re, review_url)
                if not match:
                    # Unexpected URL shape; report it and skip.
                    # (was a Py2 `print` statement)
                    print(review_url)
                    continue
                source_internal_id = match.group(1)
                # Only follow reviews whose product is not already stored.
                if not is_product_in_db_by_sii(self.mysql_manager,
                                               self.spider_conf["source_id"],
                                               source_internal_id):
                    continue_next_page = True
                    review_url = get_full_url(response.url, review_url)
                    request = Request(review_url, callback=self.parse_review)
                    request.meta['source_internal_id'] = source_internal_id
                    yield request

            if continue_next_page:
                next_page = self.extract(selector.xpath(next_page_xpath))
                next_page = get_full_url(response.url, next_page)
                if next_page:
                    yield Request(next_page, callback=self.parse)
Exemplo n.º 7
0
    def parse(self, response):
        """Scroll the listing to the bottom to trigger lazy loading, then
        request every article entry found."""
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No explicit wait condition is configured for this page
            # (the original built `wait_for` from an always-empty xpath).
            browser.get(response.url)
            selector = browser.scroll_until_the_end(2000, None)
            response = selector.response

        urls_xpath = "//h3[contains(@class,'entry-title')]/a/@href"
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            # The original had a dead `if "":` regex branch here; removed.
            single_url = get_full_url(original_url, single_url)
            yield Request(single_url, callback=self.level_2)
Exemplo n.º 8
0
    def parse(self, response):
        """Keep clicking the "more articles" link until it disappears, then
        request every article on the fully expanded stream."""
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            # No explicit wait condition is configured for this page
            # (the original built `wait_for` from an always-empty xpath).
            browser.get(response.url)
            while True:
                try:
                    selector = browser.click_link(
                        "//a[@class='more-articles']", None)
                    response = selector.response
                except Exception:
                    # Link gone or click failed: stream fully expanded.
                    break

        urls_xpath = "//div[@class='article-stream-container']//a[img[@class=' article_wrap']]/@href"
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            # The original had a dead `if "":` regex branch here; removed.
            single_url = get_full_url(original_url, single_url)
            yield Request(single_url, callback=self.level_2)
Exemplo n.º 9
0
 def parse_sub_category(self, response):
     """For every level-3 category link, open the category (with the review
     filters appended to the URL) in Selenium and parse its products."""
     sub_category_xpath = '//li[@class="level3"]/a/@href'
     for url in self.extract_list(response.xpath(sub_category_xpath)):
         full_url = get_full_url(response, url)
         with SeleniumBrowser(self, response) as browser:
             selector = browser.get(full_url + '||reviews*299|reviews*298')
             for item in self.parse_category(browser, selector):
                 yield item
Exemplo n.º 10
0
    def parse(self, response):
        """Collect product links from the listing page and schedule
        parse_product for each of them."""
        with SeleniumBrowser(self, response) as browser:
            page = browser.get(response.url)
            url_xpath = '//div[@class="media-body"]/a/@ng-href'
            for url in self.extract_list(page.xpath(url_xpath)):
                yield Request(url=url, callback=self.parse_product)
Exemplo n.º 11
0
 def parse(self, response):
     """Delegate to _parse, reusing a browser handed down via meta when one
     is present, otherwise opening (and closing) a dedicated browser."""
     if 'browser' in response.meta:
         for request in self._parse(response, response.meta['browser']):
             yield request
     else:
         with SeleniumBrowser(self, response) as browser:
             for request in self._parse(response, browser):
                 yield request
Exemplo n.º 12
0
    def parse(self, response):
        """Scroll the blog in a Selenium browser, extracting a product and a
        review item from every BlogPosting article, and keep clicking the
        "load more posts" button until it stops producing new articles."""
        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            keep_going = True

            while keep_going:
                selector = browser.scroll_until_the_end(5000)

                for review_text in selector.xpath('//article[@itemtype="http://schema.org/BlogPosting"]').extract():
                    # Re-wrap the article HTML so the xpaths below are scoped
                    # to this single article only.
                    review_section = Selector(text=review_text)
                    product = ProductItem()
                    review = ReviewItem()

                    product['OriginalCategoryName'] = "Miscellaneous"
                    review['DBaseCategoryName'] = "PRO"

                    review['TestTitle'] = self.extract(review_section.xpath('//h2[@itemprop="headline"]/a/text()'))

                    review['TestUrl'] = self.extract(review_section.xpath('//h2[@itemprop="headline"]/a/@href'))
                    product['TestUrl'] = review['TestUrl']

                    review['Author'] = self.extract(review_section.xpath('//span[@itemprop="author"]/a/text()'))

                    if review['TestTitle']:
                        # Titles shaped "<product name> review ...": use the
                        # part before "review" as the product name, otherwise
                        # fall back to the whole title.
                        matches = re.search("^(.*?) review", review['TestTitle'], re.IGNORECASE)
                        if matches:
                            review['ProductName'] = matches.group(1)
                            product['ProductName'] = matches.group(1)
                        else:
                            review['ProductName'] = review['TestTitle']
                            product['ProductName'] = review['TestTitle']

                    review["TestDateText"] = self.extract(review_section.xpath('//time/@datetime'))

                    review['TestSummary'] = self.extract_all(review_section.xpath('//div[@class="block-excerpt"]/div[@itemprop="articleBody"]/*/text()'), separator=" ")

                    product['PicURL'] = self.extract(review_section.xpath('//div[@class="block-image"]/a/img/@src'))

                    yield product
                    yield review

                if self.extract(selector.xpath('//div[@id="load-more-posts"]')):
                    # Compare the headline of the second article before and
                    # after the click: if it did not change, the click loaded
                    # nothing new and we can stop paging.
                    pre_click_headline = self.extract(selector.xpath('//article[@itemtype="http://schema.org/BlogPosting" and position()=2]//h2[@itemprop="headline"]/a/text()'))
                    browser.click('//div[@id="load-more-posts"]')
                    post_click_headline = self.extract(selector.xpath('//article[@itemtype="http://schema.org/BlogPosting" and position()=2]//h2[@itemprop="headline"]/a/text()'))
                    if pre_click_headline == post_click_headline:
                        keep_going = False
                else:
                    # No "load more" button left: end of the stream.
                    keep_going = False
Exemplo n.º 13
0
    def parse(self, response):
        """Product-page parser: emit category, product and internal-id items,
        then open the rating tab with Selenium and yield parsed reviews.

        Must be used only on product pages.
        """
        category_xpaths = {
            "category_leaf":
            "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a[last()-1]/text()"
        }
        category_path_xpath = "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a/text()"

        product_xpaths = {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "source_internal_id": "//form[@id='productSheet']/@data-product",
            "ProductName": "//div[@itemprop='name']/h1/text()",
            "ProductManufacturer": "//*[@class='nameBrand']/text()"
        }
        # Drop the last breadcrumb entry (the product itself) from the path.
        category_path_selector = response.xpath(category_path_xpath)
        category_path_selector = category_path_selector[:-1]

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        category["category_path"] = self.extract_all(category_path_selector,
                                                     separator=' | ')

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product["OriginalCategoryName"] = category["category_path"]

        product_id = ProductIdItem()
        product_id['source_internal_id'] = product["source_internal_id"]
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "conforama_fr_id"
        product_id['ID_value'] = product["source_internal_id"]
        yield product_id

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        reviews_xpath = "//a[@id='rating']"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.click(reviews_xpath)

            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector=selector):
                yield review
Exemplo n.º 14
0
    def parse(self, response):
        """Log in through Selenium (unless cookies were handed down via meta)
        and schedule every report-category page with those cookies attached."""
        cookies = response.meta.get('cookies', None)
        if not cookies:
            with SeleniumBrowser(self, response, no_images=False,
                                 no_css=False) as browser:
                cookies = self.login_selenium(browser)

        cat_url_xpath = "//footer[@class='report-category__footer']/a/@href"
        for cat_url in self.extract_list_xpath(response, cat_url_xpath):
            request = Request(get_full_url(response, cat_url),
                              callback=self.parse_category_leafs)
            request.meta['cookies'] = cookies
            yield request
Exemplo n.º 15
0
    def parse(self, response):
        """Product-page parser: emit category/product items plus MPN and EAN
        ids when present, then parse reviews through Selenium.

        Must be used only on product pages.
        """
        category_xpaths = {
            "category_leaf": "//*[@id='moreFrom-catLink']/a/text()",
            "category_path": "//*[@id='moreFrom-catLink']/a/text()"
        }

        product_xpaths = {
            "PicURL": "(//li[@class='productImageItem'])[1]//img/@src",
            "ProductName": "//h1[@class='productHeading']//text()",
            "ProductManufacturer": "//h1[@class='productHeading']/text()"
        }

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        # Default to None so the id items below never raise a KeyError when
        # the URL does not match the expected pattern (original bug).
        product['source_internal_id'] = None
        match = re.search(self.source_internal_id_re, response.url)
        if match:
            product['source_internal_id'] = match.group(1)
        product["OriginalCategoryName"] = category["category_path"]
        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        # MPN and EAN extraction were two identical copy-pasted stanzas.
        for kind, xpath in (("MPN", "//span[@id='productMPN']/text()"),
                            ("EAN", "//span[@id='productEAN']/text()")):
            id_value = self.extract(response.xpath(xpath))
            if id_value:
                id_item = ProductIdItem()
                id_item['source_internal_id'] = product["source_internal_id"]
                id_item['ProductName'] = product["ProductName"]
                id_item['ID_kind'] = kind
                id_item['ID_value'] = id_value
                yield id_item

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url)
            for review in self._parse_reviews(selector, browser, product):
                yield review
Exemplo n.º 16
0
    def parse(self, response):
        """Keep pressing the "more" button until stop_scraping says enough,
        then schedule every collected review URL."""
        selector = Selector(response)
        more_button_xpath = "//*[@class='more-button']"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            while not self.stop_scraping(selector):
                selector = browser.click(more_button_xpath)

        for review_url in self.extract_list(
                selector.xpath('//h3[@class="news-item-title"]/a/@href')):
            yield Request(get_full_url(response, review_url),
                          callback=self.parse_review)
Exemplo n.º 17
0
    def parse_product(self, response):
        """Build a ProductItem (name stripped of any parenthesised suffix),
        emit MPN/internal-id items, then parse reviews via Selenium."""
        category = response.meta['category']

        product = ProductItem()
        product['TestUrl'] = response.url
        product['OriginalCategoryName'] = category['category_path']
        name = self.extract(response.xpath('//h1/span/text()'))
        # Take the part before the first parenthesis; guard against an empty
        # or missing <h1> (the original indexed name_match[0] unconditionally
        # and crashed on pages without a name).
        name_match = re.findall(r'[^()]+', name or '')
        if name_match:
            product['ProductName'] = name_match[0]
        pic_url = self.extract(
            response.xpath('//div[@class="main-image"]/img/@src'))
        if pic_url:
            product['PicURL'] = get_full_url(response, pic_url)
        product['ProductManufacturer'] = self.extract(
            response.xpath('//span[@class="brand"]/text()'))
        product['source_internal_id'] = self.extract(
            response.xpath(
                '//body[@id="MasterPageBodyTag"]/@data-productcode'))
        yield product

        mpn = self.extract(
            response.xpath('//span[contains(text(),"Mfg")]/span/text()'))
        if mpn:
            mpn_id = self.product_id(product)
            mpn_id['ID_kind'] = "MPN"
            mpn_id['ID_value'] = mpn
            yield mpn_id

        product_id = self.product_id(product)
        product_id['ID_kind'] = "cdw_id"
        product_id['ID_value'] = product['source_internal_id']
        yield product_id

        with SeleniumBrowser(self, response) as browser:
            selector = browser.get(response.url, timeout=10)

            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response,
                                             selector,
                                             incremental=True):
                yield review
Exemplo n.º 18
0
    def parse(self, response):
        """Emit the top-level "Cell Phones" category, then schedule a Selenium
        request for every terminal product page (prepaid pages excluded)."""
        category = CategoryItem()
        category['category_path'] = "Cell Phones"
        yield category

        product_page_xpath = "//li[contains(@class, 'parentTerminal')]//a[contains(@class, 'btn') and not(contains(@href, 'movil/tarjeta'))]/@href"
        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            selector = browser.get(response.url)
            for product_url in self.extract_list_xpath(selector,
                                                       product_page_xpath):
                request = self.selenium_request(
                    get_full_url(response.url, product_url),
                    callback=self.parse_product)
                request.meta['category'] = category
                yield request
Exemplo n.º 19
0
    def parse(self, response):
        """Product-page parser: emit breadcrumb category and product (with the
        internal id parsed from the identifier meta tag), then click through
        to the customer reviews and yield them.

        Must be used only on product pages.
        """
        category_xpaths = {
            "category_leaf":
            "(//div[@class='breadcrumb']//a/span)[last()]//text()"
        }
        category_path_xpath = "(//div[@class='breadcrumb']//a/span)//text()"
        product_xpaths = {
            "PicURL":
            "(//*[@property='og:image'])[1]/@content",
            "ProductName":
            "//h1[contains(@class, 'page-title')]/span//text()",
            "ProductManufacturer":
            "//h1[contains(@class,'page-title')]/span[@itemprop='brand']/text()"
        }

        category = self.init_item_by_xpaths(response, "category",
                                            category_xpaths)
        category["category_path"] = self.extract_all(
            response.xpath(category_path_xpath), separator=' | ')

        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product["OriginalCategoryName"] = category["category_path"]
        product["source_internal_id"] = None

        source_internal_id_xpath = "//meta[@itemprop='identifier']/@content"
        source_internal_id = self.extract(
            response.xpath(source_internal_id_xpath))
        # extract() can return an empty value when the meta tag is absent;
        # the original passed it straight to re.match and raised TypeError.
        if source_internal_id:
            match = re.match(self.source_internal_id_re, source_internal_id)
            if match:
                product["source_internal_id"] = match.group(1)
        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        reviews_xpath = "//a[contains(text(),' customer reviews')]"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.click(reviews_xpath)

            for review in self._parse_reviews(selector, browser, product):
                yield review
Exemplo n.º 20
0
    def parse_product(self, response):
        """Iterate every colour variant of the product, and within each colour
        click through every size option, parsing the product for each combo."""
        color_url_xpath = "//fieldset[contains(@class, 'color-picker')]/ul/li/a/@href"
        size_id_xpath = "//fieldset[contains(@class, 'size-picker')]//input/@id"
        single_size_xpath = "//fieldset[contains(@class, 'size-picker')]//input[@id='%s']/following-sibling::label[1]"

        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            selector = browser.get(response.url)
            for color_url in self.extract_list_xpath(selector, color_url_xpath):
                selector = browser.get(get_full_url(response.url, color_url))
                for size_id in self.extract_list_xpath(selector, size_id_xpath):
                    selector = browser.click(single_size_xpath % size_id)
                    for item in self._parse_product(response, browser,
                                                    selector):
                        yield item
Exemplo n.º 21
0
    def parse_category(self, response):
        """If the page has an "all products" intro link, re-enter this callback
        on that URL; otherwise log in with Selenium and parse the listing.

        (Removed an unused `products_div_xpath` local and the dead
        commented-out code that referenced it.)
        """
        all_products_xpath = "//a[@data-selector='INTRO_Link']/@href"
        all_products_url = self.extract(response.xpath(all_products_xpath))
        all_products_url = get_full_url(response.url, all_products_url)

        if all_products_url:
            request = self.selenium_request(all_products_url,
                                            callback=self.parse_category)
            request.meta['category'] = response.meta["category"]
            yield request
            return

        with SeleniumBrowser(self, response, no_css=False,
                             no_images=False) as browser:
            self.login_selenium(browser)
            for item in self.parse_product_list_div(response, browser):
                yield item
Exemplo n.º 22
0
    def parse_product(self, response):
        """Parse a product page that may have size (<select>) and/or colour
        variants.

        When variants are present a Selenium browser walks every size value
        and every colour URL, yielding items for each combination; without
        any variants the page is parsed directly.
        """
        select_xpath = "//*[contains(@class, 'product-stage')]" \
                 "//select[contains(@class, 'select_to')]"

        select_values_xpath = "//*[contains(@class, 'product-stage')]" \
                        "//select[contains(@class, 'select_to')]/option/@value"
        color_variant_url_xpath = "//li[contains(@class, 'product-stage__colors__color')]/a/@href"
        color_variants = self.extract_list_xpath(response,
                                                 color_variant_url_xpath)
        select = response.xpath(select_xpath)

        if select or color_variants:
            with SeleniumBrowser(self, response, no_images=False,
                                 no_css=False) as browser:
                browser.get(response.url)
                if select:
                    select_values = self.extract_list_xpath(
                        response, select_values_xpath)

                    for value in select_values:
                        if not value:
                            # NOTE(review): an empty option value aborts the
                            # remaining values too (return, not continue) —
                            # confirm this is intentional.
                            return
                        selector = browser.select_by_value(select_xpath, value)
                        # Colour links can differ per selected size, so they
                        # are re-extracted from the refreshed selector.
                        color_variants = self.extract_list_xpath(
                            selector, color_variant_url_xpath)
                        if color_variants:
                            for item in self.parse_color_variants(
                                    response, color_variants, browser=browser):
                                yield item
                        else:
                            for item in self._parse_product(
                                    response, selector):
                                yield item
                elif color_variants:
                    for item in self.parse_color_variants(
                            response, color_variants, browser):
                        yield item
        else:
            # No variants at all: parse the plain product page.
            for item in self._parse_product(response):
                yield item
Exemplo n.º 23
0
    def parse(self, response):
        """Scroll the landing feed to the end, then request each special
        content article, forwarding product/review context via meta."""
        original_url = response.url
        product = response.meta.get("product", {})
        review = response.meta.get("review", {})

        with SeleniumBrowser(self, response) as browser:
            # No explicit wait condition is configured for this page
            # (the original built `wait_for` from an always-empty xpath).
            browser.get(response.url)
            selector = browser.scroll_until_the_end(2000, None)
            response = selector.response

        urls_xpath = "//div[@class='landing-feed--special-content']/a/@href"
        urls = self.extract_list(response.xpath(urls_xpath))

        for single_url in urls:
            # The original had a dead `if "":` regex branch here; removed.
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            # Plain dict assignments cannot fail; the original wrapped each
            # of these in a pointless bare try/except.
            request.meta["product"] = product
            request.meta["review"] = review
            yield request
Exemplo n.º 24
0
    def parse(self, response):
        """Product-page parser: when the internal id can be taken from the
        URL path, emit id/product items and crawl the review tab via Selenium.

        Must be used only on product pages.
        """
        product_xpaths = {
            "PicURL": "(//*[@property='og:image'])[1]/@content",
            "ProductName": "//h1/span[@itemprop='name']/text()",
            "ProductManufacturer": "(//h1/span[@itemprop='name']/text())[1]"
        }

        product = self.init_item_by_xpaths(response, "product", product_xpaths)

        match = re.search(self.source_internal_id_re, urlparse(response.url).path)
        if not match:
            # Without an internal id nothing is emitted for this page.
            return

        internal_id = match.group(1)
        product["source_internal_id"] = internal_id

        product_id = ProductIdItem()
        product_id['source_internal_id'] = internal_id
        product_id['ProductName'] = product["ProductName"]
        product_id['ID_kind'] = "richersounds_id"
        product_id['ID_value'] = internal_id
        yield product_id
        yield product
        yield self.get_rm_kidval(product, response)

        reviews_xpath = "//h4/a[contains(@href,'review')]"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.click(reviews_xpath)

            response.meta['browser'] = browser
            response.meta['product'] = product
            response.meta['product_id'] = product_id
            response.meta['_been_in_decorator'] = True

            for review in self.parse_reviews(response, selector=selector):
                yield review
Exemplo n.º 25
0
    def parse(self, response):
        """Parse a product page: emit category and product items, then click
        through to the review section with Selenium and yield reviews.

        Must use only product_page.
        """
        breadcrumb_xpath = "//ul[@id='breadcrumb']/li/a/text()"
        category = self.init_item_by_xpaths(
            response, "category",
            {"category_leaf": "(//ul[@id='breadcrumb']/li/a)[last()]/text()"})
        category["category_path"] = self.extract_all(
            response.xpath(breadcrumb_xpath), separator=" | ")

        product = self.init_item_by_xpaths(
            response, "product", {
                "PicURL": "//div[@id='productImage']/img/@src",
                "ProductName": "//div[@typeof='v:Product']/h1/text()",
                "ProductManufacturer":
                "//div[@typeof='v:Product']/h1/span[@property='v:brand']/text()"
            })
        product["PicURL"] = get_full_url(response, product["PicURL"])
        product["OriginalCategoryName"] = category["category_path"]
        # Prefix the brand so the product name is unambiguous downstream.
        product["ProductName"] = "%s %s" % (product['ProductManufacturer'],
                                            product["ProductName"])
        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            # Reviews only render after the review link is clicked.
            selector = browser.click("//a[@class='reviewLinks']")

            for review in self._parse_reviews(selector, browser, product):
                yield review
Exemplo n.º 26
0
    def parse(self, response):
        """Parse a product page: emit category and product items, then open
        the eKomi review widget with Selenium and yield every review.
        """
        breadcrumb_xpath = "//a[@class='klickpfad']/text()"
        category = self.init_item_by_xpaths(
            response, "category",
            {"category_leaf": "(//a[@class='klickpfad'])[last()]//text()"})
        category["category_path"] = self.extract_all(
            response.xpath(breadcrumb_xpath), separator=' | ')

        product = self.init_item_by_xpaths(
            response, "product", {
                "PicURL": "//div[@id='big_handy_img']/img/@src",
                "ProductName": "//h1/span[@itemprop='name']/text()",
                "ProductManufacturer": "//h1/span[@itemprop='brand']//text()"
            })
        product["OriginalCategoryName"] = category["category_path"]
        # Prefix the brand so the product name is unambiguous downstream.
        product["ProductName"] = "%s %s" % (product["ProductManufacturer"],
                                            product["ProductName"])

        yield category
        yield product
        yield self.get_rm_kidval(product, response)

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            # First click opens the eKomi widget, second one loads all reviews.
            browser.click("//a[@id='ekomi_button']")
            selector = browser.click("//span[@id='lade_bewertungen']")

            for review in self._parse_reviews(selector, browser, product):
                yield review
Exemplo n.º 27
0
    def parse(self, response):
        """Crawl a paginated review listing with Selenium and schedule a
        request (handled by ``level_2``) for every review link found.

        Pages are advanced by clicking the "next" pager link; pagination
        ends when that click fails (no more pages, stale element, timeout).
        """
        # Defects fixed vs. original: removed dead code (an `if "":` branch
        # that could never execute, unused `matches`, `params_regex` and
        # `wait_type` locals, and a `wait_for` that was always None) and
        # narrowed the bare `except:` so it no longer swallows
        # KeyboardInterrupt/SystemExit.
        original_url = response.url

        urls_xpath = "//ul[@class='review-listing']//h3/a/@href"
        next_page_xpath = \
            "//ul[contains(@class,'pager')]//a[contains(text(),'next')]"

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            while True:
                try:
                    for single_url in self.extract_list(
                            response.xpath(urls_xpath)):
                        # Resolve relative hrefs against the listing URL.
                        single_url = get_full_url(original_url, single_url)
                        yield Request(single_url, callback=self.level_2)

                    # Advance to the next page; any failure ends pagination.
                    selector = browser.click_link(next_page_xpath, None)
                    response = selector.response
                except Exception:
                    break
Exemplo n.º 28
0
    def level_5(self, response):
        """Scrape user reviews from a product page with Selenium, clicking
        the "load more" button until no further reviews can be loaded, and
        yield a ReviewItem per review container.
        """
        # Defect fixed vs. original: an inner try-block referenced an
        # undefined name `product`, so it always raised NameError and was
        # silently swallowed by a bare `except:` — it has been removed
        # (behavior-identical). The pagination `except:` was narrowed so it
        # no longer swallows KeyboardInterrupt/SystemExit.
        original_url = response.url

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)

            first_time = True
            while True:
                # The first page is already loaded; afterwards each
                # iteration must click "load more" to get fresh reviews.
                if not first_time:
                    try:
                        selector = browser.click_link(
                            "//button[contains(@class, 'loadmore')]", None)
                        response = selector.response
                    except Exception:
                        # No "load more" button left (or click failed): done.
                        break

                first_time = False
                containers = response.xpath(".//div[@itemprop='review']")
                for review_container in containers:
                    review = ReviewItem()

                    review['ProductName'] = self.extract(
                        review_container.xpath(
                            "//a[contains(@class, 'breadcrumb-link')]/@title"))
                    review['SourceTestRating'] = self.extract(
                        review_container.xpath(
                            ".//meta[@itemprop='ratingValue']/@content"))
                    review['TestDateText'] = self.extract(
                        review_container.xpath(
                            ".//span[contains(@class, 'review-date')]//text()"))
                    review['TestSummary'] = self.extract(
                        review_container.xpath(
                            ".//p[@itemprop='description']/text()"))
                    review['Author'] = self.extract(
                        review_container.xpath(
                            ".//span[contains(@class, 'user-nickname')]/text()"))
                    review['TestTitle'] = self.extract(
                        review_container.xpath(
                            ".//h4[@itemprop='name']/text()"))

                    review['TestUrl'] = original_url
                    review["DBaseCategoryName"] = "USER"
                    review["SourceTestScale"] = "5"

                    # Normalize the raw date text when one was extracted.
                    if review["TestDateText"]:
                        review["TestDateText"] = date_format(
                            review["TestDateText"], "%d-%m-%Y", ["en"])

                    yield review
Exemplo n.º 29
0
 def __init__(self, *a, **kw):
     """Initialize the spider and attach a shared Selenium browser.

     Delegates all arguments to AlaCrawlSpider, then creates one
     SeleniumBrowser bound to this spider for reuse across requests.
     """
     AlaCrawlSpider.__init__(self, *a, **kw)
     self.browser = SeleniumBrowser(self)
Exemplo n.º 30
0
    def level_5(self, response):
        """Parse a product page: emit its category, product and product-id
        items, then page through the Yotpo review widget with Selenium and
        yield a ReviewItem for every review found.
        """
        original_url = response.url
        # The breadcrumb trail supplies both the leaf and the full path.
        category_leaf_xpath = "//li[@typeof='v:Breadcrumb'][last()]/a//text()"
        category_path_xpath = "//li[@typeof='v:Breadcrumb']/a//text()"
        category = CategoryItem()
        category['category_url'] = original_url
        category['category_leaf'] = self.extract(
            response.xpath(category_leaf_xpath))
        category['category_path'] = self.extract_all(
            response.xpath(category_path_xpath), ' | ')
        if self.should_skip_category(category):
            return
        yield category

        product_xpaths = {
            "source_internal_id":
            "//span[@ng-bind='product.metadata.partnumber']/text()",
            "ProductName":
            "//div[@class='stp--grid']//*[@ng-bind-html='product.metadata.name']/text()",
            "PicURL": "//div[@id='STP--Product-Image']//img/@src",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        # Best-effort: keep the product even if the category path is missing.
        try:
            product["OriginalCategoryName"] = category['category_path']
        except:
            pass

        # The part number doubles as the SKU; only emit an id item when found.
        id_value = self.extract(
            response.xpath(
                "//span[@ng-bind='product.metadata.partnumber']/text()"))
        if id_value:
            product_id = self.product_id(product)
            product_id['ID_kind'] = "sku"
            product_id['ID_value'] = id_value
            yield product_id

        yield product

        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)

            # Walk the Yotpo pager: the first page needs no click; afterwards
            # keep clicking "next" while the button is still present.
            first_time = True
            while response.xpath(
                    "//span[contains(@class, 'yotpo_next')]") or first_time:
                if not first_time:
                    selector = browser.click_link(
                        "//span[contains(@class, 'yotpo_next')]", None)
                    response = selector.response

                first_time = False
                containers_xpath = "//div[@data-review-id]"
                containers = response.xpath(containers_xpath)
                for review_container in containers:
                    review = ReviewItem()

                    # Page-level fallbacks; overwritten from `product` below
                    # when those keys are available.
                    review['source_internal_id'] = self.extract(
                        response.xpath(
                            "//span[@ng-bind='product.metadata.partnumber']/text()"
                        ))

                    review['ProductName'] = self.extract(
                        review_container.xpath(
                            "//div[@class='stp--grid']//*[@ng-bind-html='product.metadata.name']/text()"
                        ))

                    # Rating is derived by counting the star icons rendered
                    # inside this review container.
                    review['SourceTestRating'] = self.extract(
                        review_container.xpath(
                            "count(.//span[contains(@class, 'yotpo-icon-star')])"
                        ))

                    review['TestDateText'] = self.extract(
                        review_container.xpath(
                            ".//div[contains(@class, 'yotpo-header-element')]//label[contains(@class, 'yotpo-review-date')]/text()"
                        ))

                    review['TestSummary'] = self.extract(
                        review_container.xpath(
                            ".//div[@class='content-review']/text()"))

                    review['Author'] = self.extract(
                        review_container.xpath(
                            ".//div[contains(@class, 'yotpo-header-element')]//label[contains(@class, 'yotpo-user-name')]/text()"
                        ))

                    review['TestTitle'] = self.extract(
                        review_container.xpath(
                            ".//div[contains(@class, 'content-title')]/text()")
                    )

                    review['TestUrl'] = original_url
                    # Best-effort: prefer the product item's values when set.
                    try:
                        review['ProductName'] = product['ProductName']
                        review['source_internal_id'] = product[
                            'source_internal_id']
                    except:
                        pass

                    review["DBaseCategoryName"] = "USER"

                    if review["TestDateText"]:

                        review["TestDateText"] = date_format(
                            review["TestDateText"], "%m/%d/%y", ["en"])

                    review["SourceTestScale"] = "5"

                    yield review