Example #1
    def parse_item(self, response):
        # Map the Amazon offer-listing filter code (set on the request meta)
        # to a readable condition label.
        cate = response.meta['categor']
        condition_labels = {
            "f_new": "New",
            "f_usedLikeNew": "LikeNew",
            "f_usedVeryGood": "VeryGood",
            "f_usedGood": "Good",
        }

        # One row per offer on the seller-listing page.
        url_paths = response.xpath('//div[@class="a-section a-spacing-double-large"]/div[@class="a-row a-spacing-mini olpOffer"]')

        for url_path in url_paths:
            url_buf = url_path.xpath('div[@class="a-column a-span2 olpSellerColumn"]/h3/span/a/@href').extract()
            if url_buf:
                # Yield a fresh item per offer instead of re-using one object.
                item = AmazonscraperItem()
                if cate in condition_labels:
                    item['Category'] = condition_labels[cate]
                item['Url'] = "https://www.amazon.com" + url_buf[0]
                yield item

        # Follow the pagination link, keeping the filter code in the meta.
        next_page = response.xpath('//div[@class="a-text-center a-spacing-large"]/ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').extract()
        if len(next_page) == 1:
            req = self.set_proxies(url=response.urljoin(next_page[0]), callback=self.parse_item)
            req.meta["categor"] = cate
            yield req
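
Both examples call a set_proxies helper that is defined elsewhere in the spider and is not shown on this page. The following is only a rough sketch of what such a helper might look like: the signature is inferred from the call sites, and config.rotateProxy(), config.rotateAgent() and config.proxy_auth are the names Example #2 below already uses; none of this code is taken from the source.

    # Sketch only. Assumes "import scrapy" and the project's config module
    # are available at module level.
    def set_proxies(self, url, callback, headers=None):
        # Build a Request routed through a rotating authenticated proxy with
        # a rotated User-Agent; dont_filter allows re-queueing a URL that was
        # already seen (e.g. after a blocked response).
        req = scrapy.Request(url,
                             callback=callback,
                             headers=headers or {},
                             dont_filter=True)
        req.meta['proxy'] = 'http://{}@{}'.format(config.proxy_auth,
                                                  config.rotateProxy())
        req.headers['User-Agent'] = config.rotateAgent()
        return req

Scrapy's built-in HttpProxyMiddleware picks up the meta['proxy'] value when the request is downloaded, which is why Example #1 can simply attach extra meta keys to the returned request before yielding it.
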
Example #2
    def getData(self, response):
        print "===== Get Data ====="
        # Fallback URL for the error log in the except-block at the bottom.
        itemUrl = response.url

        try:
            # Amazon's bot-detection page: re-queue the URL through a fresh
            # proxy and stop parsing this blocked response.
            if "To discuss automated access to Amazon data please contact" in response.body:
                print "========= > None data"
                req = self.set_proxies(response.url,
                                       self.getData,
                                       headers=self.headers)
                sleep(2)
                yield req
                return

            # Every search result on the page.
            itemPaths = response.xpath(
                '//ul[contains(@class, "s-result-list")]/li[contains(@id, "result")]'
            )
            for cc, element in enumerate(itemPaths):
                print "----------------------------"
                itemUrl = element.xpath(
                    './/a[@class="a-link-normal s-access-detail-page  s-color-twister-title-link a-text-normal"]/@href'
                ).extract_first()
                # Skip results with no detail link and sponsored redirects.
                if not itemUrl or "/gp/slredirect/" in itemUrl:
                    continue

                item = AmazonscraperItem()
                s = requests.Session()
                error_count = 0
                asin = ""
                reviewList = []
                reviewSummary = {}

                customReviewUrl = ""
                while True:
                    s.cookies.clear()

                    agent = config.rotateAgent()
                    proxy = config.rotateProxy()
                    print proxy

                    proxies = {
                        'http': 'http://{}@{}'.format(config.proxy_auth,
                                                      proxy),
                        'https': 'http://{}@{}'.format(config.proxy_auth,
                                                       proxy)
                    }
                    try:
                        res = s.request('GET',
                                        itemUrl,
                                        headers=self.headers,
                                        proxies=proxies)
                        # print itemUrl
                    except:
                        error_count = error_count + 1
                        continue

                    if error_count == 10:
                        break
                    elif res.status_code == 200:
                        if "To discuss automated access to Amazon data please contact" in res.content:
                            print "==========> No none ========="
                            # sleep(2)
                            continue
                        else:
                            r = res.text
                            htmlBody = Selector(text=r)
                            asin = ''.join(
                                htmlBody.xpath('//input[@id="ASIN"]/@value').
                                extract()).strip()
                            if asin != "":
                                break
                            else:
                                print "***** Asin None *****"
                                # print htmlBody
                                error_count = error_count + 1
                                if error_count == 10:
                                    break
                                continue
                    else:
                        # Non-200 response (e.g. throttled): count it and retry.
                        error_count = error_count + 1

                # Give up on this search result after too many failures.
                if error_count == 10:
                    continue

                # Breadcrumb text, collapsed to single-spaced plain text.
                catetxt = htmlBody.xpath(
                    '//ul[@class="a-unordered-list a-horizontal a-size-small"]//text()'
                ).extract()
                cate = re.sub(" +", " ",
                              re.sub(r"\s", " ", ''.join(catetxt)).strip())

                # Only keep products in the Beauty & Personal Care category.
                if "Beauty" not in cate and "Personal" not in cate:
                    continue

                item['Category'] = cate
                # print category

                item['Page_url'] = itemUrl
                item['ASIN'] = asin

                brand = ''.join(
                    htmlBody.xpath('//div[@id="brandBarLogoWrapper"]//img/@alt'
                                   ).extract()).strip()
                if brand == "":
                    brand = ''.join(
                        htmlBody.xpath(
                            '//div[@id="mbc"]/@data-brand').extract()).strip()
                item['Brand_Name'] = brand

                product_name = ''.join(
                    htmlBody.xpath(
                        '//h1[@id="title"]//text()').extract()).strip()
                item['Product_Name'] = product_name
                # print product_name

                rankSummary = {}
                rankingText1 = ''.join(
                    htmlBody.xpath(
                        '//li[@id="SalesRank"]/text()').extract()).strip()
                # print rankingText
                # print "----------------"
                rank1 = rankingText1.replace("#", "").replace(" ()", "")
                rankSummary['category rank'] = rank1

                rankList = []
                paths = htmlBody.xpath('//ul[@class="zg_hrsr"]/li')
                for ele in paths:

                    category = ''.join(
                        ele.xpath('.//span[@class="zg_hrsr_ladder"]//text()').
                        extract()).strip()
                    rankingText2 = ''.join(
                        ele.xpath('.//span[@class="zg_hrsr_rank"]/text()').
                        extract()).strip()
                    rankingText2 = rankingText2.replace("#", "")
                    rank2 = rankingText2 + " " + category
                    rankList.append(rank2)

                rankSummary['sub category rank'] = rankList
                # print ranking
                item['Ranking'] = rankSummary

                price = ''.join(
                    htmlBody.xpath(
                        '//span[contains(@id, "priceblock_")]/text()').extract(
                        )).strip()
                item['Price'] = price
                # print price

                Description = ""
                sentence = htmlBody.xpath(
                    '//div[@id="visual-rich-product-description"]//div[contains(@class, "a-section a-text-left visualRpdColumnSmall")]'
                )
                # print len(sentence)
                for element in sentence:
                    # print "----------------"
                    text = ''.join(
                        element.xpath('.//h4/text()').extract()).strip()

                    if "Description" in text:
                        Description = ''.join(
                            element.xpath(
                                './/span[@class="a-size-small a-color-base visualRpdText"]/text()'
                            ).extract()).strip()
                        item['Description'] = Description
                        # print Description

                    elif "Benefits" in text:
                        Benefits = ''.join(
                            element.xpath(
                                './/span[@class="a-size-small a-color-base visualRpdText"]/text()'
                            ).extract()).strip()
                        item['Benefits'] = Benefits
                        # print Benefits

                    elif "Suggested" in text:
                        Suggested_Use = ''.join(
                            element.xpath(
                                './/span[@class="a-size-small a-color-base visualRpdText"]/text()'
                            ).extract()).strip()
                        item['Suggested_Use'] = Suggested_Use
                        # print Suggested_Use

                if Description == "":
                    Description = ''.join(
                        htmlBody.xpath(
                            '//div[@id="productDescription"]//p/text()').
                        extract()).strip()
                    item['Description'] = Description

                importantInfo = ''.join(
                    htmlBody.xpath(
                        '//div[@class="bucket"]/div[@class="content"]/text()').
                    extract()).strip()
                if importantInfo:
                    item['Important_Info'] = importantInfo

                rating = ''.join(
                    htmlBody.xpath(
                        '//div[@id="reviewSummary"]//span[@class="a-icon-alt"]/text()'
                    ).extract()).strip()
                item['Rating'] = rating
                # print rating

                reviews = ''.join(
                    htmlBody.xpath(
                        '//div[@id="reviewSummary"]//span[@data-hook="total-review-count"]/text()'
                    ).extract()).strip()
                item['Reviews'] = reviews
                # print reviews

                star5 = ''.join(
                    htmlBody.xpath(
                        '//a[@class="a-size-base a-link-normal 5star histogram-review-count"]/text()'
                    ).extract()).strip()
                reviewSummary["5 star"] = star5
                # print star5

                star4 = ''.join(
                    htmlBody.xpath(
                        '//a[@class="a-size-base a-link-normal 4star histogram-review-count"]/text()'
                    ).extract()).strip()
                reviewSummary["4 star"] = star4
                # print star4

                star3 = ''.join(
                    htmlBody.xpath(
                        '//a[@class="a-size-base a-link-normal 3star histogram-review-count"]/text()'
                    ).extract()).strip()
                reviewSummary["3 star"] = star3
                # print star3

                star2 = ''.join(
                    htmlBody.xpath(
                        '//a[@class="a-size-base a-link-normal 2star histogram-review-count"]/text()'
                    ).extract()).strip()
                reviewSummary["2 star"] = star2
                # print star2

                star1 = ''.join(
                    htmlBody.xpath(
                        '//a[@class="a-size-base a-link-normal 1star histogram-review-count"]/text()'
                    ).extract()).strip()
                reviewSummary["1 star"] = star1
                # print star1

                item['ReviewSummary'] = reviewSummary

                print "*****************************************"
                print "Reviews=", reviews
                print "*****************************************"

                reviews = reviews.encode('utf-8')

                if reviews != "":

                    customReviewUrl = ''.join(
                        htmlBody.xpath(
                            '//a[@id="dp-summary-see-all-reviews"]/@href').
                        extract()).strip()
                    # Drop the trailing "ref=..." part so paging parameters
                    # can be appended below.
                    customReviewUrl = re.sub('(ref=.*)', '', customReviewUrl)
                    reviewUrl = self.baseUrl + customReviewUrl + "ref=cm_cr_arp_d_show_all?ie=UTF8&reviewerType=all_reviews&pageNumber=1"
                    # print reviewUrl

                    # Fetch the first page of customer reviews, again rotating
                    # proxies with a retry limit.
                    s = requests.Session()
                    error_count1 = 0
                    reviewitems = []
                    while True:
                        s.cookies.clear()
                        agent = config.rotateAgent()

                        proxy = config.rotateProxy()
                        print proxy
                        proxies = {
                            'http':
                            'http://{}@{}'.format(config.proxy_auth, proxy),
                            'https':
                            'http://{}@{}'.format(config.proxy_auth, proxy)
                        }
                        try:
                            res = s.request('GET',
                                            reviewUrl,
                                            headers=self.headers,
                                            proxies=proxies)
                        except:
                            error_count1 = error_count1 + 1
                            continue

                        if error_count1 == 10:
                            break
                        elif res.status_code == 200:
                            if "To discuss automated access to Amazon data please contact" in res.content:
                                print "==========> No none ========="

                                # sleep(2)
                                continue
                            else:
                                r = res.text
                                htmlText = Selector(text=r)
                                reviewitems = htmlText.xpath(
                                    '//div[@id="cm_cr-review_list"]/div[@class="a-section review"]'
                                )
                                if len(reviewitems) == 0:
                                    print "****** Review Items None ******"
                                    error_count1 = error_count1 + 1
                                    if error_count1 == 10:
                                        break
                                    continue
                                else:
                                    break
                        else:
                            # Non-200 response: count it and retry.
                            error_count1 = error_count1 + 1

                    # Amazon shows 10 reviews per page: work out how many
                    # review pages need to be fetched.
                    total_review = reviews.replace(",", "")
                    total_review_count = int(total_review) // 10
                    total_review_mod = int(total_review) % 10
                    if total_review_mod != 0:
                        total_review_count = total_review_count + 1
                    if total_review_count < 1:
                        total_review_count = 1

                    # print "*****************************************"
                    # print "Review Count ==== >> " ,len(reviewitems)
                    # print "*****************************************"

                    for element in reviewitems:
                        sitem = {}

                        # print "-----------------------"
                        review_rating = ''.join(
                            element.xpath(
                                './/i[@data-hook="review-star-rating"]/span/text()'
                            ).extract()).strip()
                        sitem["review_rating"] = review_rating
                        # print review_rating

                        review_title = ''.join(
                            element.xpath(
                                './/a[@data-hook="review-title"]/text()').
                            extract()).strip()
                        sitem["review_title"] = review_title
                        # print review_title

                        is_verified_purchase = ''.join(
                            element.xpath(
                                './/span[@data-hook="avp-badge"]/text()').
                            extract()).strip()
                        sitem["is_verified_purchase"] = is_verified_purchase
                        # print is_verified_purchase

                        reviewer_name = ''.join(
                            element.xpath(
                                './/a[@data-hook="review-author"]/text()').
                            extract()).strip()
                        sitem["reviewer_name"] = reviewer_name
                        # print reviewer_name

                        review_date = ''.join(
                            element.xpath(
                                './/span[@data-hook="review-date"]/text()').
                            extract()).strip()
                        review_date = review_date.replace("on ", "")
                        sitem["review_date"] = review_date
                        # print review_date

                        review_text = ''.join(
                            element.xpath(
                                './/span[@data-hook="review-body"]/text()').
                            extract()).strip()
                        sitem["review_text"] = review_text
                        # print review_text

                        people_found_usefull = ''.join(
                            element.xpath(
                                './/span[@data-hook="helpful-vote-statement"]/text()'
                            ).extract()).strip()
                        sitem["people_found_usefull"] = people_found_usefull
                        # print people_found_usefull

                        reviewList.append(sitem)
                    # print total_review_count

                    # print "*****************************************"
                    # print "Total Review = ", total_review_count
                    # print "*****************************************"
                    if total_review_count > 1:

                        for page_count in range(2, total_review_count + 1):

                            reviewUrl = self.baseUrl + customReviewUrl + "ref=cm_cr_arp_d_paging_btm_next_" + str(
                                page_count
                            ) + "?ie=UTF8&reviewerType=all_reviews&pageNumber=" + str(
                                page_count)

                            # Fetch this review page with the same
                            # rotating-proxy retry loop.
                            error_count2 = 0
                            s = requests.Session()
                            reviewitems = []
                            while True:
                                s.cookies.clear()
                                agent = config.rotateAgent()
                                # print "++++++++++++++++++++"
                                # print agent
                                # print "++++++++++++++++++++"
                                proxy = config.rotateProxy()
                                proxies = {
                                    'http':
                                    'http://{}@{}'.format(
                                        config.proxy_auth, proxy),
                                    'https':
                                    'http://{}@{}'.format(
                                        config.proxy_auth, proxy)
                                }
                                try:
                                    res = s.request('GET',
                                                    reviewUrl,
                                                    headers=self.headers,
                                                    proxies=proxies)
                                except:
                                    error_count2 = error_count2 + 1
                                    continue

                                if error_count2 == 10:
                                    break

                                elif res.status_code == 200:
                                    if "To discuss automated access to Amazon data please contact" in res.content:
                                        print "==========> No none ========="

                                        # sleep(2)
                                        continue
                                    else:
                                        r = res.text
                                        htmlText = Selector(text=r)
                                        reviewitems = htmlText.xpath(
                                            '//div[@id="cm_cr-review_list"]/div[@class="a-section review"]'
                                        )
                                        if len(reviewitems) == 0:
                                            print "****** Review Items None ******"
                                            error_count2 = error_count2 + 1
                                            if error_count2 == 10:
                                                break
                                            continue
                                        else:
                                            break
                                else:
                                    # Non-200 response: count it and retry.
                                    error_count2 = error_count2 + 1

                            # print "*****************************************"
                            # print "Review Count 2 ==== >> " ,len(reviewitems)
                            # print "*****************************************"

                            for element in reviewitems:
                                sitem = {}

                                # print "-----------------------"
                                review_rating = ''.join(
                                    element.xpath(
                                        './/i[@data-hook="review-star-rating"]/span/text()'
                                    ).extract()).strip()
                                sitem["review_rating"] = review_rating
                                # print review_rating

                                review_title = ''.join(
                                    element.xpath(
                                        './/a[@data-hook="review-title"]/text()'
                                    ).extract()).strip()
                                sitem["review_title"] = review_title
                                # print review_title

                                is_verified_purchase = ''.join(
                                    element.xpath(
                                        './/span[@data-hook="avp-badge"]/text()'
                                    ).extract()).strip()
                                sitem[
                                    "is_verified_purchase"] = is_verified_purchase
                                # print is_verified_purchase

                                reviewer_name = ''.join(
                                    element.xpath(
                                        './/a[@data-hook="review-author"]/text()'
                                    ).extract()).strip()
                                sitem["reviewer_name"] = reviewer_name
                                # print reviewer_name

                                review_date = ''.join(
                                    element.xpath(
                                        './/span[@data-hook="review-date"]/text()'
                                    ).extract()).strip()
                                review_date = review_date.replace("on ", "")
                                sitem["review_date"] = review_date
                                # print review_date

                                review_text = ''.join(
                                    element.xpath(
                                        './/span[@data-hook="review-body"]/text()'
                                    ).extract()).strip()
                                sitem["review_text"] = review_text
                                # print review_text

                                people_found_usefull = ''.join(
                                    element.xpath(
                                        './/span[@data-hook="helpful-vote-statement"]/text()'
                                    ).extract()).strip()
                                sitem[
                                    "people_found_usefull"] = people_found_usefull
                                # print people_found_usefull

                                reviewList.append(sitem)

                    item["Consumer_Reviews"] = reviewList
                    # print customReviewUrl
                yield item
                # print item
                reviewList = []
                reviewSummary = {}
                self.total = self.total + 1
                print "*****************************************"
                print "Total ===== > " + str(self.total)
                print "page_url ===== > " + itemUrl
                print "*****************************************"

                # return
            # test
            # self.page_count = self.page_count + 1
            # if self.page_count==6:
            #     return

            nextUrl = response.xpath(
                '//a[@title="Next Page"]/@href').extract_first()

            if nextUrl:
                nextPage = self.baseUrl + nextUrl
                # print "------------------"
                # print nextPage
                req = self.set_proxies(nextPage,
                                       self.getData,
                                       headers=self.headers)
                self.makeLog(nextPage)
                yield req

        except Exception as e:
            print "******************Except**************"
            print e
            self.makeLog(itemUrl)
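
Example #2 repeats the same rotate-proxy-and-retry fetch three times (product page, first review page, each following review page). Purely as a sketch of how that repetition could be factored out, assuming the same config module, self.headers and Selector import the example already uses (the helper name and signature are invented here, not part of the source):

    def fetch_with_retries(self, url, max_errors=10):
        # Return a parsed Selector for the page, or None once max_errors
        # attempts have failed, been blocked, or come back non-200.
        blocked_marker = "To discuss automated access to Amazon data please contact"
        session = requests.Session()
        errors = 0
        while errors < max_errors:
            session.cookies.clear()
            proxy = config.rotateProxy()
            proxies = {
                'http': 'http://{}@{}'.format(config.proxy_auth, proxy),
                'https': 'http://{}@{}'.format(config.proxy_auth, proxy),
            }
            try:
                res = session.get(url, headers=self.headers, proxies=proxies)
            except requests.RequestException:
                errors += 1
                continue
            if res.status_code != 200 or blocked_marker in res.text:
                errors += 1
                continue
            return Selector(text=res.text)
        return None

Each of the three while True: blocks above would then reduce to something like htmlBody = self.fetch_with_retries(itemUrl) followed by a None check.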