def getReviewList(asin):
    """Return the review IDs of a product in chronological order.

    Walks the 'recent'-sorted review pages from the last page back to
    the first, reversing each page, so the oldest review comes first.
    """
    base = 'http://www.amazon.com/product-reviews/'
    page = MyHtml.getHtml(base + asin, ffhead=True)
    # The second-to-last link inside the paging span is the page count.
    paging = page.xpath('.//span[@class="paging"]')
    if paging:
        lastPage = int(paging[0].xpath('./a')[-2].text.strip())
    else:
        lastPage = 1
    ids = []
    for pageNo in range(lastPage, 0, -1):
        url = base + asin + \
            '?pageNumber={}&sortBy={}'.format(str(pageNo), 'recent')
        # print pageNo, url
        page = MyHtml.getHtml(url, ffhead=True)
        reviewTable = page.xpath('.//table[@id="productReviews"]')[0]
        # Reviews are newest-first within a page; reverse for chronology.
        for reviewDiv in reviewTable.xpath('./tr/td/div')[::-1]:
            anchor = reviewDiv.xpath('./preceding-sibling::a[1]')[0]
            ids.append(anchor.attrib['name'])
    return ids
def getReviewIndex(asin, targetReviewID):
    """Return the 1-based submission-order rank of a review, or -1.

    Scans the review pages sorted oldest-first and counts reviews until
    targetReviewID is found.
    """
    base = 'http://www.amazon.com/product-reviews/'
    page = MyHtml.getHtml(base + asin, ffhead=True)
    # The second-to-last link inside the paging span is the page count.
    paging = page.xpath('.//span[@class="paging"]')
    if paging:
        lastPage = int(paging[0].xpath('./a')[-2].text.strip())
    else:
        lastPage = 1
    rank = 0
    for pageNo in range(1, lastPage + 1):
        url = base + asin + \
            '?pageNumber={}&sortBy={}'.format(str(pageNo), 'bySubmissionDateAscending')
        # print pageNo, url
        page = MyHtml.getHtml(url, ffhead=True)
        reviewTable = page.xpath('.//table[@id="productReviews"]')[0]
        for reviewDiv in reviewTable.xpath('./tr/td/div'):
            rank += 1
            anchor = reviewDiv.xpath('./preceding-sibling::a[1]')[0]
            if anchor.attrib['name'] == targetReviewID:
                return rank
    return -1
def getReviewList(self, asin): reviewsResult = [] totalPage=0 baseUrl = 'http://www.amazon.com/product-reviews/' html = MyHtml.getHtml(baseUrl + asin, ffhead=True) pageNumberList = html.xpath(".//ul[@class='a-pagination']//li") if len(pageNumberList)>0: countOfListItems=len(pageNumberList) indexOfPageTotal=countOfListItems-2 listitem=pageNumberList[indexOfPageTotal] totalPage=int(listitem.xpath('.//a')[0].text.strip()) else: totalPage=1 sortBy = 'recent' pageNumber = 1 foundKnownReview = False while pageNumber <= totalPage: url = baseUrl + asin + \ '?pageNumber={}&sortBy={}'.format(str(pageNumber), sortBy) html = MyHtml.getHtml(url, ffhead=True) isCount=html.xpath('.//div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span') if isCount is not None and len(isCount)>0: countOfReviews=int(isCount[0].text.strip()) else: countofReviews=0 print countOfReviews,"count of reviews" if countOfReviews>0: divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0] divReviewList = divWholeReviewList.xpath('./div[@id]') for divReview in divReviewList: reviewID = divReview.attrib['id'] if (reviewID in reviewsResult) or (reviewID in self.reviewList): foundKnownReview = True break aReview = Review.Review() aReview.reviewID = reviewID Review.saveReview(review=aReview) reviewsResult.append(reviewID) # end of for pageNumber += 1 if foundKnownReview: break else: pageNumber+=1 # end of while return reviewsResult[::-1]
def getComments(self): self.comment = '' if self.numOfComments == 0: return url = 'http://www.amazon.com/review/{0}/'.format(self.reviewID) flag = True while flag: try: html = MyHtml.getHtml(url) commentTexts = html.xpath('.//div[@class="postBody"]/div[3]') for commentText in commentTexts: self.comment += commentText.text.strip().replace('\n', '<br />') self.comment += '<end />' try: url = html.xpath( './/div[@class="cdPageSelectorPagination"]\ /a[text="Next >"]')[0].attrib['href'] except LookupError: flag = False break except Exception, e: sys.stderr.write( 'getComments -> getHtml reviewID: {0} errmsg: {1}\n'.format (self.reviewID, str(e))) flag = False break
def __init__(self, url):
    """Fetch the page at *url* and parse its rankings table.

    solveRankingsTable() and printReviewers() are defined elsewhere in
    the class; presumably they fill and report self.revList — confirm.
    """
    self.url = url
    # Parsed lxml tree of the fetched page.
    self.html = MyHtml.getHtml(url)
    # Reviewer list, populated by solveRankingsTable().
    self.revList = []
    self.solveRankingsTable()
    # self.printData()
    self.printReviewers()
def getProductLinkPages(self): """ get the urls of web pages which contains the list of products reviewers made reviews about, and save those urls to self.pLinkList. This pLinkList will be used in getProductLinks() :return: null """ print "allRevLinks",self.allRevLink if self.allRevLink == '': self.totalPage = 0 return allLinks = MyHtml.getHtml(self.allRevLink, self.rID + "_AllProductLinks_1") if int(self.rNum) % 10 != 0: self.totalPage = int(self.rNum) / 10 + 1 else: self.totalPage = int(self.rNum) / 10 print self.totalPage, 'totalPage' linkHead = self.allRevLink.replace('/ref=pdp_new', '').strip() self.pLinkList.append(self.allRevLink) page = 2 while page <= self.totalPage: tempLink = linkHead + "?ie=UTF8&display=public&page=" + \ str(page) + "&sort_by=MostRecentReview" page = page + 1 self.pLinkList.append(tempLink) del page
def getReviews(asin, numOfReviews=1, fetchDate=date.today(), bookPublishDate=date.today()):
    # Fetch and rank all reviews of *asin*, delegating page parsing to
    # solveReviewPage (helpful order) and solveReviewPage2 (time rank).
    #
    # NOTE(review): the date.today() defaults are evaluated once at
    # import time, not per call — confirm that is intended.
    # NOTE(review): numOfReviews is never used in this body.
    # NOTE(review): reviewsResult is filled by solveReviewPage2 but the
    # function returns None — callers apparently rely on side effects.
    reviewsResult = []
    totalPage=0
    baseUrl = 'http://www.amazon.com/product-reviews/'
    html = MyHtml.getHtml(baseUrl + asin, ffhead=True)
    # Total review count shown in the product-info header.
    countOfReviews=int(html.xpath('.//div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span')[0].text.strip())
    if countOfReviews==0:
        return
    # The second-to-last pagination <li> holds the last page number.
    pageNumberList = html.xpath(".//ul[@class='a-pagination']//li")
    if len(pageNumberList)>0:
        countOfListItems=len(pageNumberList)
        print "countOflIstItems",countOfListItems
        indexOfPageTotal=countOfListItems-2
        listitem=pageNumberList[indexOfPageTotal]
        totalPage=int(listitem.xpath('.//a')[0].text.strip())
    else:
        totalPage=1
    print "totalpage",totalPage
    # Most Helpful First
    sortBy = 'helpful'
    pageNumber = 1
    rank = 0
    while pageNumber <= totalPage:
        url = baseUrl + asin + \
            '?pageNumber={}&sortBy={}'.format(str(pageNumber), sortBy)
        print pageNumber, url, totalPage
        rank = solveReviewPage(asin, rank, url, fetchDate, bookPublishDate)
        # NOTE(review): sortBy is switched to 'recent' AFTER url was
        # built, so solveReviewPage2 below re-parses the same
        # helpful-sorted url on the first pass — verify intent.
        sortBy = 'recent'
        # NOTE(review): maxRank is assigned but never read.
        maxRank = rank
        rank += 1
        rank = solveReviewPage2(reviewsResult, rank, url,fetchDate)
        pageNumber += 1
def __init__(self, rID, hasPhoto='0', linkFlag=False):
    """Build a reviewer object from the Amazon profile page of *rID*.

    Fetches /gp/pdp/profile/<rID>, parses user name and location, then
    loads the reviewer's previously reviewed books and persists the
    object via saveReviewer. Falls back to initEmptyObj() when rID is
    empty or the page cannot be fetched.

    NOTE(review): linkFlag is accepted but never used in this body.
    NOTE(review): self.rNum is read below but set elsewhere —
    presumably by solveProfileUp(); confirm.
    """
    if rID != '':
        self.url = "http://www.amazon.com/gp/pdp/profile/" + rID
        self.html = MyHtml.getHtml(self.url, rID + '_profile', ffhead=True)
        if self.html is None:
            # Fetch failed: initialize an empty placeholder object.
            self.initEmptyObj()
        else:
            self._saveToFile = True
            self.pLinkList = []          # product-list page urls
            self.allProductLinks = []    # parsed product entries
            self.avgRate=0.0             # average rating over reviews
            self.duration=0              # days between first/last review
            self.previousBookPublishDate='N/A'
            self.previousBookReviewDate='N/A'
            self.rRank=''
            self.fRevTime=""             # first review date
            self.lRevTime = ""           # latest review date
            self.counter = 0
            self.aboutMe='N/A'
            self.sum = 0.0               # accumulated rating total
            self.solveProfileUp()
            self.rID = rID
            # User name from the profile header; masked when blank.
            userName = self.html.xpath(".//div[@class='a-section']/h1")[0].text.strip()
            if len(userName) == 0:
                self.username = '******'
                self.rName='N/A'
            else:
                self.username = userName
                self.rName=userName
            # getLocation
            #/html/body/div[3]/div[2]/div/div/div/div[1]/div[2]/span[2]/div/div[1]/span
            span=bool(self.html.xpath(".//div[@class='profile-info']/div[1]/span"))
            if span:
                loc = self.html.xpath(".//div[@class='profile-info']/div[1]/span")[0].text.strip()
                try:
                    self.location=loc
                    print loc
                    if self.location == '':
                        self.location = 'N/A'
                except LookupError:
                    self.location = 'N/A'
            else:
                self.location='N/A'
            self.hasPhoto = hasPhoto
            if rID is not None and len(rID)>0:
                if self.rNum >0:
                    self.getPreviousReviewedBook(rID)
            # Drop the parsed tree before persisting the object.
            del self.html
            saveReviewer(self)
    else:
        self.initEmptyObj()
def __init__(self, asin):
    """Fetch and initialize a product object for *asin*.

    On fetch failure, self.asin is set to None as the error sentinel
    and initialization is aborted.
    """
    self._printWithTag = False
    self.asin = asin
    self.url = getURLFromAsin(asin)
    self.html = MyHtml.getHtml(self.url)
    if self.html is None:
        # Sentinel: callers can test self.asin is None for failure.
        self.asin = None
        sys.stderr.write('getHtml Error: not found\n')
        return
    self.fetchDate = date.today()
    self.tag = []
    # initAttrib() is defined elsewhere; presumably parses the page
    # into attributes — confirm.
    self.initAttrib()
    self.reviewList = []
def getBookAsinList(self):
    """Populate self.bookAsinList from the product-list table.

    Every odd-indexed <tr class='small'> row is skipped; ASINs that are
    empty or start with 'B' are excluded, the rest are loaded via Book
    and kept only if self.checkBook() accepts them.
    """
    self.bookAsinList = []
    page = MyHtml.getHtml(
        self.url, name=self.tag, crawlDate=self.fetchDate)
    productDiv = page.xpath(".//div[@class='productList']")[0]
    rows = productDiv.xpath("./table/tr[@class='small']")
    for idx, row in enumerate(rows):
        if idx % 2 == 1:
            continue
        href = row.xpath("./td[2]/a")[0].attrib['href'].strip()
        asin = Book.getAsinFromUrl(href)
        if asin == '' or asin[0] == 'B':
            continue
        book = Book.loadBookByAsin(asin, self.fetchDate)
        if self.checkBook(book):
            self.bookAsinList.append(asin)
def getBookAsinList(self): self.bookAsinList=[] html=MyHtml.getHtml(self.url,name="NewReleasedBooks",crawlDate=self.fetchDate) divBooksList = html.xpath(".//div[@id='zg_centerListWrapper']")[0] divItemsList=divBooksList.xpath("./div[@class='zg_itemImmersion']") for item in divItemsList: #item=divItemsList[0] #if item: aUrl=item.xpath("./div[2]/div[2]/a")[0].attrib['href'].strip() rank=item.xpath("./div[@class='zg_rankDiv']/span")[0].text.strip() asin=Book.getAsinFromUrl(aUrl) if asin!='': book=Book.loadBookByAsin(asin,self.fetchDate) #if self.checkBook(book,rank): self.bookAsinList.append(asin) print self.bookAsinList
def solveReviewPage2(reviewsResult, rank, url, fetchDate):
    """Assign descending timeRank values to the reviews of one page.

    Each review on the page is loaded, given timeRank = rank-1, rank-2,
    ... in page order, re-saved, and its ID appended to reviewsResult.
    Returns the rank after the last review plus one.

    NOTE(review): fetchDate is currently unused; kept for interface
    compatibility with callers.
    """
    html = MyHtml.getHtml(url, ffhead=True)
    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    # Each review <div> carries its review ID as the @id attribute.
    divReviewList = divWholeReviewList.xpath('./div[@id]')  # [::-1]
    for divReview in divReviewList:
        reviewID = divReview.attrib['id']
        rank -= 1
        aReview = loadReview(reviewID)
        aReview.timeRank = rank
        saveReview(review=aReview)
        reviewsResult.append(aReview.reviewID)
    # end of for
    # BUG FIX: removed a stray 'import traceback' / 'traceback.print_exc()'
    # pair that ran outside any except block, writing "None" noise to
    # stderr on every call.
    rank += 1
    return rank
def update(self):
    """Re-fetch the product page and refresh rating and review count."""
    self.html = MyHtml.getHtml(self.url)
    # update bookRating, numberOfReviews, bookRanking
    self.getReviews()
    centerCols = self.html.xpath("//div[@id='centerCol']")
    if centerCols is not None and len(centerCols) > 0:
        avgBlocks = centerCols[0].xpath(
            ".//div[@id='averageCustomerReviews']")
        try:
            # "X out of 5 stars" -> "X"
            self.rate = avgBlocks[0].xpath(
                "./span")[0].attrib['title'].split(' out of ')[0]
            reviewText = avgBlocks[0].xpath(
                ".//span[@id='acrCustomerReviewText']")[0].text
            self.numOfReviews = reviewText.split(' ')[0]
            # Strip thousands separators etc. before converting.
            self.numOfReviews = int(
                filter(lambda ch: ch.isdigit(), self.numOfReviews))
        except Exception:
            # Any parse failure falls back to zeroed stats.
            self.rate = 0
            self.numOfReviews = 0
        del avgBlocks
    self.solveProductDetails()
def getPreviousReviewedBook(self,reviewerId):
    """Scan the reviewer's member-reviews pages for dates and ratings.

    Side effects on self: sets allRevLink, previousBookReviewDate and
    previousBookPublishDate (from the first row whose edition cell
    contains "Edition"), lRevTime (latest review date, page 1),
    fRevTime (first review date, last page), sum / avgRate / duration.

    NOTE(review): several claims below depend on Amazon's table layout
    and should be verified against a live page.
    """
    initUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId
    self.allRevLink=initUrl
    html=MyHtml.getHtml(initUrl)
    ftable=html.xpath('.//body/table[2]')[0]
    # Last pagination anchor; its text may be a range like "11-20".
    pages=ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
    if pages is not None and len(pages)>0:
        totalPages=pages[0].text.strip()
    else:
        totalPages=1
    strPages=str(totalPages)
    print strPages
    if "-" in strPages:
        # "a-b" range: keep the upper bound as the page count.
        totalPages=totalPages.split('-')
        print "totalPages",totalPages
        totalPages=totalPages[1]
    else:
        totalPages=totalPages
    print totalPages,"totalPages"
    sortBy='MostRecentReview'
    j=1
    self.counter=0
    # flag: set to 1 once the previous reviewed book has been found.
    flag=0
    for j in range(1,(int(totalPages)+1)):
        baseUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId
        baseUrl=baseUrl+ \
            '?pageNumber={}&sortBy={}'.format(str(j), sortBy)
        print baseUrl,j,totalPages
        html=MyHtml.getHtml(baseUrl)
        ftable=html.xpath('.//body/table[2]')[0]
        # One <tr valign="top"> per reviewed product.
        mainTable=ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')
        for row in mainTable:
            if row is not None:
                # Edition cell identifies book rows ("... Edition").
                isBook=row.xpath('./td[5]/table/tr[2]/td/b')
                if isBook is not None and len(isBook)>0:
                    if isBook[0].text is not None and flag==0:
                        edition=isBook[0].text.strip()
                        if "Edition" in edition:
                            print "got the previous book"
                            flag=1
                            # Review date lives in the following sibling row.
                            reviewdate=row.xpath('./following-sibling::*')
                            reviewdate=reviewdate[0].xpath('.//nobr')
                            if reviewdate:
                                reviewdate=reviewdate[0].text.strip()
                                print "got the reviewDate",reviewdate
                                self.previousBookReviewDate=CommonTool.strToDate(reviewdate)
                            #to get link of the previous reviewedbook
                            url=row.xpath('./td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                            asin=Book.getAsinFromUrl(url)
                            previousBook=Book.loadBookByAsin(asin)
                            print "asinofPrevious",asin
                            print "previousBook",previousBook
                            self.previousBookPublishDate=previousBook.publishDate
                # Latest review date: first row of page 1.
                if j==1 and self.lRevTime=='':
                    reviewdate=mainTable[0].xpath('./following-sibling::*')
                    reviewdate=reviewdate[0].xpath('.//nobr')
                    print reviewdate,"lRevtime"
                    if reviewdate:
                        reviewdate=reviewdate[0].text.strip()
                        self.lRevTime=CommonTool.strToDate(reviewdate)
                print "value of j",j
                # First (oldest) review date: last row of the last page.
                if j==int(totalPages) and self.fRevTime=='':
                    print "inside frevtime loop"
                    reviewdate=mainTable[-1].xpath('./following-sibling::*')
                    reviewdate=reviewdate[0].xpath('.//nobr')
                    print reviewdate,"fRevtime"
                    if reviewdate:
                        reviewdate=reviewdate[0].text.strip()
                        self.fRevTime=CommonTool.strToDate(reviewdate)
                # rate
                rateObj = row.xpath("./following-sibling::*")
                rate = 'N/A'
                if len(rateObj) != 0:
                    # Star rating from the img title, "X out of 5 stars".
                    rateObj1 = rateObj[0].xpath(".//img")
                    title = rateObj1[0].attrib['title']
                    rate = title.split("out")[0].strip()
                    self.sum = self.sum + float(rate)
                # NOTE(review): reassigning the for-loop variable j here
                # has no lasting effect — range() rebinds it each pass.
                j=j+1
    #end of inner for loop
    print "sum",self.sum
    # NOTE(review): 'is not 0' is an identity test, not equality — it
    # happens to work for small CPython ints but '!=' is the safe form.
    if self.rNum is not 0:
        self.avgRate=self.sum/self.rNum
        self.avgRate=round(self.avgRate,2)
    if self.lRevTime=='' or self.fRevTime=='':
        duration=0
    else :
        duration = (self.lRevTime-self.fRevTime).days
    self.duration=int(duration)
def getProductLinksUsingThreads(self,productList):
    """Parse each product-list page url in *productList*.

    For every page, extracts (link, rate, reviewID, label) tuples into
    self.allProductLinks and book IDs into self.reviewedBookList;
    captures self.fRevTime on the page matching self.firstLink and
    self.lRevTime on the page matching self.lastLink.

    NOTE(review): despite the name, no threading happens in this body —
    presumably the caller runs it in a thread; confirm.
    """
    print "getting products using thread method"
    for link in productList:
        linkPage = MyHtml.getHtml(
            link, self.rID + "_AllProductLinks_" + str(self.page))
        # One <tr valign='top'> per reviewed product on the page.
        trObjs = linkPage.xpath(
            ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")
        # First-review timestamp comes from the LAST row of the first page.
        if link is self.firstLink:
            try:
                firstTimeObj = trObjs[-1]
            except LookupError:
                # NOTE(review): exit(0) aborts the whole process on an
                # empty page — confirm this hard stop is intended.
                print link
                exit(0)
            fTime = firstTimeObj.xpath("./following-sibling::*")
            if len(fTime) == 0:
                self.fRevTime = 'N/A'
            else:
                # print etree.tostring(fTime[0])
                fTime = fTime[0].xpath(".//nobr")
                if len(fTime) == 0:
                    self.fRevTime = 'N/A'
                else:
                    self.fRevTime = fTime[0].text.strip()
                    self.fRevTime = CommonTool.strToDate(self.fRevTime)
            del fTime
        # Latest-review timestamp comes from the FIRST row of the last page.
        if link is self.lastLink:
            lastTimeObj = trObjs[0]
            lTime = lastTimeObj.xpath("./following-sibling::*")
            if len(lTime) == 0:
                self.lRevTime = 'N/A'
            else:
                # print etree.tostring(lTime[0])
                lTime = lTime[0].xpath(".//nobr")
                if len(lTime) == 0:
                    self.lRevTime = 'N/A'
                else:
                    self.lRevTime = lTime[0].text.strip()
                    self.lRevTime = CommonTool.strToDate(self.lRevTime)
            del lTime
        for trObj in trObjs:
            tableObj = trObj.xpath(
                "./td[@class='small'][3]/table[@class='small']")
            # aLink
            if len(tableObj) != 0:
                aLink = tableObj[0].xpath(".//a")
                if len(aLink) == 0:
                    aLink = ''
                else:
                    aLink = aLink[0].attrib['href']
                # rate
                rateObj = trObj.xpath("./following-sibling::*")
                rate = 'N/A'
                if len(rateObj) != 0:
                    try:
                        # Star rating from img title, "X out of 5 stars".
                        rateObj1 = rateObj[0].xpath(".//img")
                        title = rateObj1[0].attrib['title']
                        rate = title.split("out")[0].strip()
                        self.sum = self.sum + float(rate)
                        self.counter = self.counter + 1
                    except Exception, e:
                        sys.stderr.write(str(e) + ' rate Exception\n')
                # reviewID
                reviewID = ''
                rIDObj = rateObj[0].xpath(".//a")
                if len(rIDObj) != 0:
                    reviewID = rIDObj[0].attrib['name']
                # label The review is from
                label = ''
                labelObj = rateObj[0].xpath(".//div[@class='tiny']")
                if len(labelObj) != 0:
                    # verified purchase + the review is from
                    aObj = labelObj[-1].xpath(".//a")
                    if len(aObj) != 0:
                        label = filtTag.filter_tags(
                            etree.tostring(aObj[0]).strip())
                        # 1---book 0---product
                        parLeft = label.find('(')
                        parRight = label.find(')')
                        if parLeft == -1 and parRight == -1:
                            label = '0'
                        elif label[-1] == ')':
                            # Keep text inside the trailing parentheses.
                            label = label.split('(')
                            label = label[-1][:-1]
                            if (label.find('Paperback') != -1) or (label.find('Hardcover') != -1):
                                label = '1'
                # Last 10 chars of the cleaned product url = ASIN.
                bookID = aLink.replace('/ref=cm_cr-mr-title', '')
                bookID = bookID[-10:]
                self.reviewedBookList.append(bookID)
                # NOTE(review): this shadows the productList parameter;
                # iteration is unaffected (the iterator was already
                # bound), but the rebinding is confusing — consider
                # renaming.
                productList = []
                productList.append(aLink)
                productList.append(rate)
                productList.append(reviewID)
                productList.append(label)
                self.allProductLinks.append(productList)
                del productList
        self.page = self.page + 1
def solveCustomerReview(self):
    """Ensure the product page is loaded, then parse reviews and summary."""
    # Reuse a previously fetched tree when one is already attached.
    if not hasattr(self, 'html'):
        self.html = MyHtml.getHtml(self.url,ffhead=True)
    self.getReviews(self.asin)
    self.solveReviewSummary()
def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one helpful-sorted review page and persist each Review.

    For every review div: assigns helpfulRank (continuing from *rank*),
    extracts helpfulness votes, star rating, title, reviewer ID, dates,
    format, verified-purchase flag, body text and comment count, then
    saves the review. Returns the rank after the last review.
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    html = MyHtml.getHtml(url)
    print "solving Review Page"
    # Total review count from the product-info header.
    countOfReviews=int(html.xpath('.//div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span')[0].text.strip())
    if countOfReviews>0:
        divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
        # Each review <div> carries its review ID as the @id attribute.
        divReviewList = divWholeReviewList.xpath('./div[@id]')
        print divReviewList
        for divReview in divReviewList:
            aReview = Review()
            rank += 1
            aReview.helpfulRank = rank
            aReview.asin = asin
            aReview.reviewID = divReview.attrib['id']
            # helpful line
            # Two possible nestings of the "X of Y people..." span are
            # probed below — presumably different page variants; confirm.
            parentNode=divReview.xpath('.//span[@class="a-size-base cr-vote"]/span[1]/span[1]')
            print "helpful Match",parentNode
            helpfulMatch=None
            if parentNode is not None:
                match1 = divReview.xpath('.//span[@class="a-size-base cr-vote"]/span[1]/span[1]')
                if match1 is not None and len(match1)>0:
                    if match1[0].text is not None:
                        helpfulMatch=hlre.match(match1[0].text.strip())
                else:
                    match2=divReview.xpath('.//span[@class="a-size-base cr-vote"]/span[1]/span[1]/span[1]')
                    if match2 is not None and len(match2)>0:
                        if match2[0].text is not None:
                            helpfulMatch=hlre.match(match2[0].text.strip())
            print helpfulMatch
            if helpfulMatch:
                aReview.helpful = int(helpfulMatch.group(1))
                aReview.total = int(helpfulMatch.group(2))
                # Fraction helpful; *100/100.0 forces a float result in
                # Python 2 integer division.
                aReview.helpfulness = aReview.helpful * \
                    100 / aReview.total / 100.0
            else:
                aReview.helpful=0
                aReview.total=0
                aReview.helpfulness=0
            del helpfulMatch
            print "getting rate data"
            # "X out of 5 stars" -> "X"
            rateData = divReview.xpath('.//span[@class="a-icon-alt"]')[0].text.strip()
            aReview.rate=rateData.split(' ')[0].strip()
            aReview.title = divReview.xpath('.//a[@class="a-size-base a-link-normal review-title a-color-base a-text-bold"]')[0].text.strip()
            print "getting reviewerId"
            # Reviewer ID is the 5th path segment of the byline link.
            reviewer = divReview.xpath('.//span[@class="a-size-base a-color-secondary review-byline"]/a')
            if reviewer is not None and len(reviewer)>0:
                aReview.reviewerID=reviewer[0].attrib['href'].split('/')[4].split('?')[0]
                print "reviewerId",aReview.reviewerID
            aReview.date = CommonTool.strToDate(divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
            aReview.elapsedDate = (fetchDate - aReview.date).days
            print bookPublishDate
            if bookPublishDate=='N/A':
                aReview.reviewBookDate='N/A'
            else:
                # Days between book publication and this review.
                aReview.reviewBookDate = (aReview.date - (bookPublishDate)).days
            # format line
            try:
                strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
                aReview.fromFormat = strFormat.split(' ')[1]
            except IndexError:
                aReview.fromFormat = ''
            spanVerifiedPurchase = divReview.xpath('.//span[@class="a-size-mini a-color-state a-text-bold"]')
            if spanVerifiedPurchase:
                spanVerifiedPurchase=spanVerifiedPurchase[0].text.strip()
                # NOTE(review): when the span is absent, aReview.verified
                # is never assigned — confirm downstream code tolerates
                # a missing attribute.
                if spanVerifiedPurchase=="Verified Purchase":
                    aReview.verified = 1
                else:
                    aReview.verified = 0
            # review text line
            divReviewText = divReview.xpath('.//div[@class="a-row review-data"]/span')[0]
            aReview.description = filtTag.filter_tags(
                etree.tostring(divReviewText).strip()).strip()
            aReview.description = aReview.description.replace('\n', '<br />')
            del divReviewText
            # review comments line
            aReview.numOfComments = CommonTool.strToInt(
                divReview.xpath('.//div[@class="a-row a-spacing-top-small review-comments"]/div/a/span/span[1]')[0].text.strip())
            aReview.getComments()
            saveReview(review=aReview)
            # try:
            #     reviewer = Reviewer.loadReviewer(aReview.reviewerID)
            #     aReview.lastReviewRank = reviewer.getPreBookReviewRanking(
            #         aReview.reviewID)
            # except Exception, e:
            #     sys.stderr.write(str(e) + '\n')
            #     sys.stderr.write('lastReviewRank not found! url: {0} id: {1} \
            #         reviewerID: {2}\n'.format(
            #         url, aReview.reviewID, aReview.reviewerID))
            #     import traceback
            #     traceback.print_exc()
        # end of for
    # end of else
    return rank