def solveEditorialReview(self):
    """Extract the "Editorial Reviews" section from the parsed page.

    Queries ``self.html`` for the elements that follow the first
    ``<h2>`` whose text is exactly ``Editorial Reviews``.

    When such an element exists, ``self.hasEditorialReview`` is set to 1
    and ``self.editorialReview`` receives the first following element,
    serialised with ``etree.tostring``, passed through
    ``filtTag.filter_tags``, stripped, and with newlines replaced by
    ``<br />``.

    When the heading is missing, or it has no following sibling element,
    ``self.hasEditorialReview`` is set to 0 and ``self.editorialReview``
    to the empty string.
    """
    # One query answers both questions: it is empty when the heading is
    # absent and also when the heading has nothing after it.  Indexing the
    # result unconditionally used to raise IndexError in the latter case.
    siblings = self.html.xpath(
        ".//h2[text()='Editorial Reviews'][1]/following-sibling::*")
    if not siblings:
        self.hasEditorialReview = 0
        self.editorialReview = ""
        return

    self.hasEditorialReview = 1
    reviewText = filtTag.filter_tags(etree.tostring(siblings[0])).strip()
    self.editorialReview = reviewText.replace('\n', '<br />')
def getProductLinksUsingThreads(self, productList):
    """Scrape each listing page in *productList* and record its review rows.

    For every link the page is fetched with ``MyHtml.getHtml`` (the second
    argument is built from ``self.rID`` and ``self.page``) and the rows
    matching ``tr[@valign='top']`` in the listing table are selected.

    Side effects on ``self``:

    * ``fRevTime`` -- set while processing the link equal to
      ``self.firstLink``, from the element following the LAST row.
    * ``lRevTime`` -- set while processing the link equal to
      ``self.lastLink``, from the element following the FIRST row.
      Both are ``'N/A'`` when no ``<nobr>`` date element is found,
      otherwise the result of ``CommonTool.strToDate``.
    * ``sum`` / ``counter`` -- accumulated for every rating that parses
      as a float.
    * ``reviewedBookList`` -- gets the last 10 characters of the product
      link for rows labelled Paperback or Hardcover.
    * ``allProductLinks`` -- gets one ``[href, rate, reviewID, label]``
      list per processed row.
    * ``page`` -- incremented after each link.

    If the ``self.firstLink`` page has no rows, the link is printed and
    the process exits via ``exit(0)``.

    NOTE(review): the original indentation of this method was lost, so the
    nesting below is a reconstruction.  In particular it assumes that rows
    without a following sibling element are skipped, and that the label
    classification only runs when a label anchor was found -- confirm
    against the upstream file.
    """
    print("getting products using thread method")

    def _reviewTime(rowObj):
        # Date of the review that follows *rowObj*, or 'N/A' if not present.
        siblings = rowObj.xpath("./following-sibling::*")
        if not siblings:
            return 'N/A'
        dateNodes = siblings[0].xpath(".//nobr")
        if not dateNodes:
            return 'N/A'
        return CommonTool.strToDate(dateNodes[0].text.strip())

    for link in productList:
        linkPage = MyHtml.getHtml(
            link, self.rID + "_AllProductLinks_" + str(self.page))
        trObjs = linkPage.xpath(
            ".//body/table[2]/tr[1]/td[2]/table[2]/tr[@valign='top']")

        # Links are strings: compare by value.  Identity (`is`) only matches
        # when both names happen to reference the very same object.
        if link == self.firstLink:
            try:
                firstTimeObj = trObjs[-1]
            except LookupError:
                print(link)
                exit(0)
            self.fRevTime = _reviewTime(firstTimeObj)

        if link == self.lastLink:
            self.lRevTime = _reviewTime(trObjs[0])

        for trObj in trObjs:
            tableObj = trObj.xpath(
                "./td[@class='small'][3]/table[@class='small']")
            if not tableObj:
                continue

            # product link
            anchors = tableObj[0].xpath(".//a")
            aLink = anchors[0].attrib['href'] if anchors else ''

            # The rating, review id and label all live in the next sibling.
            rateObj = trObj.xpath("./following-sibling::*")
            if not rateObj:
                continue
            detailObj = rateObj[0]

            # rate
            rate = 'N/A'
            try:
                title = detailObj.xpath(".//img")[0].attrib['title']
                rate = title.split("out")[0].strip()
                self.sum = self.sum + float(rate)
                self.counter = self.counter + 1
            except Exception as e:
                # Best effort: a row without a parsable rating is still kept.
                sys.stderr.write(str(e) + ' rate Exception\n')

            # reviewID
            reviewID = ''
            rIDObj = detailObj.xpath(".//a")
            if rIDObj:
                reviewID = rIDObj[0].attrib['name']

            # label ("This review is from ...")
            label = ''
            labelObj = detailObj.xpath(".//div[@class='tiny']")
            if labelObj:
                # verified purchase + the review is from
                aObj = labelObj[-1].xpath(".//a")
                if aObj:
                    label = filtTag.filter_tags(
                        etree.tostring(aObj[0]).strip())
                    # 1---book 0---product
                    if label.find('(') == -1 and label.find(')') == -1:
                        label = '0'
                    elif label[-1] == ')':
                        # Keep only the text of the trailing "(...)".
                        label = label.split('(')[-1][:-1]
                        if ('Paperback' in label) or ('Hardcover' in label):
                            label = '1'
                            bookID = aLink.replace('/ref=cm_cr-mr-title', '')
                            self.reviewedBookList.append(bookID[-10:])

            # A separate local name is used here: the original rebound and
            # deleted the `productList` parameter while iterating over it.
            self.allProductLinks.append([aLink, rate, reviewID, label])

        self.page = self.page + 1
def _matchHelpfulLine(divReview, voteSpans, pattern):
    """Return *pattern*'s match on the review's helpful-vote text, or None.

    *voteSpans* is the result of the vote-span query on *divReview*.  When
    the first span carries no text of its own, the span nested one level
    deeper is tried instead.
    """
    if not voteSpans:
        return None
    text = voteSpans[0].text
    if text is None:
        nested = divReview.xpath(
            './/span[@class="a-size-base cr-vote"]/span[1]/span[1]/span[1]')
        if not nested or nested[0].text is None:
            return None
        text = nested[0].text
    return pattern.match(text.strip())


def solveReviewPage(asin, rank, url, fetchDate, bookPublishDate):
    """Parse one page of reviews, save every review on it, return the new rank.

    The page at *url* is fetched with ``MyHtml.getHtml``.  If the review
    count shown in the ``cm_cr-product_info`` block is greater than zero,
    a ``Review`` is built for each ``div`` with an ``id`` inside
    ``cm_cr-review_list``; its comments are loaded with ``getComments()``
    and it is handed to ``saveReview``.

    Parameters:
        asin: stored on every review as ``asin``.
        rank: running counter; incremented once per review and stored as
            ``helpfulRank``.
        url: address of the review page.
        fetchDate: the review date is subtracted from it to give
            ``elapsedDate`` in days.
        bookPublishDate: either the string ``'N/A'`` or a value that can be
            subtracted from the review date to give ``reviewBookDate``.

    Returns:
        *rank* after it has been incremented for every review on the page.

    Raises:
        IndexError / AttributeError when a mandatory element (review count,
        rating, title, date, review text, comment count) is missing.
    """
    hlre = re.compile(
        r'^(\d+) of (\d+) people found the following review helpful')
    html = MyHtml.getHtml(url)
    print("solving Review Page")
    countOfReviews = int(html.xpath(
        './/div[@id="cm_cr-product_info"]/div/div[1]/div[2]/span'
    )[0].text.strip())
    if countOfReviews <= 0:
        return rank

    divWholeReviewList = html.xpath('.//div[@id="cm_cr-review_list"]')[0]
    divReviewList = divWholeReviewList.xpath('./div[@id]')
    print(divReviewList)

    for divReview in divReviewList:
        aReview = Review()
        rank += 1
        aReview.helpfulRank = rank
        aReview.asin = asin
        aReview.reviewID = divReview.attrib['id']

        # helpful line -- queried once; xpath() yields a list, never None.
        voteSpans = divReview.xpath(
            './/span[@class="a-size-base cr-vote"]/span[1]/span[1]')
        print("helpful Match %s" % (voteSpans,))
        helpfulMatch = _matchHelpfulLine(divReview, voteSpans, hlre)
        print(helpfulMatch)

        helpful = 0
        total = 0
        if helpfulMatch:
            helpful = int(helpfulMatch.group(1))
            total = int(helpfulMatch.group(2))
        aReview.helpful = helpful
        aReview.total = total
        if total > 0:
            # Floor division keeps the ratio truncated to two decimals
            # (1 of 3 -> 0.33) regardless of the interpreter's `/` rules.
            aReview.helpfulness = helpful * 100 // total / 100.0
        else:
            # Also covers a "0 of 0" line, which used to divide by zero.
            aReview.helpfulness = 0

        print("getting rate data")
        rateData = divReview.xpath(
            './/span[@class="a-icon-alt"]')[0].text.strip()
        aReview.rate = rateData.split(' ')[0].strip()
        aReview.title = divReview.xpath(
            './/a[@class="a-size-base a-link-normal review-title '
            'a-color-base a-text-bold"]')[0].text.strip()

        print("getting reviewerId")
        reviewer = divReview.xpath(
            './/span[@class="a-size-base a-color-secondary review-byline"]/a')
        if reviewer:
            # The id is the 5th path segment of the profile link, without
            # any query string.
            href = reviewer[0].attrib['href']
            aReview.reviewerID = href.split('/')[4].split('?')[0]
            print("reviewerId %s" % (aReview.reviewerID,))

        aReview.date = CommonTool.strToDate(
            divReview.xpath('./div[@class="a-row"]/span[4]')[0].text.strip())
        aReview.elapsedDate = (fetchDate - aReview.date).days
        print(bookPublishDate)
        if bookPublishDate == 'N/A':
            aReview.reviewBookDate = 'N/A'
        else:
            aReview.reviewBookDate = (aReview.date - bookPublishDate).days

        # format line
        try:
            strFormat = divReview[3].xpath('./a[1]')[0].text.strip()
            aReview.fromFormat = strFormat.split(' ')[1]
        except IndexError:
            aReview.fromFormat = ''

        # verified purchase badge; a badge without text counts as unverified
        badges = divReview.xpath(
            './/span[@class="a-size-mini a-color-state a-text-bold"]')
        badgeText = badges[0].text if badges else None
        if badgeText is not None and badgeText.strip() == "Verified Purchase":
            aReview.verified = 1
        else:
            aReview.verified = 0

        # review text line
        divReviewText = divReview.xpath(
            './/div[@class="a-row review-data"]/span')[0]
        description = filtTag.filter_tags(
            etree.tostring(divReviewText).strip()).strip()
        aReview.description = description.replace('\n', '<br />')

        # review comments line
        aReview.numOfComments = CommonTool.strToInt(divReview.xpath(
            './/div[@class="a-row a-spacing-top-small review-comments"]'
            '/div/a/span/span[1]')[0].text.strip())
        aReview.getComments()
        saveReview(review=aReview)
        # NOTE: a lookup of aReview.lastReviewRank through
        # Reviewer.loadReviewer(...).getPreBookReviewRanking(...) was
        # commented out in the original and is intentionally not performed.

    return rank
priceValue = priceValue.text.strip() if cmp(priceType, 'Kindle') == 0: self.kindlePrice = CommonTool.strToFloat(priceValue) elif cmp(priceType, 'Hardcover') == 0: self.hardcoverPrice = CommonTool.strToFloat(priceValue) elif cmp(priceType, 'Paperback') == 0: self.paperbackPrice = CommonTool.strToFloat(priceValue) del spans del priceType del priceValue del priceList try: strBookDesc = etree.tostring(divCenterCol.xpath( "./div[@id='bookDescription_feature_div']/noscript")[0]) self.bookDsc = filtTag.filter_tags(strBookDesc).strip() self.bookDsc = self.bookDsc.replace('\n', '<br />') del strBookDesc except Exception, e: print 'self.bookDsc error: %s' % e self.bookDsc = "" del divCenterCol # end of solveCenterCol def initCenterCol(self): self.title = 'N/A' self.binding = 'N/A' self.publishDate = 'N/A' self.elapsedDate = 'N/A' self.author = 'N/A'