def getBookAsinList(self): self.bookAsinList = [] self.list404 = [] self.listnq = [] cnt = 0 #try: # with open('./rand1000List', 'a+') as f: # with open('./newAsinListShuffle', 'r') as f: # with open('./random1000final', 'w') as fout: with open('./list1.txt', 'r') as f: for eachLine in f: if eachLine.strip() in self.bookAsinList: continue if eachLine.strip()[0].isdigit() is False: continue print eachLine book = Book.loadBookByAsin( eachLine.strip(), self.fetchDate) if self.checkBook(book): self.bookAsinList.append(eachLine.strip()) cnt += 1 print 'load {0} of {1}: {2}'.format( cnt, self.numOfBooks, eachLine.strip()) if cnt >= self.numOfBooks: break
def getBookAsinList(self):
    """Collect accepted ASINs from the product-list page at ``self.url``.

    The page is fetched through ``MyHtml.getHtml`` (named after ``self.tag``
    and dated ``self.fetchDate``).  Inside the first
    ``div[@class='productList']`` only the even-indexed ``tr[@class='small']``
    rows are examined.  The link in the second cell of each such row is
    turned into an ASIN with ``Book.getAsinFromUrl``.  Empty ASINs and ASINs
    starting with ``'B'`` are ignored; the others are loaded and appended to
    ``self.bookAsinList`` when ``self.checkBook`` accepts them.
    """
    self.bookAsinList = []
    page = MyHtml.getHtml(
        self.url, name=self.tag, crawlDate=self.fetchDate)
    productDiv = page.xpath(".//div[@class='productList']")[0]
    rows = productDiv.xpath("./table/tr[@class='small']")
    for index, row in enumerate(rows):
        # Odd-indexed rows are skipped.
        if index % 2 != 0:
            continue
        link = row.xpath("./td[2]/a")[0]
        asin = Book.getAsinFromUrl(link.attrib['href'].strip())
        if asin == '' or asin[0] == 'B':
            continue
        candidate = Book.loadBookByAsin(asin, self.fetchDate)
        if self.checkBook(candidate):
            self.bookAsinList.append(asin)
def getBookAsinList(self): self.bookAsinList=[] html=MyHtml.getHtml(self.url,name="NewReleasedBooks",crawlDate=self.fetchDate) divBooksList = html.xpath(".//div[@id='zg_centerListWrapper']")[0] divItemsList=divBooksList.xpath("./div[@class='zg_itemImmersion']") for item in divItemsList: #item=divItemsList[0] #if item: aUrl=item.xpath("./div[2]/div[2]/a")[0].attrib['href'].strip() rank=item.xpath("./div[@class='zg_rankDiv']/span")[0].text.strip() asin=Book.getAsinFromUrl(aUrl) if asin!='': book=Book.loadBookByAsin(asin,self.fetchDate) #if self.checkBook(book,rank): self.bookAsinList.append(asin) print self.bookAsinList
def solveBook(self): with open("../data/" + self.fetchDate.isoformat() + "/book.txt", "w") as\ fout: ct = CommonTool(fout) fout.write(Book.Book.tableHead) fout.write('\n') fout.flush() # ct.writeln(Book.Book.tableHead) for i, asin in enumerate(self.bookAsinList): print 'solve Book {0} of {1}: {2}'.format( i, self.length(), asin) book = Book.loadBookByAsin(asin) book.addTag(self.tag) book.solveCustomerReview() book.insertBookDataIntoTable() book.printData(ct) Book.saveBook(book) self.reviewList.extend(book.reviewList) with open("./reviewList.txt", "w") as fff: fff.write('\n'.join(self.reviewList))
def getPreviousReviewedBook(self,reviewerId): initUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId self.allRevLink=initUrl html=MyHtml.getHtml(initUrl) ftable=html.xpath('.//body/table[2]')[0] pages=ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]') if pages is not None and len(pages)>0: totalPages=pages[0].text.strip() else: totalPages=1 strPages=str(totalPages) print strPages if "-" in strPages: totalPages=totalPages.split('-') print "totalPages",totalPages totalPages=totalPages[1] else: totalPages=totalPages print totalPages,"totalPages" sortBy='MostRecentReview' j=1 self.counter=0 flag=0 for j in range(1,(int(totalPages)+1)): baseUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId baseUrl=baseUrl+ \ '?pageNumber={}&sortBy={}'.format(str(j), sortBy) print baseUrl,j,totalPages html=MyHtml.getHtml(baseUrl) ftable=html.xpath('.//body/table[2]')[0] mainTable=ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]') for row in mainTable: if row is not None: isBook=row.xpath('./td[5]/table/tr[2]/td/b') if isBook is not None and len(isBook)>0: if isBook[0].text is not None and flag==0: edition=isBook[0].text.strip() if "Edition" in edition: print "got the previous book" flag=1 reviewdate=row.xpath('./following-sibling::*') reviewdate=reviewdate[0].xpath('.//nobr') if reviewdate: reviewdate=reviewdate[0].text.strip() print "got the reviewDate",reviewdate self.previousBookReviewDate=CommonTool.strToDate(reviewdate) #to get link of the previous reviewedbook url=row.xpath('./td[5]/table/tr[1]/td/b/a')[0].attrib['href'] asin=Book.getAsinFromUrl(url) previousBook=Book.loadBookByAsin(asin) print "asinofPrevious",asin print "previousBook",previousBook self.previousBookPublishDate=previousBook.publishDate if j==1 and self.lRevTime=='': reviewdate=mainTable[0].xpath('./following-sibling::*') reviewdate=reviewdate[0].xpath('.//nobr') print reviewdate,"lRevtime" if reviewdate: reviewdate=reviewdate[0].text.strip() 
self.lRevTime=CommonTool.strToDate(reviewdate) print "value of j",j if j==int(totalPages) and self.fRevTime=='': print "inside frevtime loop" reviewdate=mainTable[-1].xpath('./following-sibling::*') reviewdate=reviewdate[0].xpath('.//nobr') print reviewdate,"fRevtime" if reviewdate: reviewdate=reviewdate[0].text.strip() self.fRevTime=CommonTool.strToDate(reviewdate) # rate rateObj = row.xpath("./following-sibling::*") rate = 'N/A' if len(rateObj) != 0: rateObj1 = rateObj[0].xpath(".//img") title = rateObj1[0].attrib['title'] rate = title.split("out")[0].strip() self.sum = self.sum + float(rate) j=j+1 #end of inner for loop print "sum",self.sum if self.rNum is not 0: self.avgRate=self.sum/self.rNum self.avgRate=round(self.avgRate,2) if self.lRevTime=='' or self.fRevTime=='': duration=0 else : duration = (self.lRevTime-self.fRevTime).days self.duration=int(duration)
def printData(self): with open("../data/" + self.fetchDate.isoformat() + "/dataAll.txt", "w") as fout: ct = CommonTool(fout) fout.write("\t".join(self.tableHeadList)) fout.write('\n') fout.flush() for i, reviewID in enumerate(self.reviewList): if reviewID == '': continue print 'solve review {0} of {1}: {2}'.format( i, len(self.reviewList), reviewID) review = Review.loadReview(reviewID) book = Book.loadBookByAsin(review.asin) reviewer = Reviewer.loadReviewer(review.reviewerID) ct.write(reviewID) ct.write(review.asin) ct.write(review.reviewerID) ct.write(reviewer.rName) ct.write(reviewer.tRev1) ct.write(reviewer.tRev10) ct.write(reviewer.tRev50) ct.write(reviewer.tRev100) ct.write(reviewer.tRev500) ct.write(reviewer.tRev1000) ct.write(reviewer.tRevHall) ct.write(reviewer.vVoice) ct.write(review.verified) ct.write(review.rate) ct.write(review.title) ct.write(review.date) ct.write(review.fetchDate) ct.write(review.reviewBookDate) ct.write(review.elapsedDate) ct.write(review.helpful) ct.write(review.total) ct.write(review.helpfulness) ct.write(review.helpfulRank) ct.write(review.timeRank) #ct.write(review.top1Percent) #ct.write(review.top5Percent) #ct.write(review.top10Percent) ct.write(review.description) ct.write(review.numOfComments) ct.write(review.comment) #ct.write(review.isQuoteTable) ct.write(review.lastReviewRank) ct.write(book.url) ct.write(book.tag) ct.write(book.allowPreview) ct.write(book.binding) ct.write(book.publishDate) ct.write(book.author) ct.write(book.authorInfo) ct.write(book.rate) ct.write(book.numOfReviews) ct.write(book.kindlePrice) ct.write(book.hardcoverPrice) ct.write(book.paperbackPrice) ct.write(book.bookDsc) ct.write(book.listPrice) ct.write(book.pages) ct.write(book.isbn10) ct.write(book.isbn13) ct.write(book.subrank) ct.write(book.hasEditorialReview) ct.write(book.editorialReview) #ct.write(book.hasQuoteTable) ct.write(reviewer.email) ct.write(reviewer.webPage) ct.write(reviewer.hasPhoto) ct.write(reviewer.rNum) 
ct.write(reviewer.helpRate) ct.write(reviewer.hVote) ct.write(reviewer.tVote) ct.write(reviewer.avgRate) ct.write(reviewer.fRevTime) ct.write(reviewer.lRevTime) ct.write(reviewer.duration) if reviewer.rReal == "N/A": ct.write(0) else: ct.write(1) if reviewer.location == "N/A": ct.write(0) else: ct.write(1) if reviewer.aboutMe == "N/A": ct.write(0) else: ct.write(1) if reviewer.interest == "N/A": ct.write(0) else: ct.write(1) ct.write(review.fromFormat) # if review.fromFormat == "Hardcover": # ct.write(0) # elif review.fromFormat == "Paperback": # ct.write(1) # else: # ct.write(2) if reviewer.rRank == "N/A": ct.write(0) else: ct.write(reviewer.rRank) ct.writeln(book.rank)