示例#1
0
 def getBookAsinList(self):
     self.bookAsinList = []
     self.list404 = []
     self.listnq = []
     cnt = 0
 #try:
     # with open('./rand1000List', 'a+') as f:
     # with open('./newAsinListShuffle', 'r') as f:
     #    with open('./random1000final', 'w') as fout:
     with open('./list1.txt', 'r') as f:
         for eachLine in f:
             if eachLine.strip() in self.bookAsinList:
                 continue
             if eachLine.strip()[0].isdigit() is False:
                 continue
             print eachLine
             book = Book.loadBookByAsin(
                 eachLine.strip(), self.fetchDate)
             if self.checkBook(book):
                 self.bookAsinList.append(eachLine.strip())
                 cnt += 1
                 print 'load {0} of {1}: {2}'.format(
                     cnt, self.numOfBooks, eachLine.strip())
                 if cnt >= self.numOfBooks:
                     break
示例#2
0
 def getBookAsinList(self):
     """Collect ASINs from the product-list table found at ``self.url``.

     The page is fetched through ``MyHtml.getHtml``.  Only the
     even-indexed ``tr[@class='small']`` rows of the first
     ``div[@class='productList']`` are inspected; the ASIN is derived from
     the link in the row's second cell.  Empty ASINs and ASINs starting
     with 'B' are ignored.  The others are appended to
     ``self.bookAsinList`` when ``self.checkBook`` accepts the loaded book.
     """
     self.bookAsinList = []
     page = MyHtml.getHtml(
         self.url, name=self.tag, crawlDate=self.fetchDate)
     productDiv = page.xpath(".//div[@class='productList']")[0]
     rows = productDiv.xpath("./table/tr[@class='small']")
     for index, row in enumerate(rows):
         # Odd-indexed rows are skipped.
         if index % 2 != 0:
             continue
         link = row.xpath("./td[2]/a")[0]
         asin = Book.getAsinFromUrl(link.attrib['href'].strip())
         if asin == '' or asin[0] == 'B':
             continue
         candidate = Book.loadBookByAsin(asin, self.fetchDate)
         if self.checkBook(candidate):
             self.bookAsinList.append(asin)
示例#3
0
 def getBookAsinList(self):
     self.bookAsinList=[]
     html=MyHtml.getHtml(self.url,name="NewReleasedBooks",crawlDate=self.fetchDate)
     divBooksList = html.xpath(".//div[@id='zg_centerListWrapper']")[0]
     divItemsList=divBooksList.xpath("./div[@class='zg_itemImmersion']")
     
     for item in divItemsList:
     #item=divItemsList[0]
     #if item:
         aUrl=item.xpath("./div[2]/div[2]/a")[0].attrib['href'].strip()
         rank=item.xpath("./div[@class='zg_rankDiv']/span")[0].text.strip()
         asin=Book.getAsinFromUrl(aUrl)
         if asin!='':
             book=Book.loadBookByAsin(asin,self.fetchDate)
             #if self.checkBook(book,rank):
             self.bookAsinList.append(asin)
     print self.bookAsinList
示例#4
0
    def solveBook(self):
        with open("../data/" + self.fetchDate.isoformat() + "/book.txt", "w") as\
                fout:
            ct = CommonTool(fout)
            fout.write(Book.Book.tableHead)
            fout.write('\n')
            fout.flush()
#             ct.writeln(Book.Book.tableHead)
            for i, asin in enumerate(self.bookAsinList):
                print 'solve Book {0} of {1}: {2}'.format(
                    i, self.length(), asin)
                book = Book.loadBookByAsin(asin)
                book.addTag(self.tag)
                book.solveCustomerReview()
                book.insertBookDataIntoTable()
                book.printData(ct)
                Book.saveBook(book)
                self.reviewList.extend(book.reviewList)
            with open("./reviewList.txt", "w") as fff:
                fff.write('\n'.join(self.reviewList))
示例#5
0
 def getPreviousReviewedBook(self,reviewerId):
     initUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId
     self.allRevLink=initUrl
     html=MyHtml.getHtml(initUrl)
     ftable=html.xpath('.//body/table[2]')[0]
     pages=ftable.xpath('./tr/td[2]/table[1]/tr[1]/td[2]/b/a[last()]')
     if pages is not None and len(pages)>0:
         totalPages=pages[0].text.strip()
     else:
         totalPages=1
     strPages=str(totalPages)
     print strPages
     if "-" in strPages:
         totalPages=totalPages.split('-')
         print "totalPages",totalPages
         totalPages=totalPages[1]
     else:
         totalPages=totalPages
     print totalPages,"totalPages"
     sortBy='MostRecentReview'
     j=1
     self.counter=0
     flag=0
     for j in range(1,(int(totalPages)+1)):
         baseUrl="http://www.amazon.com/gp/cdp/member-reviews/"+reviewerId
         baseUrl=baseUrl+ \
         '?pageNumber={}&sortBy={}'.format(str(j), sortBy)
         print baseUrl,j,totalPages
         html=MyHtml.getHtml(baseUrl)
         ftable=html.xpath('.//body/table[2]')[0]
         mainTable=ftable.xpath('./tr/td[2]/table[2]/tr[@valign="top"]')
         for row in mainTable:
             if row is not None:
                 isBook=row.xpath('./td[5]/table/tr[2]/td/b')
                 if isBook is not None and len(isBook)>0:
                     if isBook[0].text is not None and flag==0:
                         edition=isBook[0].text.strip()
                         if "Edition" in edition:
                             print "got the previous book"
                             flag=1
                             reviewdate=row.xpath('./following-sibling::*')
                             reviewdate=reviewdate[0].xpath('.//nobr')
                             if reviewdate:
                                 reviewdate=reviewdate[0].text.strip()
                                 print "got the reviewDate",reviewdate
                                 self.previousBookReviewDate=CommonTool.strToDate(reviewdate)
                                 
                             #to get link of the previous reviewedbook 
                             url=row.xpath('./td[5]/table/tr[1]/td/b/a')[0].attrib['href']
                             asin=Book.getAsinFromUrl(url)
                             previousBook=Book.loadBookByAsin(asin)
                             print "asinofPrevious",asin
                             print "previousBook",previousBook
                             self.previousBookPublishDate=previousBook.publishDate
                 
                             
                 if j==1 and self.lRevTime=='':
                     reviewdate=mainTable[0].xpath('./following-sibling::*')
                     reviewdate=reviewdate[0].xpath('.//nobr')
                     print reviewdate,"lRevtime"
                     if reviewdate:
                         reviewdate=reviewdate[0].text.strip()
                         self.lRevTime=CommonTool.strToDate(reviewdate)
                 print "value of j",j
                 if j==int(totalPages) and self.fRevTime=='':
                     print "inside frevtime loop"
                     reviewdate=mainTable[-1].xpath('./following-sibling::*')
                     reviewdate=reviewdate[0].xpath('.//nobr')
                     print reviewdate,"fRevtime"
                     if reviewdate:
                         reviewdate=reviewdate[0].text.strip()
                         self.fRevTime=CommonTool.strToDate(reviewdate)
                     
                 # rate
                 rateObj = row.xpath("./following-sibling::*")
                 rate = 'N/A'
                 if len(rateObj) != 0:
                     rateObj1 = rateObj[0].xpath(".//img")
                     title = rateObj1[0].attrib['title']
                     rate = title.split("out")[0].strip()
                     self.sum = self.sum + float(rate)        
     j=j+1
         #end of inner for loop
     print "sum",self.sum
     if self.rNum is not 0:
         self.avgRate=self.sum/self.rNum 
         self.avgRate=round(self.avgRate,2)
     if self.lRevTime=='' or self.fRevTime=='':
         duration=0
     else :  
         duration = (self.lRevTime-self.fRevTime).days
     self.duration=int(duration)
示例#6
0
 def printData(self):
     with open("../data/" + self.fetchDate.isoformat() + "/dataAll.txt",
               "w") as fout:
         ct = CommonTool(fout)
         fout.write("\t".join(self.tableHeadList))
         fout.write('\n')
         fout.flush()
         for i, reviewID in enumerate(self.reviewList):
             if reviewID == '':
                 continue
             print 'solve review {0} of {1}: {2}'.format(
                 i, len(self.reviewList), reviewID)
             review = Review.loadReview(reviewID)
             book = Book.loadBookByAsin(review.asin)
             reviewer = Reviewer.loadReviewer(review.reviewerID)
             ct.write(reviewID)
             ct.write(review.asin)
             ct.write(review.reviewerID)
             ct.write(reviewer.rName)
             ct.write(reviewer.tRev1)
             ct.write(reviewer.tRev10)
             ct.write(reviewer.tRev50)
             ct.write(reviewer.tRev100)
             ct.write(reviewer.tRev500)
             ct.write(reviewer.tRev1000)
             ct.write(reviewer.tRevHall)
             ct.write(reviewer.vVoice)
             ct.write(review.verified)
             ct.write(review.rate)
             ct.write(review.title)
             ct.write(review.date)
             ct.write(review.fetchDate)
             ct.write(review.reviewBookDate)
             ct.write(review.elapsedDate)
             ct.write(review.helpful)
             ct.write(review.total)
             ct.write(review.helpfulness)
             ct.write(review.helpfulRank)
             ct.write(review.timeRank)
             #ct.write(review.top1Percent)
             #ct.write(review.top5Percent)
             #ct.write(review.top10Percent)
             ct.write(review.description)
             ct.write(review.numOfComments)
             ct.write(review.comment)
             #ct.write(review.isQuoteTable)
             ct.write(review.lastReviewRank)
             ct.write(book.url)
             ct.write(book.tag)
             ct.write(book.allowPreview)
             ct.write(book.binding)
             ct.write(book.publishDate)
             ct.write(book.author)
             ct.write(book.authorInfo)
             ct.write(book.rate)
             ct.write(book.numOfReviews)
             ct.write(book.kindlePrice)
             ct.write(book.hardcoverPrice)
             ct.write(book.paperbackPrice)
             ct.write(book.bookDsc)
             ct.write(book.listPrice)
             ct.write(book.pages)
             ct.write(book.isbn10)
             ct.write(book.isbn13)
             ct.write(book.subrank)
             ct.write(book.hasEditorialReview)
             ct.write(book.editorialReview)
             #ct.write(book.hasQuoteTable)
             ct.write(reviewer.email)
             ct.write(reviewer.webPage)
             ct.write(reviewer.hasPhoto)
             ct.write(reviewer.rNum)
             ct.write(reviewer.helpRate)
             ct.write(reviewer.hVote)
             ct.write(reviewer.tVote)
             ct.write(reviewer.avgRate)
             ct.write(reviewer.fRevTime)
             ct.write(reviewer.lRevTime)
             ct.write(reviewer.duration)
             if reviewer.rReal == "N/A":
                 ct.write(0)
             else:
                 ct.write(1)
             if reviewer.location == "N/A":
                 ct.write(0)
             else:
                 ct.write(1)
             if reviewer.aboutMe == "N/A":
                 ct.write(0)
             else:
                 ct.write(1)
             if reviewer.interest == "N/A":
                 ct.write(0)
             else:
                 ct.write(1)
             ct.write(review.fromFormat)
             # if review.fromFormat == "Hardcover":
             #     ct.write(0)
             # elif review.fromFormat == "Paperback":
             #     ct.write(1)
             # else:
             #     ct.write(2)
             if reviewer.rRank == "N/A":
                 ct.write(0)
             else:
                 ct.write(reviewer.rRank)
             ct.writeln(book.rank)