def createNewSubIE(self, subIdx):
    """Open the link of sub node ``subIdx`` in a new, visible IE window.

    The ``href`` attribute of ``self.subNodes[subIdx]`` is loaded in a
    freshly created ``IEExplorer``; that explorer is then appended to
    ``self.subIESet``.
    """
    href = self.subNodes[subIdx].getAttribute("href")
    browser = IEExplorer()
    browser.openURL(href)
    browser.setVisible(1)
    self.subIESet.append(browser)
def test_TaobaoBaobeiViewer():
    """Manual smoke test: open a fixed item page and run the viewer on it.

    Opens the hard-coded Tmall item URL in a visible IE window, wraps that
    window in a ``TaobaoBaobeiViewer`` and calls ``baobeiSrcollBeg()``
    followed by ``openCurBaobei()``.
    """
    itemUrl = u"http://detail.tmall.com/item.htm?id=20440371673&spm=a230r.1.14.1.0YUkKR&ad_id=&am_id=&cm_id=140105335569ed55e27b&pm_id="

    browser = IEExplorer()
    browser.openURL(itemUrl)
    browser.setVisible(1)

    viewer = TaobaoBaobeiViewer(browser)
    viewer.baobeiSrcollBeg()
    viewer.openCurBaobei()
def doSearch(self):
    """Search Taobao for ``self.searchKey`` and then look for the target item.

    Steps:
      1. Open the Taobao home page in a new visible IE window stored in
         ``self.searchPageIE``.
      2. Type ``self.searchKey`` into the search box and click the submit
         button.
      3. Fetch the target page title via ``getTargetUrlInfo()``.
      4. Read the paging information into ``self.curPageIdx`` and
         ``self.totalPageCount``.
      5. Hand over to ``doFindTargetBaobei()``.

    Raises:
        ValueError: propagated from ``getSearchUnputNode()`` /
            ``getSearchButtonNode()`` when the search box or the submit
            button cannot be found.
    """
    # open taobao
    url = u"http://www.taobao.com/"
    self.searchPageIE = IEExplorer()
    self.searchPageIE.openURL(url)
    self.searchPageIE.setVisible(1)
    while self.searchPageIE.waitBusy(IE_TIME_OUT_NEW_PAGE) == True:
        self.searchPageIE.stop()
        time.sleep(0.1)

    # input search key
    nodeSearchInput = self.getSearchUnputNode()
    nodeSearchInput.click()
    nodeSearchInput.focus()
    logging.debug(self.searchKey)
    enumHumanInput(nodeSearchInput, self.searchKey)

    # search
    nodeSearchButton = self.getSearchButtonNode()
    nodeSearchButton.focus()
    nodeSearchButton.click()
    time.sleep(3)
    self.getTargetUrlInfo()
    while self.searchPageIE.waitBusy(IE_TIME_OUT_NEW_PAGE) == True:
        # locationURL is a method (getCurPageSearchItem calls it with
        # parentheses).  Comparing the bound method itself with ``url`` was
        # always False, so the extra wait for a browser that is still on the
        # home page never happened.
        if self.searchPageIE.locationURL() == url:
            time.sleep(3)
        self.searchPageIE.stop()
        time.sleep(0.1)

    self.curPageIdx, self.totalPageCount = self.getPageInfo()
    dbgInfo = u"Total page num: {0}".format(self.totalPageCount)
    logging.debug(dbgInfo)

    # find the target
    self.doFindTargetBaobei()
def test_refresh():
    """Manual smoke test: open Taobao, wait 20 seconds, then refresh the page.

    Prints a marker line before and after the wait so the pause is visible
    on the console.
    """
    homeUrl = u"www.taobao.com"

    browser = IEExplorer()
    browser.openURL(homeUrl)
    browser.setVisible(1)

    print("before sleep")
    time.sleep(20)
    print("after sleep")

    browser.getIE().Refresh()
def doFindTargetBaobei(self):
    """Page through the search results until the target item is found, then view it.

    For every result page the items are collected
    (``getCurPageSearchItem``) and random items are viewed
    (``procViewRandomBaobei``).  When the target is on the current page the
    loop stops; otherwise the "next page" link is clicked.  Afterwards the
    target item is opened in a new visible IE window wrapped in a
    ``TaobaoBaobeiViewer`` (stored in ``self.targetViewer``) and the method
    sleeps so that the total time spent on the item page is a random value
    between ``TIME_BAOBEI_VIEW_MIN`` and ``TIME_BAOBEI_VIEW_MAX`` seconds.

    Raises:
        ValueError: when the target is not on the current page and there is
            no "next page" link to follow.  Previously this case
            dereferenced ``None``.
    """
    # page next loop
    while True:
        self.getCurPageSearchItem()
        self.procViewRandomBaobei()
        if self.isTargetInThisPage():
            break

        nextPageNode = self.getNextPageNode()
        if nextPageNode is None:
            # Last page reached without a match: fail with a clear message
            # before touching the missing node or advancing the page index.
            errInfo = u"target baobei not found, no next page after page index: {0}".format(self.curPageIdx)
            logging.error(errInfo)
            raise ValueError(errInfo)

        self.curPageIdx += 1
        self.searchPageIE.scrollToNode(nextPageNode)
        nextPageNode.focus()
        nextPageNode.click()
        time.sleep(4)

    dbgInfo = u"randomBaobei: " + str2unicode(str(self.randomBaobei))
    logging.debug(dbgInfo)

    # go to the target baobei page
    targetNode = self.allSearchPages[self.curPageIdx][self.curPageInnerIdx]
    rcd = SearchRecord(targetNode)
    summaryNode = rcd.getSummaryNode()
    self.searchPageIE.scrollToNode(summaryNode)
    summaryNode.focus()
    url = summaryNode.getAttribute(u"href")
    baobeiIE = IEExplorer()
    baobeiIE.openURL(url)
    baobeiIE.setVisible(1)
    self.targetViewer = TaobaoBaobeiViewer(baobeiIE)
    self.targetViewer.baobeiSrcollBeg()
    self.targetViewer.openCurBaobei()

    # stay in baobei page: sleep for whatever is left of the random budget,
    # counted from the moment the viewer reports it started operating
    timeTotal = random.randint(TIME_BAOBEI_VIEW_MIN, TIME_BAOBEI_VIEW_MAX)
    timeBeg = self.targetViewer.getTimeBegOp()
    timeNow = datetime.datetime.now()
    timePass = (timeNow - timeBeg).seconds
    timeSleep = timeTotal - timePass
    if timeSleep <= 0:
        # budget already used up: still stay a fixed 10 seconds
        timeSleep = 10
    dbgInfo = u"stay in baobei page time: " + str2unicode(str(timeSleep))
    logging.debug(dbgInfo)
    time.sleep(timeSleep)
def viewRandomBaobei(self, randomIdx):
    """Open the search result at ``randomIdx`` of the current page for a few seconds.

    The item's summary link is scrolled into view and focused, its ``href``
    is opened in a new visible IE window which is kept for a random 3-5
    seconds (``stayInSubPage``), and finally the search page window is
    brought back to the foreground.

    Args:
        randomIdx: index into ``self.allSearchPages[self.curPageIdx]``.
    """
    randomNode = self.allSearchPages[self.curPageIdx][randomIdx]
    rcd = SearchRecord(randomNode)
    summaryNode = rcd.getSummaryNode()
    self.searchPageIE.scrollToNode(summaryNode)
    summaryNode.focus()
    url = summaryNode.getAttribute(u"href")

    baobeiIE = IEExplorer()
    baobeiIE.openURL(url)
    baobeiIE.setVisible(1)
    baobeiIE.waitReadyState(IE_TIME_OUT_NEW_PAGE)
    timeOut = random.randint(3, 5)
    baobeiIE.stayInSubPage(timeOut)

    # set the search page into top
    while self.searchPageIE.waitBusy(IE_TIME_OUT_NEW_PAGE) == True:
        # Stop the window we are actually waiting for.  The original called
        # self.targetViewer.getMainIE().stop(), but targetViewer is still
        # None while random items are being viewed, so a busy search page
        # raised AttributeError here.
        self.searchPageIE.stop()
        time.sleep(0.1)
    self.searchPageIE.waitReadyState(IE_TIME_OUT_NEW_PAGE)
    self.searchPageIE.setForeground()
    time.sleep(1)
    time.sleep(1)
class BaobeiSearher(object):
    """Drive an IE window through a Taobao search until a target item is found.

    The searcher types ``searchKey`` into the Taobao home page, pages
    through the result list, views a few random items on the way and
    finally opens the item whose ID and title match ``targetUrl``.
    """

    def __init__(self, searchKey, targetUrl):
        """Remember the search key and target URL and reset all paging state.

        Args:
            searchKey: text typed into the Taobao search box.
            targetUrl: URL of the item to look for; its ID is extracted
                with ``getTBIDFromUrl``.
        """
        self.searchKey = searchKey
        self.targetUrl = targetUrl
        self.targetID = getTBIDFromUrl(self.targetUrl)
        # <title> markup of the target page, filled in by getTargetUrlInfo()
        self.targetTitle = None
        # TaobaoBaobeiViewer for the target item, set by doFindTargetBaobei()
        self.targetViewer = None
        # IEExplorer showing the search result list, set by doSearch()
        self.searchPageIE = None
        self.curPageIdx = 0
        self.totalPageCount = 0
        # index of the target inside its result page, -1 while not found
        self.curPageInnerIdx = -1
        # one list of item nodes per visited result page
        self.allSearchPages = []
        # (pageIdx, innerIdx) pairs that must not be picked as random views
        self.randomBaobei = []

    def _stopSearchPageUntilIdle(self, timeOut):
        """Keep stopping the search page while ``waitBusy(timeOut)`` reports busy."""
        while self.searchPageIE.waitBusy(timeOut) == True:
            self.searchPageIE.stop()
            time.sleep(0.1)

    def doSearch(self):
        """Search Taobao for ``self.searchKey`` and then look for the target item.

        Opens the Taobao home page in a new visible IE window, types the
        search key, clicks the submit button, fetches the target page title,
        reads the paging information and hands over to
        ``doFindTargetBaobei()``.

        Raises:
            ValueError: when the search box, the submit button or (later) a
                next-page link cannot be found.
        """
        # open taobao
        url = u"http://www.taobao.com/"
        self.searchPageIE = IEExplorer()
        self.searchPageIE.openURL(url)
        self.searchPageIE.setVisible(1)
        self._stopSearchPageUntilIdle(IE_TIME_OUT_NEW_PAGE)

        # input search key
        nodeSearchInput = self.getSearchUnputNode()
        nodeSearchInput.click()
        nodeSearchInput.focus()
        logging.debug(self.searchKey)
        enumHumanInput(nodeSearchInput, self.searchKey)

        # search
        nodeSearchButton = self.getSearchButtonNode()
        nodeSearchButton.focus()
        nodeSearchButton.click()
        time.sleep(3)
        self.getTargetUrlInfo()
        while self.searchPageIE.waitBusy(IE_TIME_OUT_NEW_PAGE) == True:
            # locationURL is a method (see getCurPageSearchItem).  Comparing
            # the bound method itself with ``url`` was always False, so the
            # extra wait for a browser still on the home page never ran.
            if self.searchPageIE.locationURL() == url:
                time.sleep(3)
            self.searchPageIE.stop()
            time.sleep(0.1)

        self.curPageIdx, self.totalPageCount = self.getPageInfo()
        dbgInfo = u"Total page num: {0}".format(self.totalPageCount)
        logging.debug(dbgInfo)

        # find the target
        self.doFindTargetBaobei()

    def getPageInfo(self):
        """Read the current page index and the total page count from the result page.

        Returns:
            ``(index, total)`` where ``index`` is zero-based.  ``(0, 1)`` is
            returned when the page does not contain exactly one
            ``span.page-info`` element or when no "x/y" text is found next
            to it.
        """
        for delta in getScrollDelta():
            self.searchPageIE.getWindow().scrollBy(0, delta)
        self._stopSearchPageUntilIdle(10)

        body = self.searchPageIE.getBody()
        nodesPageinfo = [node for node in getSubNodesByTag(body, u"span")
                         if node.className == u"page-info"]
        if len(nodesPageinfo) != 1:
            return 0, 1
        nodePage = nodesPageinfo[0]

        # positions accepted by getAdjacentText()
        allTextPos = (u"beforeBegin", u"afterBegin", u"beforeEnd", u"afterEnd")

        pageStr = u""
        for text in allTextPos:
            pageStr = nodePage.getAdjacentText(text)
            if u"/" in pageStr:
                break
        dbgInfo = u"pageStr: " + str2unicode(pageStr)
        logging.debug(dbgInfo)
        if u"/" not in pageStr:
            # No "current/total" text around the element: use the same
            # fallback as for a missing page-info element instead of
            # failing with IndexError on infos[1].
            return 0, 1
        infos = pageStr.split(u"/")
        total = int(infos[1])

        nodesStrong = getSubNodesByTag(nodePage, u"strong")
        if len(nodesStrong) != 1:
            return 0, total

        nodePageIndex = nodesStrong[0]
        for text in allTextPos:
            pageStr = nodePageIndex.getAdjacentText(text)
            if len(pageStr) != 0:
                break
        # Empty text at every position: treat as first page rather than
        # letting int(u"") raise ValueError.
        index = int(pageStr) - 1 if len(pageStr) != 0 else 0
        return index, total

    def doFindTargetBaobei(self):
        """Page through the results until the target is found, then view it.

        After the target page is located the target item is opened in a new
        visible IE window wrapped in a ``TaobaoBaobeiViewer`` (stored in
        ``self.targetViewer``), and the method sleeps so that the time spent
        on the item page is a random value between ``TIME_BAOBEI_VIEW_MIN``
        and ``TIME_BAOBEI_VIEW_MAX`` seconds.

        Raises:
            ValueError: when the target is not on the current page and there
                is no "next page" link to follow (previously this case
                dereferenced ``None``).
        """
        # page next loop
        while True:
            self.getCurPageSearchItem()
            self.procViewRandomBaobei()
            if self.isTargetInThisPage():
                break

            nextPageNode = self.getNextPageNode()
            if nextPageNode is None:
                errInfo = u"target baobei not found, no next page after page index: {0}".format(self.curPageIdx)
                logging.error(errInfo)
                raise ValueError(errInfo)

            self.curPageIdx += 1
            self.searchPageIE.scrollToNode(nextPageNode)
            nextPageNode.focus()
            nextPageNode.click()
            time.sleep(4)

        dbgInfo = u"randomBaobei: " + str2unicode(str(self.randomBaobei))
        logging.debug(dbgInfo)

        # go to the target baobei page
        targetNode = self.allSearchPages[self.curPageIdx][self.curPageInnerIdx]
        rcd = SearchRecord(targetNode)
        summaryNode = rcd.getSummaryNode()
        self.searchPageIE.scrollToNode(summaryNode)
        summaryNode.focus()
        url = summaryNode.getAttribute(u"href")
        baobeiIE = IEExplorer()
        baobeiIE.openURL(url)
        baobeiIE.setVisible(1)
        self.targetViewer = TaobaoBaobeiViewer(baobeiIE)
        self.targetViewer.baobeiSrcollBeg()
        self.targetViewer.openCurBaobei()

        # stay in baobei page: sleep for whatever is left of the random
        # budget, counted from the moment the viewer started operating
        timeTotal = random.randint(TIME_BAOBEI_VIEW_MIN, TIME_BAOBEI_VIEW_MAX)
        timeBeg = self.targetViewer.getTimeBegOp()
        timeNow = datetime.datetime.now()
        timePass = (timeNow - timeBeg).seconds
        timeSleep = timeTotal - timePass
        if timeSleep <= 0:
            # budget already used up: still stay a fixed 10 seconds
            timeSleep = 10
        dbgInfo = u"stay in baobei page time: " + str2unicode(str(timeSleep))
        logging.debug(dbgInfo)
        time.sleep(timeSleep)

    def viewRandomBaobei(self, randomIdx):
        """Open the search result at ``randomIdx`` of the current page for a few seconds.

        The item is opened in a new visible IE window which is kept for a
        random 3-5 seconds; afterwards the search page window is brought
        back to the foreground.

        Args:
            randomIdx: index into ``self.allSearchPages[self.curPageIdx]``.
        """
        randomNode = self.allSearchPages[self.curPageIdx][randomIdx]
        rcd = SearchRecord(randomNode)
        summaryNode = rcd.getSummaryNode()
        self.searchPageIE.scrollToNode(summaryNode)
        summaryNode.focus()
        url = summaryNode.getAttribute(u"href")

        baobeiIE = IEExplorer()
        baobeiIE.openURL(url)
        baobeiIE.setVisible(1)
        baobeiIE.waitReadyState(IE_TIME_OUT_NEW_PAGE)
        timeOut = random.randint(3, 5)
        baobeiIE.stayInSubPage(timeOut)

        # set the search page into top.  The original stopped
        # self.targetViewer.getMainIE() here, but targetViewer is still None
        # while random items are being viewed; the window being waited for
        # is the search page itself.
        self._stopSearchPageUntilIdle(IE_TIME_OUT_NEW_PAGE)
        self.searchPageIE.waitReadyState(IE_TIME_OUT_NEW_PAGE)
        self.searchPageIE.setForeground()
        time.sleep(1)
        time.sleep(1)

    def _viewUnvisitedRandomBaobei(self):
        """View one random item of the current page that is not yet in ``randomBaobei``."""
        itemCount = len(self.allSearchPages[-1])
        candidates = [innerIdx for innerIdx in range(itemCount)
                      if (self.curPageIdx, innerIdx) not in self.randomBaobei]
        if not candidates:
            # Every item is already excluded; the original rejection loop
            # would have spun forever in this situation.
            return
        randomIdx = random.choice(candidates)
        self.randomBaobei.append((self.curPageIdx, randomIdx))
        self.viewRandomBaobei(randomIdx)

    def procViewRandomBaobei(self):
        """View random items of the current page to accompany the target view.

        The target itself (when it is on this page) is recorded in
        ``self.randomBaobei`` so that it is never picked as a random item.
        Pages with fewer than three items are skipped.  One random item is
        viewed on page 0 and one more on the page that holds the target.
        """
        if self.isTargetInThisPage():
            targetBaobei = (self.curPageIdx, self.curPageInnerIdx)
            self.randomBaobei.append(targetBaobei)

        if len(self.allSearchPages[-1]) < 3:
            return

        if self.curPageIdx == 0:
            # page 0, we have a random baobei view
            self._viewUnvisitedRandomBaobei()

        if self.isTargetInThisPage():
            # target page, we have one more random baobei view
            self._viewUnvisitedRandomBaobei()

    def isTargetInThisPage(self):
        """Return True once the target item has been located on a result page."""
        return self.curPageInnerIdx != -1

    def getCurPageSearchItem(self):
        """Collect the item nodes of the current result page and look for the target.

        The nodes (``div.item-box`` elements with at least five child nodes)
        are stored in ``self.allSearchPages[self.curPageIdx]``.  When an
        item matches the target, its index is stored in
        ``self.curPageInnerIdx``.  Errors while inspecting a single item are
        logged and do not stop the scan.
        """
        nodesItem = []
        self.allSearchPages.append(nodesItem)
        self.refreshOutAllItem()
        self._stopSearchPageUntilIdle(IE_TIME_OUT_NEW_PAGE)

        # get all item node
        strDbgInfo = u"Page:" + str(self.curPageIdx) + u", locationURL: " + self.searchPageIE.locationURL()
        logging.debug(strDbgInfo)
        body = self.searchPageIE.getBody()
        for node in getSubNodesByTag(body, u"div"):
            if node.className == u"item-box" and node.childNodes.length >= 5:
                nodesItem.append(node)
        strDbgInfo = u"cur page item count: " + str(len(nodesItem))
        logging.debug(strDbgInfo)

        # get the target node
        self.allSearchPages[self.curPageIdx] = nodesItem
        for i, node in enumerate(nodesItem):
            try:
                rcd = SearchRecord(node)
                dbgInfo = str2unicode(str(i)) + u":" + rcd.getSummaryStr() + rcd.getSummaryID()
                logging.debug(dbgInfo)
                if self.isRecordTarget(rcd):
                    self.curPageInnerIdx = i
            except Exception:
                # best effort per item: log and continue with the next one
                # (KeyboardInterrupt/SystemExit are no longer swallowed)
                logging.error(traceback.format_exc())

    def isRecordTarget(self, rcd):
        """Return True when ``rcd`` is the target item.

        A record matches when ``self.targetTitle`` contains ``<title>``
        immediately followed by the record's summary text and the record's
        ID equals ``self.targetID``.
        """
        titleBeg = u"<title>"
        if titleBeg + rcd.getSummaryStr() in self.targetTitle:
            return rcd.getSummaryID() == self.targetID
        return False

    def refreshOutAllItem(self):
        """Scroll through the result page so that all items and the pager are present.

        Up to three attempts are made.  On the last page the attempt ends
        as soon as no "next page" link exists; on other pages it ends as
        soon as the link is found, and the page is refreshed when the link
        is missing.
        """
        for _ in range(3):
            for delta in getScrollDelta():
                self.searchPageIE.getWindow().scrollBy(0, delta)
                time.sleep(IE_INTERVAL_TIME_SACROLL)
            self._stopSearchPageUntilIdle(10)

            hasNextPage = self.getNextPageNode() is not None
            onLastPage = self.curPageIdx == self.totalPageCount - 1
            if onLastPage:
                if not hasNextPage:
                    break
            elif hasNextPage:
                break
            else:
                self.searchPageIE.getIE().Refresh()
                time.sleep(4)

    def getNextPageNode(self):
        """Return the "next page" link of the result page, or None when there is none.

        Raises:
            ValueError: when more than one matching link is found.
        """
        body = self.searchPageIE.getBody()
        nodesNextPage = [node for node in getSubNodesByTag(body, u"a")
                         if node.className == u"page-next"
                         and node.getAttribute(u"title") == u"下一页"]
        num = len(nodesNextPage)
        if num == 1:
            return nodesNextPage[0]
        if num == 0:
            return None
        strDbg = u"num of next page button: " + str(num)
        logging.error(strDbg)
        raise ValueError(strDbg)

    def getTargetUrlInfo(self):
        """Download ``self.targetUrl`` and store its ``<title>`` markup in ``self.targetTitle``."""
        response = urllib.urlopen(self.targetUrl)
        try:
            content = response.read()
        finally:
            # release the connection even when reading fails
            response.close()
        soup = BeautifulSoup(content)
        self.targetTitle = str2unicode(str(soup.find(u'title')))

    def getSearchUnputNode(self):
        """Return the search input element (``input`` with id ``q``).

        Raises:
            ValueError: when the element is not found.
        """
        body = self.searchPageIE.getBody()
        nodesInput = getSubNodesByTag(body, u"input")
        nodeSearchInput = getNodeByAttr(nodesInput, u"id", u"q")
        if nodeSearchInput is None:
            raise ValueError(u"Can't find the input edit")
        return nodeSearchInput

    def getSearchButtonNode(self):
        """Return the first ``button`` of type ``submit`` whose ``tabIndex`` is 0.

        Buttons whose attributes cannot be read are skipped.

        Raises:
            ValueError: when no such button is found.
        """
        body = self.searchPageIE.getBody()
        for node in getSubNodesByTag(body, u"button"):
            try:
                if node.getAttribute(u"type") != u"submit":
                    continue
                tabIndex = node.getAttribute(u"tabIndex")
            except Exception:
                # unreadable attribute: this is not the button we want
                continue
            if tabIndex == 0:
                return node
        raise ValueError(u"Can't find the submit button")