def scanPage(self, url, depth):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    response = self._opener.open(req)
    if response is None:
        return
    try:
        html = response.read()
    except Exception:
        return
    links = self._reexp.findall(html)
    linkRec = set()
    for link in links:
        # skip pseudo-links such as javascript: handlers
        if re.search(r'^javascript:', link):
            continue
        link = self.adjustUrl(url, link)
        if link not in self._linkList and link not in linkRec:
            if link.find(self._scope) != -1:
                linkRec.add(link)
                yield link
    self._linkList = self._linkList.union(linkRec)
    # stop once the global link budget is exhausted
    if self._maxCount >= 0 and len(self._linkList) >= self._maxCount:
        return
    depth -= 1
    if depth <= 0:
        return
    # recurse into the newly discovered links, one level deeper
    for link in linkRec:
        for link2 in self.scanPage(link, depth):
            yield link2
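# Usage sketch for scanPage. The enclosing class and its constructor are not
# shown in this file, so the "Crawler" name and its arguments below are
# hypothetical, not the project's actual API:
#
#   opener = urllib2.build_opener()
#   crawler = Crawler(opener, scope='example.com', maxCount=100)
#   for link in crawler.scanPage('http://example.com/', 2):
#       print link   # in-scope links, crawled depth-first down to depth 2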
def _refreshCookie(opener, what):
    what = urllib2.quote(what)
    url = GFSOSO_HOME + '?q=%s' % what
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    req.add_header('Referer', GFSOSO_HOME)
    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        if response.geturl().find(GFSOSO_HOME) == -1:
            # We were redirected off-site; remember the scheme://host/ prefix
            # (find('/', 7) skips past the 'http://' scheme).
            global RedirectedUrl
            RedirectedUrl = response.geturl()
            RedirectedUrl = RedirectedUrl[0:RedirectedUrl.find('/', 7) + 1]
            return False
        html = response.read()
    except urllib2.HTTPError, e:
        print e
        if e.code == 301:  # moved permanently: the error body still carries the page
            html = ''.join(e.readlines())
        else:
            print "Exception: url: %s - " % url, e
            return False
    except Exception, e:
        # URLError and other failures carry no .code attribute, so handle
        # them separately instead of crashing on the attribute lookup
        print "Exception: url: %s - " % url, e
        return False
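# _refreshCookie only has an effect if the opener carries a cookie jar.
# A minimal sketch of building such an opener with the Python 2 stdlib
# (GFSOSO_HOME, REQ_TIMEOUT and RedirectedUrl are module-level names assumed
# to be defined elsewhere in this file):
#
#   import cookielib
#   jar = cookielib.CookieJar()
#   opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
#   if _refreshCookie(opener, 'deep learning') is False:
#       print 'redirected to mirror:', RedirectedUrl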
def _gfsosoPageHandler(opener, url):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    # Referer is the request URL minus its last four characters (a
    # site-specific quirk kept from the original code)
    req.add_header('Referer', url[:-4])
    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        html = response.read()
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return  # end the generator instead of raising StopIteration explicitly
def _pageHandler(self, url):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    req.add_header('Referer', url[:-4])
    try:
        response = self._opener.open(req, timeout=self.reqTimeout)
        html = response.read()
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return  # end the generator instead of raising StopIteration explicitly
def _bingSearchPageHandler(opener, url):
    req = urllib2.Request(url)
    webutils.setupRequest(req)
    # keep the (proxy) connection alive across paged search requests
    req.add_header('Proxy-Connection', 'Keep-Alive')
    try:
        response = opener.open(req, timeout=REQ_TIMEOUT)
        html = response.read()
    except Exception, e:
        print "Exception: url: %s - " % url, e
        return  # end the generator instead of raising StopIteration explicitly
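# Every handler above delegates header setup to webutils.setupRequest, which
# is defined elsewhere. A plausible minimal version (an assumption, not the
# project's actual helper) would just stamp browser-like headers on the
# request so scholar mirrors do not reject the scripted client:
#
#   def setupRequest(req):
#       req.add_header('User-Agent',
#                      'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0')
#       req.add_header('Accept',
#                      'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')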