import os
import re
import sys
import hashlib
import urllib
import urllib2

# PriorityQueue and Parser are custom helpers used below; they are not part of
# this listing and are assumed to be defined elsewhere in the module.


class Spider(object):
    # class-level defaults; the attributes set in __init__ take precedence
    musthave = ''
    beginurl = ''
    pages = ''
    downFolder = ''
    priQueue = PriorityQueue()
    downlist = PriorityQueue()

    def __init__(self, beginurl, reg, priorcontent, pages, downloadFolder):
        self.reg = reg
        self.beginurl = beginurl
        self.pages = pages
        self.downFolder = downloadFolder
        self.priorcontent = priorcontent
        if not os.path.isdir(downloadFolder):
            os.mkdir(downloadFolder)
        self.priQueue = PriorityQueue()   # URLs waiting to be crawled
        self.downlist = PriorityQueue()   # URLs already downloaded

    def md5(self, content):
        "Return the hex MD5 digest of `content`."
        m = hashlib.md5()
        m.update(content)
        return m.hexdigest()

    def updatePriQueue(self, priQueue, url):
        "Push a newly discovered URL onto the priority queue, boosting it if it is already queued."
        extraPrior = url.endswith('.html') and 2 or 0   # URLs ending in '.html' get higher priority
        extraURL = self.priorcontent in url and 5 or 0  # URLs containing the priority keyword get a bigger boost
        item = priQueue.getitem(url)
        if item:
            # the URL is already queued: raise its priority
            newitem = (item[0] + 1 + extraPrior + extraURL, item[1])
            priQueue.remove(item)
            priQueue.push(newitem)
        else:
            priQueue.push((1 + extraPrior + extraURL, url))

    def getMainUrl(self, url):
        "Return the host part of a URL, e.g. 'http://xxx' for 'http://xxx/test/1.html'."
        ix = url.find('/', len('http://'))
        if ix > 0:
            return url[:ix]
        else:
            return url

    def analyseHtml(self, url, html):
        "Parse the HTML and queue every new URL found in its anchors."
        p = Parser()
        try:
            p.feed(html)
            p.close()
        except:
            # malformed HTML; keep whatever anchors were collected so far
            print 'parse error:', url
        mainurl = self.getMainUrl(url)
        for k, v in p.anchors.items():
            for u in v:
                if not u.startswith('http://'):
                    # turn a relative href such as 'test/1.html' into 'http://xxx/test/1.html'
                    u = mainurl + u
                if not self.downlist.count(u) and re.match(self.reg, u):
                    # the URL has not been downloaded yet and matches the required regular expression
                    print 'add new url: ' + u
                    self.updatePriQueue(self.priQueue, u)

    def downloadUrl(self, pageid, url):
        "Download the page at `url`, save it to disk and analyse it for new links."
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
        }
        postdata = urllib.urlencode({
            'emailAddress': '*****@*****.**',
            'password': '******',
            'signIn': '1'
        })
        req = urllib2.Request(url=url, headers=headers, data=postdata)
        downFileName = self.downFolder + '/%d.html' % pageid
        print 'crawling: ', url,
        try:
            fp = urllib2.urlopen(req)
        except:
            print '[ failed ]'
            return False
        else:
            print '[ success ]'
            self.downlist.push(url)   # record the URL as downloaded
            html = fp.read()
            self.saveHTML(url, html, downFileName)
            fp.close()
            print 'analyzing: ', url
            self.analyseHtml(url, html)
            return True

    def saveHTML(self, url, html, path=''):
        "Write the downloaded HTML to `path` (skipped when no path is given)."
        if path != '':
            print 'downloading', url, 'as', path,
            op = open(path, "wb")
            op.write(html)
            op.close()

    def usage(self):
        print >> sys.stderr, 'this is a test for function'

    def run(self):
        "Main loop: pop the highest-priority URL, crawl it, and queue any newly discovered URLs."
        self.priQueue.push((1, self.beginurl))
        i = 0
        fp = open('url.txt', 'w')
        while not self.priQueue.empty() and i < self.pages:
            k, url = self.priQueue.pop()
            if self.downloadUrl(i + 1, url):
                i += 1
                fp.write(url + ' ' + self.md5(url) + '\n')
                fp.flush()
        fp.close()
        print '\nDownloaded', i, 'pages in total.'
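
# --- Interface sketch (not part of the original listing) ---
# Spider relies on a custom PriorityQueue (push/pop/empty/count/getitem/remove)
# and an HTML Parser exposing an `anchors` mapping, neither of which is shown
# here.  The toy queue below is only a guess at the interface the class seems
# to assume: crawl candidates are stored as (priority, url) tuples, pop()
# returns the tuple with the highest priority, and downloaded URLs are pushed
# as plain strings.
class PriorityQueue(object):
    def __init__(self):
        self.items = []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        # return and remove the (priority, url) tuple with the highest priority
        best = max(self.items, key=lambda it: it[0])
        self.items.remove(best)
        return best

    def remove(self, item):
        self.items.remove(item)

    def empty(self):
        return len(self.items) == 0

    def count(self, url):
        # Spider pushes plain URL strings onto `downlist` but (priority, url)
        # tuples onto `priQueue`, so accept either form here
        return len([it for it in self.items
                    if it == url or (isinstance(it, tuple) and it[1] == url)])

    def getitem(self, url):
        # return the queued (priority, url) tuple for `url`, or None
        for it in self.items:
            if isinstance(it, tuple) and it[1] == url:
                return it
        return None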
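
# --- Usage sketch (not part of the original listing) ---
# A hypothetical driver showing how Spider might be invoked; the start URL,
# URL-matching regular expression, priority keyword, page limit and download
# folder below are placeholder values, not taken from the original code.
if __name__ == '__main__':
    spider = Spider('http://example.com/',      # beginurl: where crawling starts
                    r'http://example\.com/.*',  # reg: only URLs matching this pattern are queued
                    'news',                     # priorcontent: URLs containing this keyword get extra priority
                    50,                         # pages: stop after this many successful downloads
                    'download')                 # downloadFolder: created automatically if missing
    spider.run()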