def my_fetcher(self):
    """Fetcher worker loop.

    Repeatedly takes a node from self.fetcher_queue, downloads it via
    fetcher(), and hands the node on to self.crawler_queue.  Loops until
    self.fetcher_flag becomes falsy.
    """
    while self.fetcher_flag:
        if self.fetcher_queue.empty():
            # Download queue is empty: yield control to other greenlets.
            gevent.sleep(0)
            continue
        node = self.fetcher_queue.get(block=False)
        fetcher(node, self.spider_type)
        print(str(time.ctime()) + ' ' + node.url)
        self.crawler_queue.put(node)
    return
def my_fetcher(self):
    """Fetcher worker loop: take a node from self.fetcher_queue, download
    it via fetcher(), then put it on self.crawler_queue for link
    extraction.  Runs until self.fetcher_flag becomes falsy.

    NOTE(review): this is a duplicate definition of my_fetcher in the
    original file; being defined later, this is the one that takes effect.
    """
    # gevent.queue re-exports the stdlib Empty exception, so catching the
    # stdlib class works whether the queues are stdlib or gevent queues.
    try:
        from queue import Empty      # Python 3
    except ImportError:
        from Queue import Empty      # Python 2
    while self.fetcher_flag:
        try:
            # EAFP: checking empty() before get(block=False) is racy when
            # several workers share the queue, so just attempt the get.
            tmp_node = self.fetcher_queue.get(block=False)
        except Empty:
            # Nothing to download right now -- yield to other greenlets.
            gevent.sleep(0)
            continue
        fetcher(tmp_node, self.spider_type)
        print(str(time.ctime()) + ' ' + tmp_node.url)
        self.crawler_queue.put(tmp_node)
    return
pass #print "Table data is already exists" def SaveDB(self, nodes): self.nodes = nodes for x in self.nodes: self.cursor.execute("insert into data (key,url,html) values (?,?,?)",(self.key, x.url, x.html)) self.conn.commit() def Fetch_url_from_DB(self, keyword = None): self.cursor.execute("select * from data") r = self.cursor.fetchall() result = [] for i in r: if i[1] == self.key: result.append(i[2]) return result def CloseDB(self): self.cursor.close() self.conn.close() if __name__ == '__main__': t = DataNode("http://www.sohu.com") from Fetcher import fetcher fetcher(t) key = 'sina' sql_db = sql3_DB('sina') sql_db.SaveDB((t,)) print sql_db.Fetch_url_from_DB() sql_db.CloseDB()
link_list = list(set([i[2] for i in links])) node.set_links(link_list) except Exception, e: return ''' 待补充其他操作 如获取页面title等 ''' try: pass except Exception, e: pass node.reset_html() return if __name__ == '__main__': t = DataNode("http://www.sina.com.cn") fetcher(t) print 'static:' print len(t.html) crawler(t) print len(set(t.links)) print "dynamic:" fetcher(t,"dynamic") print len(t.html) crawler(t) print len(set(t.links))