import os
from time import strftime, localtime, time, sleep
from threading import Thread

# Project-local dependencies (SafeQueue, Status, Configuration, Html,
# Downloader, Parser, SearchGoogle, DatabseManager, Log and the handler
# strategy classes) are assumed to be importable from sibling modules.


class Engine(object):

    def __init__(self):
        self._istart = False
        self._status = Status()

        # --- load config file ---
        self._config = Configuration()

        # --- core objects ---
        self._downloader = None
        self._parser = None

        # --- memory models ---
        self._download_pool = SafeQueue()  # html objects waiting to be downloaded
        self._parse_pool = SafeQueue()     # html objects waiting to be parsed

        # --- checker threads ---
        # The target is the function the thread runs. These two threads keep
        # checking the pools and assigning jobs to the two worker pools.
        self._downloader_pool_checker = Thread(target=self.download_pool_checker)
        self._parse_pool_checker = Thread(target=self.parse_pool_checker)

        # --- threads ---
        # Every second, this thread posts runtime info to the remote MySQL server.
        self._status_update = Thread(target=self.status_update)

        # --- strategies ---
        self._earlyvisithandler = EarlyVisitHandler()
        self._robothandler = RobotHandler()
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._omitindex = OmitIndex()
        self._urlextender = URLExtender()

        # --- init the path for saving data; create the folder if it doesn't exist ---
        self._path = self._config._down_path + "/" + strftime('%Y-%m-%d', localtime()) \
            + "/" + strftime('%H-%M-%S', localtime()) + "/"
        if not os.path.exists(self._path):
            os.makedirs(self._path)
        self._config._down_path = self._path

        self._keywords_links = []

        # --- MySQL manager ---
        self.sqlex = DatabseManager(self._config)
        #self.f = open("data.txt", 'w')

    def load_seeds(self):
        """Load seed urls from a Google search on the configured keywords."""
        contacter = SearchGoogle(self._config._keywords, self._config._result_num)
        self._keywords_links = contacter.getURLs()
        # Append the seeds from the Google search result into the download pool.
        #self._keywords_links.insert(0, "https://twitter.com/")
        #self._keywords_links.insert(0, "https://twitter.com/signup?context=login")
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                html_task = Html(url)
                if not self._schemehandler.SchemeChecker(html_task):
                    # Ignore links with a wrong scheme; don't download.
                    self._status._scheme += 1
                    continue
                if self._bookmarkhandler.BookMarkChecker(html_task):
                    # Ignore bookmark links; don't download.
                    self._status._bookmark += 1
                    continue
                if self._cgihandler.FindCGI(html_task):
                    # Ignore links containing CGI; don't download.
                    self._status._cgi += 1
                    continue
                if self._nestlevelhandler.checknestlevel(html_task, self._config._parser_nlv):
                    # Ignore links nested too deeply; don't download.
                    self._status._nestlv += 1
                    continue
                if not self._filetypehandler.FileTypeChecker(html_task):
                    self._status._file_type += 1
                    continue
                '''
                if self._earlyvisithandler.check_visited(html_task):
                    # Ignore links visited before; don't download.
                    self._status._early_visit += 1
                    continue
                '''
                self._omitindex.Omit(html_task)
                """
                if not self._robothandler.is_allowed(html_task):
                    # Blocked by robots.txt; don't download.
                    self._status._robot += 1
                    continue
                """
                self._earlyvisithandler.add_entry(html_task._md5, html_task)
                self._download_pool.append(html_task)
                # If the following two lines are used, the program won't run,
                # which means the revisit check works; however, the dict
                # should be protected with a lock.
                #self._visited_dic[html_task._md5] = html_task._url
                #print(len(self._visited_dic))
            else:
                break
            i += 1
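    # ------------------------------------------------------------------
    # Each handler check above follows the same contract: it takes an
    # Html task and returns a bool. As an illustration only (this is a
    # hypothetical sketch, not the project's actual SchemeHandler), a
    # scheme check could be as small as:
    #
    #     from urlparse import urlparse   # Python 2 stdlib
    #
    #     class SchemeHandler(object):
    #         def SchemeChecker(self, html_task):
    #             # accept only http/https links
    #             return urlparse(html_task._url).scheme in ('http', 'https')
    # ------------------------------------------------------------------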
    def show_welcome(self):
        print "download folder: " + self._path
        print "key words: " + self._config._keywords
        print "download thread num: {0}".format(self._config._down_num)
        print "parse thread num: {0}".format(self._config._parser_num)
        print "Load " + str(self._config._result_num) + " results from google search:"
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                print "[{0}]".format(i) + url
            i += 1
        print "\n------------------------------------------------------------------------\n"
        #raw_input("press any key to start crawling, press second key to stop")

    def wait_for_start(self):
        print "ready to start....."
        print "go to http://dengxu.me/crawling/ to input some key words & see the result"
        while self.sqlex.read_if_start(self._config) != True:
            sleep(1)
        print "\n------------------------------------------------------------------------\n"
        print "starting crawling engine...."

    def start(self):
        try:
            self.wait_for_start()
            self._istart = True
            # Load seeds from the Google search.
            self.load_seeds()
            # Show welcome info.
            self.show_welcome()
            self._status._sys_start = time()
            # Start the worker threads.
            self._downloader = Downloader(self._config._down_num, self._status)
            self._downloader.start()
            self._parser = Parser(self._config._parser_num, self._status)
            self._parser.start()
            self._downloader_pool_checker.start()
            self._parse_pool_checker.start()
            self._status_update.start()
            # Notify MySQL that the engine has started.
            self.sqlex.write_if_start()
        except Exception as e:
            Log().debug("start failed")
            raise e

    def stop(self):
        self._istart = False
        # Clear the download and parse pools.
        self._download_pool.clear()
        self._parse_pool.clear()
        # Stop the downloader and parser threads.
        self._downloader.stop()
        self._parser.stop()
        # The checker and status threads exit their loops once _istart
        # goes False; wait for them to finish.
        self._downloader_pool_checker.join()
        self._parse_pool_checker.join()
        self._status_update.join()
        print "Engine is stopping"

    def pause(self):
        pass

    def finish_download(self, html_task):
        sentence = "Downloaded:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} " \
                   "http-code: {4} data-size: {5}bytes url: {6}" \
            .format(self._status._download_times, time() - self._status._sys_start,
                    html_task._depth, html_task._parent, html_task._return_code,
                    html_task._data_size, html_task._url)
        #if self._status._download_times <= 500:
        #    self.f.write(sentence + "\n")
        # Calculate the path for saving the file.
        full_path = self._path + "[No.{0}]_".format(self._status._download_times) + ".html"
        # Save the html data to a file.
        #f = open(full_path, 'w')
        #f.write(html_task._data)
        #f.close()
        # After downloading, pass the data (still carried by the html
        # object) to the parse pool.
        self._parse_pool.append(html_task)
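    # ------------------------------------------------------------------
    # finish_download/finish_parse are callbacks handed to the worker
    # pools by the checker threads below. A downloader worker loop is
    # assumed to look roughly like this sketch (hypothetical; the real
    # Downloader lives in its own module):
    #
    #     task, callback = self._task_queue.get()
    #     task._data = urllib2.urlopen(task._url).read()
    #     callback(task)   # re-enters Engine.finish_download
    # ------------------------------------------------------------------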
    def finish_parse(self, html_task):
        '''
        print("parsed:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-status: {4} data-size: {5}bytes url:{6}"
              .format(self._status._download_times, time() - self._status._sys_start, html_task._depth,
                      html_task._parent, html_task._return_code, html_task._data_size, html_task._url))
        '''
        # After parsing, pass the urls to be downloaded into the download pool.
        if self._earlyvisithandler.check_visited(html_task):
            # Ignore links visited before; don't queue them again.
            self._status._early_visit += 1
            return
        if not self._robothandler.is_allowed(html_task):
            # Blocked by robots.txt; don't download.
            self._status._robot += 1
            return
        self._earlyvisithandler.add_entry(html_task._md5, html_task)
        self._download_pool.append(html_task)

    def download_pool_checker(self):
        while self._istart:
            new_download_task = self._download_pool.pop_left()
            # If no task remains in the download pool, sleep briefly;
            # otherwise hand the task to the downloader. finish_download
            # is passed down as a callback so the engine gets the result
            # back and can move it into the parse pool.
            if new_download_task is None:
                sleep(0.1)
            else:
                self._downloader.queue_download_task(new_download_task, self.finish_download)

    def parse_pool_checker(self):
        while self._istart:
            new_parse_task = self._parse_pool.pop_left()
            if new_parse_task is None:
                sleep(0.1)
            else:
                self._parser.queue_parse_task(new_parse_task, self.finish_parse)

    # See the result at http://dengxu.me/crawling/
    def status_update(self):
        while self._istart:
            self._status._download_queue = self._downloader.len()
            self._status._parse_queue = self._parser.len()
            sentence = "[time: {0:0.1f}], queue: {8}, down: {1}, total: {2:0.1f}MB | " \
                       "queue: {9}, parsed: {3}, scheme: {10}, cgi: {4}, bookmark: {11}, " \
                       "type: {12}, visited: {5}, robot: {6}, nestlv: {7} | " \
                       "error: 404: {13}, timeout: {14}" \
                .format(time() - self._status._sys_start,
                        self._status._download_times,
                        float(self._status._download_size) / 1024 / 1024,
                        self._status._parse_times,
                        self._status._cgi,
                        self._status._early_visit,
                        self._status._robot,
                        self._status._nestlv,
                        self._downloader.len(),
                        self._parser.len(),
                        self._status._scheme_type,
                        self._status._bookmark,
                        self._status._file_type,
                        self._status._404,
                        self._status._socket_timeout)
            print sentence
            #if self._status._download_times > 500:
            #    self.f.write(sentence + "\n")
            # Update status to MySQL.
            self.sqlex.write_status(self._status)
            # Update the most recently downloaded url.
            self.sqlex.write_recent_download(self._status)
            sleep(1)
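
# ----------------------------------------------------------------------
# A minimal sketch of the SafeQueue API the engine relies on: append,
# pop_left (returning None when empty, as the checker loops expect), and
# clear. This is illustrative only; the project's real SafeQueue is
# defined elsewhere.
from collections import deque
from threading import Lock


class _SafeQueueSketch(object):
    """Illustrative thread-safe FIFO, not the project's SafeQueue."""

    def __init__(self):
        self._items = deque()
        self._lock = Lock()

    def append(self, item):
        with self._lock:
            self._items.append(item)

    def pop_left(self):
        # Return None when empty instead of raising, matching how the
        # pool checkers above interpret an empty queue.
        with self._lock:
            return self._items.popleft() if self._items else None

    def clear(self):
        with self._lock:
            self._items.clear()


# A typical driver (sketch, assuming the config file and the remote
# MySQL start flag are in place): construct the engine, start it, and
# stop on Ctrl-C.
#
#     if __name__ == '__main__':
#         engine = Engine()
#         try:
#             engine.start()   # blocks in wait_for_start() until flagged
#             while engine._istart:
#                 sleep(1)
#         except KeyboardInterrupt:
#             engine.stop()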