class Crawler:
    '''
    A focused web crawler that downloads pages from a seed list of URLs and
    prioritizes work by the relevance scores of a recommender system.
    '''

    def __init__(self, urllist, recsys):
        '''
        Constructs a new crawler using the specified url list and the
        specified recommender system to rate the relevance of the
        downloaded pages

        @param urllist: A list of urls to start with
        @param recsys: A recommender system to rate pages
        '''
        self.__urllist = urllist
        self.__queue = PriorityQueue()
        self.__recsys = recsys
        self.__threads = []
        self.__visited = []

    def crawl(self, threads=2, block=True):
        '''
        Starts crawling using the (optional) specified number of threads.
        Blocks until crawling is finished unless block is set to False.

        @param threads: The number of threads used for crawling, default is 2
        @param block: False, if the method should be non-blocking (True by default)
        '''
        for i in range(threads):
            t = _CrawlThread(i, self.__queue, self.__recsys, self.__visited)
            self.__threads.append(t)
            t.daemon = True  # daemon threads die with the main thread
            t.start()
        # Seed the queue with the start URLs at the highest priority
        for url in self.__urllist:
            self.__queue.enqueue(url, 1.)
        if block:
            self.__queue.join()

    def save(self):
        '''
        Sends a request to all running threads to save the current state
        of their libraries
        '''
        for t in self.__threads:
            t.request_save()

    def abort(self):
        '''
        Safely aborts the crawling threads and saves the downloaded webpages
        '''
        for t in self.__threads:
            t.request_stop()
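# ---------------------------------------------------------------------------
# The Crawler above depends on a PriorityQueue exposing enqueue(item, priority)
# and join(), and on a _CrawlThread worker exposing request_save() and
# request_stop(). Neither is defined in this file. The minimal sketches below
# are assumptions inferred from how Crawler calls them, not the project's
# actual implementations; Python resolves these names at call time, so
# defining them here, after the class, is sufficient.
# ---------------------------------------------------------------------------
import queue
import threading


class PriorityQueue:
    '''Thin wrapper around queue.PriorityQueue with the interface Crawler
    expects. Assumption: higher priority values are served first.'''

    def __init__(self):
        self.__queue = queue.PriorityQueue()

    def enqueue(self, item, priority):
        # queue.PriorityQueue pops the smallest entry first, so negate the
        # priority to serve high-relevance URLs first.
        self.__queue.put((-priority, item))

    def dequeue(self):
        # Blocks until an item is available; returns (item, priority).
        neg_priority, item = self.__queue.get()
        return item, -neg_priority

    def task_done(self):
        self.__queue.task_done()

    def join(self):
        # Returns once every enqueued item has been marked done.
        self.__queue.join()


class _CrawlThread(threading.Thread):
    '''Sketch of a crawl worker: pops URLs from the shared queue and records
    them as visited. The real implementation would download each page, rate
    it with the recommender system, and enqueue extracted links.'''

    def __init__(self, thread_id, url_queue, recsys, visited):
        super().__init__()
        self.__id = thread_id
        self.__queue = url_queue
        self.__recsys = recsys
        self.__visited = visited
        self.__stop_requested = threading.Event()
        self.__save_requested = threading.Event()

    def request_save(self):
        # Ask the worker to persist its state at the next opportunity.
        self.__save_requested.set()

    def request_stop(self):
        # Ask the worker to finish its current page and exit.
        self.__stop_requested.set()

    def run(self):
        # dequeue() blocks while the queue is empty, so stop/save requests
        # are only honored between items; a production worker would poll
        # with a timeout instead.
        while not self.__stop_requested.is_set():
            url, _priority = self.__queue.dequeue()
            try:
                if url not in self.__visited:
                    self.__visited.append(url)
                    # Placeholder: download the page, rate it with
                    # self.__recsys, and enqueue extracted links.
                if self.__save_requested.is_set():
                    self.__save_requested.clear()
                    # Placeholder: persist this worker's library to disk.
            finally:
                self.__queue.task_done()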
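# ---------------------------------------------------------------------------
# Minimal usage sketch. DummyRecsys is a hypothetical stand-in for the real
# recommender system (not defined in this file); its rate() method returning
# a constant is purely illustrative.
# ---------------------------------------------------------------------------
if __name__ == '__main__':

    class DummyRecsys:
        def rate(self, page):
            # A real recommender would score the page's relevance here.
            return 0.5

    crawler = Crawler(['http://example.com'], DummyRecsys())
    crawler.crawl(threads=2)  # blocks until the seed URLs are processed
    crawler.save()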