def test_frontier_with_multi_thread(self):
    """Concurrent producers/consumers still yield round-robin group order."""
    num_queues = 5
    group_of = lambda x: x / 10
    frontier = Frontier(num_queues, keyFunc=group_of)
    frontier.addFilter(lambda x: x % 10 > 3)

    # Producers: each thread pushes the numbers 0..49 into the frontier.
    for _ in range(num_queues):
        Thread(target=put_numbers, args=(frontier, 0, 49)).start()

    # Consumers: six threads drain the frontier into separate buckets.
    buckets = [[] for _ in range(6)]
    consumers = [Thread(target=get_numbers, args=(frontier, bucket))
                 for bucket in buckets]
    for worker in consumers:
        worker.start()
    for worker in consumers:
        worker.join()

    merged = []
    for bucket in buckets:
        merged.extend(bucket)
    merged.sort()

    # Consecutive outputs must come from different groups (round-robin).
    for idx, item in enumerate(merged):
        assert group_of(item[1]) == idx % num_queues
def test_get_with_filter(self):
    """Items rejected by the filter never come out of get()."""
    frontier = Frontier(6, keyFunc=lambda x: x / 10)
    frontier.addFilter(lambda x: x % 10 == 0)  # drop exact multiples of 10
    for value in range(50):
        frontier.put(value)
    # Surviving items are served cycling across the five groups.
    for remainder in range(1, 10):
        for group in range(5):
            self.assertEqual(frontier.get(), group * 10 + remainder)
    # Front queue must be fully drained afterwards.
    self.assertEqual(list_queue(frontier._frontQ), [])
def test_get_with_filter_but_insufficient_backQs(self):
    """Filtering still works when groups outnumber the back queues."""
    frontier = Frontier(6, keyFunc=lambda x: x % 10)
    frontier.addFilter(lambda x: x / 10 == 3)  # drop the range 30..39
    for value in range(50):
        frontier.put(value)
    # Everything except the filtered band comes back, in insertion order.
    for value in range(50):
        if value / 10 != 3:
            self.assertEqual(frontier.get(), value)
    self.assertEqual(list_queue(frontier._frontQ), [])
def test_get_with_no_filter(self):
    """Without filters, get() serves hosts round-robin, not FIFO."""
    frontier = Frontier(2, keyFunc=hostname)
    seeds = (
        'http://dropbox.com/',
        'http://google.com/',
        'http://google.com/index.html',
        'http://python.org/',
    )
    for url in seeds:
        frontier.put(url)
    # The second google URL waits until its host's turn comes around again.
    self.assertEqual(frontier.get(), 'http://dropbox.com/')
    self.assertEqual(frontier.get(), 'http://google.com/')
    self.assertEqual(frontier.get(), 'http://python.org/')
    self.assertEqual(frontier.get(), 'http://google.com/index.html')
    self.assertEqual(list_queue(frontier._frontQ), [])
def test_puts_with_one_filter(self):
    """put() silently discards URLs matched by an installed filter."""
    frontier = Frontier(6)
    frontier.addFilter(lambda url: 'd' in url)  # reject URLs containing 'd'
    for url in ('http://google.com/',
                'http://dropbox.com/',
                'http://python.org/'):
        frontier.put(url)
    # dropbox.com contains a 'd', so only the other two survive.
    self.assertEqual(
        list_queue(frontier._frontQ),
        ['http://google.com/', 'http://python.org/'])
def test_puts_with_no_filters(self):
    """With no filters installed, put() keeps every URL in order."""
    frontier = Frontier(6)
    seeds = ['http://google.com/', 'http://dropbox.com/', 'http://python.org/']
    for url in seeds:
        frontier.put(url)
    self.assertEqual(list_queue(frontier._frontQ), seeds)
def test_transfer_with_insufficient_backQs(self):
    """_transfer() moves one host per back queue; overflow stays in front."""
    frontier = Frontier(2, keyFunc=hostname)
    for url in ('http://google.com/',
                'http://dropbox.com/',
                'http://google.com/index.html',
                'http://python.org/'):
        frontier.put(url)
    frontier._transfer()
    # Both back queues are claimed by google and dropbox respectively,
    # so python.org has nowhere to go and remains in the front queue.
    self.assertEqual(
        list_queue(frontier._backQ[0]),
        ['http://google.com/', 'http://google.com/index.html'])
    self.assertEqual(list_queue(frontier._backQ[1]), ['http://dropbox.com/'])
    self.assertEqual(list_queue(frontier._frontQ), ['http://python.org/'])
def test_frontier_with_url_file(self):
    """Feed seed URLs from a file through the frontier and dump the output.

    Reads one URL per line from test/sample_input, pushes each through the
    frontier, then writes the drained contents to test/sample_output.
    """
    import os
    f = Frontier(12, keyFunc=hostname)
    # 'with' guarantees the files are closed even if an assertion or
    # frontier call raises (the original leaked the handles on error).
    with open(os.path.realpath("test/sample_input"), 'r') as seeds:
        for line in seeds:  # iterate lazily instead of readlines()
            f.put(line.strip())
    with open("test/sample_output", "w") as output:
        output.write("f.size() = " + str(f.size()) + "\n")
        while f.size() > 0:
            item = f.get()
            item = "" if item is None else item
            # NOTE(review): encode('utf8') + "\n" relies on Python 2 str
            # semantics (bytes+str concat); would break under Python 3.
            output.write(item.encode('utf8') + "\n")
def __init__(self, nDownloader = DEFAULT_DOWNLOADERS, manager = DEFAULT_MANAGER, \
             regPort = DEFAULT_REG_PORT, dbPort = DEFAULT_DB_PORT,
             urlPort = None, pagePort = None):
    """
    Initialize a crawler object.

    --------- Param --------
    nDownloader (int): the number of downloader threads.
    manager: the host on which manager is started.
    regPort: the port on which manager expects connection requests.
    dbPort: the port of the MongoDB instance running on the manager host.
    urlPort: the port on which this worker receives URLs; if None, a
        random free port is chosen at bind time.
    pagePort: the port on which this worker sends page to manager.
        (NOTE(review): not referenced in this constructor's body —
        confirm whether it is still used.)
    --------- Return --------
    None.
    """
    ## prepare the url frontier and page queue
    self._pageQ = Queue(MAX_PAGE_QSIZE)
    # 3 back queues per downloader; URLs are grouped by host so that one
    # downloader hits one host at a time, prioritized by last visit time.
    self._urlFrontier = Frontier(3*nDownloader, MAX_URL_QSIZE, \
        keyFunc=lambda url: urllib2.Request(url).get_host(), \
        priorityFunc=self.getLastVisitTime)
    self._visitSite = {}          # per-host bookkeeping, guarded by _lock
    self._lock = RLock()
    self._stopEvent = Event()     # signals worker threads to shut down

    ## prepare filters
    filetypeFilter = urlFilter.FileTypeFilter(True, ['text/html'])
    robotFilter = urlFilter.RobotFilter(Downloader.DEFAULT_USER_AGENT)
    self._urlDupEliminator = urlFilter.DupEliminator()
    self._urlFrontier.addFilter(filetypeFilter.disallow)
    self._urlFrontier.addFilter(self._urlDupEliminator.seenBefore)
    # robots.txt filtering currently disabled:
    # self._urlFrontier.addFilter(robotFilter.disallow)

    ## initialize sockets.
    self._manager = manager
    self._regPort = regPort
    self._urlPort = urlPort
    self._thisHost = socket.gethostbyname(socket.gethostname())
    self._dbclient = MongoClient(manager, dbPort)
    context = zmq.Context()
    # REQ socket: register this worker with the manager.
    self._regSocket = context.socket(zmq.REQ)
    self._regSocket.connect("tcp://%s:%d" % (manager, self._regPort))
    # PUSH socket: forward discovered URLs to the manager (connected later).
    self._urlPushSocket = context.socket(zmq.PUSH)
    # PULL socket: receive URLs assigned to this worker.
    self._urlPullSocket = context.socket(zmq.PULL)
    if (self._urlPort is None):
        # No port requested: let zmq pick a free one and remember it.
        self._urlPort = self._urlPullSocket.bind_to_random_port(
            "tcp://%s" % self._thisHost)
    else:
        self._urlPullSocket.bind("tcp://*:%d" % (self._urlPort))

    ## prepare log files
    if (not os.path.exists("log")):
        os.makedirs("log")
    # Per-worker engine log, named after the bound URL port for uniqueness.
    self._logger = logging.getLogger("engine")
    self._logger.addHandler(
        logging.FileHandler(
            os.path.abspath("log/engine%d.log" % self._urlPort)))
    self._logger.setLevel(logging.WARNING)
    parseLogger = logging.getLogger("parser")
    parseLogger.addHandler(
        logging.FileHandler(os.path.abspath("log/parser.log")))
    parseLogger.setLevel(logging.WARNING)

    ## create threads for downloading and parsing tasks
    self._downloaders = []
    for i in range(nDownloader):
        # Daemon threads so a hung download cannot block process exit.
        downloader = Downloader(self._urlFrontier, self._pageQ, self._logger,
                                self.updateLastVisitTime)
        downloader.daemon = True
        self._downloaders.append(downloader)
    self._parser = Parser(self._pageQ, self._urlPushSocket, self._dbclient,
                          parseLogger)
def test_instantiation(self):
    """A Frontier can be constructed from a queue count alone."""
    frontier = Frontier(6)
    self.assertIsInstance(frontier, Frontier)