def test_frontier_with_multi_thread(self):
        """Exercise a Frontier with concurrent producer and consumer threads.

        ``numOfQ`` producer threads each run ``put_numbers(f, 0, 49)`` while
        six consumer threads run ``get_numbers``, each with its own output
        list.  Once the consumers have finished, their outputs are merged
        and sorted, and the entry at position ``i`` must belong to key group
        ``i % numOfQ``.

        NOTE(review): the producer threads are never joined here, as in the
        original test; whether that is safe depends on how ``get_numbers``
        decides to stop -- confirm against its definition.
        """
        # Floor division keeps the key an integer group id on both Python 2
        # and Python 3; plain ``/`` would yield floats on Python 3.
        keyFunc = lambda x: x // 10
        filterFunc = lambda x: x % 10 > 3
        numOfQ = 5
        numOfConsumers = 6
        f = Frontier(numOfQ, keyFunc=keyFunc)
        f.addFilter(filterFunc)
        for _ in range(numOfQ):
            Thread(target=put_numbers, args=(f, 0, 49)).start()

        out = []
        threads = []
        for _ in range(numOfConsumers):
            # Each consumer gets a private list so no locking is needed here.
            bucket = []
            out.append(bucket)
            t = Thread(target=get_numbers, args=(f, bucket))
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        # Merge every consumer's output and order it by the entries' natural
        # (tuple) ordering before checking the group rotation.
        result = sorted(entry for bucket in out for entry in bucket)
        for i, entry in enumerate(result):
            # consecutive outputs must come from different groups
            assert keyFunc(entry[1]) == i % numOfQ
    def test_get_with_filter(self):
        """Check the order of get() when a filter is installed.

        The integers 0..49 are put into a frontier keyed by decade, with a
        filter that matches multiples of ten.  get() is then expected to
        return 1, 11, 21, 31, 41, 2, 12, ... 49, after which the front queue
        must be empty.
        """
        # ``//`` (not ``/``) so the key is an integer decade on both
        # Python 2 and Python 3.
        f = Frontier(6, keyFunc=lambda x: x // 10)
        f.addFilter(lambda x: x % 10 == 0)

        for number in range(50):
            f.put(number)

        # One item per decade for each last digit 1..9.
        expected = [decade * 10 + digit
                    for digit in range(1, 10)
                    for decade in range(5)]
        for want in expected:
            self.assertEqual(f.get(), want)
        self.assertEqual(list_queue(f._frontQ), [])
    def test_get_with_filter_but_insufficient_backQs(self):
        """Check get() when there are more distinct keys than the frontier size.

        The integers 0..49 are keyed by their last digit (ten distinct keys)
        in a frontier built with 6, and a filter matches the thirties.
        get() is expected to return every other number in ascending order
        and to leave the front queue empty.

        NOTE(review): the first Frontier argument is presumably the number
        of back queues -- confirm against Frontier's definition.
        """
        f = Frontier(6, keyFunc=lambda x: x % 10)
        # ``//`` so that all of 30..39 match on Python 3 as well; with true
        # division only 30 itself would satisfy ``x / 10 == 3``.
        f.addFilter(lambda x: x // 10 == 3)

        for number in range(50):
            f.put(number)

        for number in range(50):
            if number // 10 == 3:
                continue  # filtered out above, never expected from get()
            self.assertEqual(f.get(), number)
        self.assertEqual(list_queue(f._frontQ), [])
    def test_get_with_no_filter(self):
        """Check the order of get() on a hostname-keyed frontier without filters.

        Four URLs (two of them on google.com) are put into a frontier built
        with 2 and ``keyFunc=hostname``.  The test expects get() to return
        the second google.com URL last, and the front queue to end up empty.
        """
        frontier = Frontier(2, keyFunc=hostname)
        for url in ('http://dropbox.com/',
                    'http://google.com/',
                    'http://google.com/index.html',
                    'http://python.org/'):
            frontier.put(url)

        expected_order = (
            'http://dropbox.com/',
            'http://google.com/',
            'http://python.org/',
            'http://google.com/index.html',
        )
        for expected in expected_order:
            self.assertEqual(frontier.get(), expected)
        self.assertEqual(list_queue(frontier._frontQ), [])
    def test_puts_with_one_filter(self):
        """put() must not enqueue URLs for which the installed filter is true.

        With a filter matching URLs that contain the letter 'd', only the
        google.com and python.org URLs are expected in the front queue.
        """
        frontier = Frontier(6)
        contains_d = lambda url: 'd' in url
        frontier.addFilter(contains_d)

        for url in ('http://google.com/',
                    'http://dropbox.com/',
                    'http://python.org/'):
            frontier.put(url)

        queued = list_queue(frontier._frontQ)
        self.assertEqual(queued, ['http://google.com/', 'http://python.org/'])
    def test_puts_with_no_filters(self):
        """Without filters, put() keeps every URL in the front queue in put order."""
        frontier = Frontier(6)

        for url in ('http://google.com/',
                    'http://dropbox.com/',
                    'http://python.org/'):
            frontier.put(url)

        queued = list_queue(frontier._frontQ)
        self.assertEqual(queued, [
            'http://google.com/',
            'http://dropbox.com/',
            'http://python.org/',
        ])
    def test_transfer_with_insufficient_backQs(self):
        """Check _transfer() when there are more hostnames than the frontier size.

        Four URLs on three hosts are put into a frontier built with 2.
        After one _transfer() call the test expects both google.com URLs in
        ``_backQ[0]``, the dropbox.com URL in ``_backQ[1]`` and the
        python.org URL still waiting in the front queue.
        """
        frontier = Frontier(2, keyFunc=hostname)
        for url in ('http://google.com/',
                    'http://dropbox.com/',
                    'http://google.com/index.html',
                    'http://python.org/'):
            frontier.put(url)

        frontier._transfer()

        first_back = list_queue(frontier._backQ[0])
        self.assertEqual(
            first_back,
            ['http://google.com/', 'http://google.com/index.html'])
        second_back = list_queue(frontier._backQ[1])
        self.assertEqual(second_back, ['http://dropbox.com/'])
        still_in_front = list_queue(frontier._frontQ)
        self.assertEqual(still_in_front, ['http://python.org/'])
    def test_frontier_with_url_file(self):
        """Run the URLs from test/sample_input through a Frontier and dump the result.

        Every stripped line of the seed file is put into a frontier keyed by
        hostname.  The frontier's size, followed by each item returned by
        get() (one per line), is written to test/sample_output.  The test
        makes no assertions; presumably the output file is inspected by hand.
        """
        import os

        f = Frontier(12, keyFunc=hostname)

        # Context managers guarantee both files are closed even when
        # put(), get() or write() raises part-way through.
        with open(os.path.realpath("test/sample_input"), 'r') as seeds:
            for line in seeds:
                f.put(line.strip())

        with open("test/sample_output", "w") as output:
            output.write("f.size() = " + str(f.size()) + "\n")
            while f.size() > 0:
                item = f.get()
                # A None result is written as an empty line.
                item = "" if item is None else item
                output.write(item.encode('utf8') + "\n")
Пример #9
0
    def __init__(self, nDownloader=DEFAULT_DOWNLOADERS, manager=DEFAULT_MANAGER,
                 regPort=DEFAULT_REG_PORT, dbPort=DEFAULT_DB_PORT,
                 urlPort=None, pagePort=None):
        """
        Set up a crawler object: queues, URL filters, sockets, loggers and
        the downloader / parser workers.

        ---------  Param --------
        nDownloader (int):
            the number of downloader threads to create.
        manager:
            the host on which the manager is started.
        regPort:
            the port on which the manager expects connection requests.
        dbPort:
            the port handed to MongoClient together with the manager host.
        urlPort:
            the port the URL pull socket binds to; when None, a random
            port is bound and remembered instead.
        pagePort:
            accepted for the caller's convenience but not used in this
            constructor.

        ---------  Return --------
        None.
        """
        ## page queue, url frontier and shared state
        self._pageQ = Queue(MAX_PAGE_QSIZE)
        self._urlFrontier = Frontier(
            3 * nDownloader,
            MAX_URL_QSIZE,
            keyFunc=lambda url: urllib2.Request(url).get_host(),
            priorityFunc=self.getLastVisitTime)
        self._visitSite = {}
        self._lock = RLock()
        self._stopEvent = Event()

        ## url filters
        typeFilter = urlFilter.FileTypeFilter(True, ['text/html'])
        # The robot filter is constructed but currently not installed
        # (its addFilter call is disabled below).
        robotFilter = urlFilter.RobotFilter(Downloader.DEFAULT_USER_AGENT)
        self._urlDupEliminator = urlFilter.DupEliminator()
        for rejects in (typeFilter.disallow,
                        self._urlDupEliminator.seenBefore):
            self._urlFrontier.addFilter(rejects)
        # self._urlFrontier.addFilter(robotFilter.disallow)

        ## connection settings, database client and sockets
        self._manager = manager
        self._regPort = regPort
        self._urlPort = urlPort
        self._thisHost = socket.gethostbyname(socket.gethostname())
        self._dbclient = MongoClient(manager, dbPort)
        zmqContext = zmq.Context()
        self._regSocket = zmqContext.socket(zmq.REQ)
        registryAddress = "tcp://%s:%d" % (manager, self._regPort)
        self._regSocket.connect(registryAddress)
        self._urlPushSocket = zmqContext.socket(zmq.PUSH)

        self._urlPullSocket = zmqContext.socket(zmq.PULL)
        if self._urlPort is not None:
            self._urlPullSocket.bind("tcp://*:%d" % self._urlPort)
        else:
            # No port requested: bind a random one and remember it, since
            # the engine log file name below depends on it.
            self._urlPort = self._urlPullSocket.bind_to_random_port(
                "tcp://%s" % self._thisHost)

        ## log files
        if not os.path.exists("log"):
            os.makedirs("log")
        self._logger = logging.getLogger("engine")
        engineLogPath = os.path.abspath("log/engine%d.log" % self._urlPort)
        self._logger.addHandler(logging.FileHandler(engineLogPath))
        self._logger.setLevel(logging.WARNING)
        parserLogger = logging.getLogger("parser")
        parserLogPath = os.path.abspath("log/parser.log")
        parserLogger.addHandler(logging.FileHandler(parserLogPath))
        parserLogger.setLevel(logging.WARNING)

        ## downloader workers (marked as daemons) and the parser
        self._downloaders = []
        for _ in range(nDownloader):
            worker = Downloader(self._urlFrontier, self._pageQ,
                                self._logger, self.updateLastVisitTime)
            worker.daemon = True
            self._downloaders.append(worker)
        self._parser = Parser(self._pageQ, self._urlPushSocket,
                              self._dbclient, parserLogger)
 def test_instantiation(self):
     """Constructing Frontier(6) must yield a Frontier instance."""
     self.assertIsInstance(Frontier(6), Frontier)