# Example no. 1
# 0
    def crawlingThread(self):
        """Take URLs from the urlToVisit queue and visit them.

        Runs until ``self.isActive`` is cleared. Each URL pulled from the
        queue is visited by the scrapper; the outcome is reported on
        ``self.outputQueue`` as a ``protocol.Packet``:

        - failed session  -> SKIPPED payload for the URL
        - dynamic crawl   -> SCRAPPED_URL payload with discovered URLs
        - success         -> VISITED payload carrying the session

        Any unexpected exception is logged as CRITICAL and stops the thread.
        """
        logger.log(logging.DEBUG, "CrawlingThread started")

        # One scrapper per thread, configured from the shared config payload.
        self.scrapper = scrapping.Scrapper(self.config.userAgent,
                                           self.config.robotParserEnabled,
                                           self.config.domainRestricted,
                                           self.config.crawling)

        while self.isActive:
            try:
                urlList = protocol.deQueue([self.urlToVisit])

                if not urlList:
                    # Nothing queued yet; back off briefly instead of spinning.
                    time.sleep(0.2)  # temp - For testing
                    continue

                for url in urlList:
                    session = self.scrapper.visit(url)
                    logger.log(
                        logging.DEBUG, "Session \n" + str(session.url) +
                        "\nCode : " + str(session.returnCode) +
                        "\nRequest time : " + str(session.requestTime) +
                        "\nBs time : " + str(session.bsParsingTime))

                    # Guard clause: report failed visits as SKIPPED and move on.
                    if session.failed:
                        logger.log(logging.INFO, "Skipping URL : " + url)
                        payload = protocol.URLPayload(
                            [url], protocol.URLPayload.SKIPPED, session)
                        packet = protocol.Packet(protocol.URL, payload)
                        self.outputQueue.put(packet)
                        continue

                    if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                        # Dynamic crawl: feed newly scrapped URLs back upstream.
                        payload = protocol.URLPayload(
                            session.scrappedURLs,
                            protocol.URLPayload.SCRAPPED_URL)
                        packet = protocol.Packet(protocol.URL, payload)
                        self.outputQueue.put(packet)

                    payload = protocol.URLPayload(
                        [url],
                        protocol.URLPayload.VISITED,
                        session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)

            except Exception:
                # Narrowed from a bare except so SystemExit/KeyboardInterrupt
                # still propagate; traceback.format_exc() replaces the manual
                # sys.exc_info() + format_exception dance with the same text.
                logger.log(logging.CRITICAL, traceback.format_exc())
                self.isActive = False
# Example no. 2
# 0
    def mainRoutine(self):
        """Seed the crawl with the root URLs and drive the main server loop.

        First emits one TOVISIT packet per configured root URL. Then, per
        crawling type:

        - DYNAMIC: blocks on ``urlToVisit``, re-emits each URL as a TOVISIT
          packet, honours the configured crawl delay, and stops once the
          request limit (if non-zero) is reached.
        - STATIC: waits until every root URL appears in the visited or
          skipped lists.

        On exit, disconnects all clients and clears ``self.isActive``.

        NOTE(review): ``urlVisited``, ``outputQueue``, ``urlToVisit``,
        ``visitedURLlist`` and ``skippedURLlist`` are used as bare names —
        presumably module-level globals; confirm against the rest of the file.
        """
        logger.log(logging.INFO, "Starting server mainRoutine")

        for url in self.configurationPayload.config.rootUrls:
            payload = protocol.URLPayload([str(url)],
                                          protocol.URLPayload.TOVISIT)
            packet = protocol.Packet(protocol.URL, payload)
            urlVisited[url] = True
            outputQueue.put(packet)

            # Honour the crawl delay between root URLs in static mode.
            # (Single combined check — the delay condition only needs testing once.)
            if (self.configurationPayload.crawlingType ==
                    protocol.ConfigurationPayload.STATIC_CRAWLING and
                    self.configurationPayload.config.crawlDelay != 0):
                time.sleep(self.configurationPayload.config.crawlDelay)

        while self.isActive:
            try:
                if self.configurationPayload.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                    # Blocking get: wait for the next URL discovered by clients.
                    url = urlToVisit.get(True)
                    payload = protocol.URLPayload([str(url)],
                                                  protocol.URLPayload.TOVISIT)
                    packet = protocol.Packet(protocol.URL, payload)
                    outputQueue.put(packet)
                    self.requestCount += 1

                    if self.configurationPayload.config.crawlDelay != 0:
                        time.sleep(self.configurationPayload.config.crawlDelay)

                    # Stop once the configured request limit is reached
                    # (0 means unlimited).
                    if self.requestLimit != 0 and len(
                            visitedURLlist) + 1 > self.requestLimit:
                        break

                elif self.configurationPayload.crawlingType == protocol.ConfigurationPayload.STATIC_CRAWLING:
                    # Static crawl is done when every root URL is accounted for.
                    # len(a) + len(b) avoids building a throwaway concatenated
                    # list just to take its length.
                    if (len(skippedURLlist) + len(visitedURLlist) ==
                            len(self.configurationPayload.config.rootUrls)):
                        break
                    time.sleep(0.3)
            except Exception:
                # Narrowed from a bare except; log the traceback and keep
                # the loop alive — errors here are not fatal to the server.
                logger.log(logging.ERROR, "\n" + traceback.format_exc())

        logger.log(logging.INFO, "Scrapping complete. Terminating...")
        self.disconnectAllClient()
        self.isActive = False