def crawlingThread(self):
    """Take URLs from the urlToVisit queue and visit them.

    Runs until self.isActive is cleared. For every dequeued URL the
    scrapper produces a session; successful visits are reported back on
    outputQueue as VISITED (plus SCRAPPED_URL in dynamic mode), failed
    ones as SKIPPED. Any uncaught exception is logged and stops the
    thread.
    """
    logger.log(logging.DEBUG, "CrawlingThread started")

    self.scrapper = scrapping.Scrapper(self.config.userAgent,
                                       self.config.robotParserEnabled,
                                       self.config.domainRestricted,
                                       self.config.crawling)

    while self.isActive:
        try:
            urlList = protocol.deQueue([self.urlToVisit])

            if not urlList:
                time.sleep(0.2)  # temp - For testing
                continue

            for url in urlList:
                session = self.scrapper.visit(url)
                logger.log(
                    logging.DEBUG,
                    "Session \n" + str(session.url) +
                    "\nCode : " + str(session.returnCode) +
                    "\nRequest time : " + str(session.requestTime) +
                    "\nBs time : " + str(session.bsParsingTime))

                if not session.failed:
                    # In dynamic mode the URLs scrapped from the page are
                    # sent back to the server so it can queue them.
                    if self.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                        payload = protocol.URLPayload(
                            session.scrappedURLs,
                            protocol.URLPayload.SCRAPPED_URL)
                        packet = protocol.Packet(protocol.URL, payload)
                        self.outputQueue.put(packet)

                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.VISITED, session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)
                else:
                    logger.log(logging.INFO, "Skipping URL : " + url)
                    # session passed by keyword for consistency with the
                    # VISITED branch above (was positional).
                    payload = protocol.URLPayload(
                        [url], protocol.URLPayload.SKIPPED, session=session)
                    packet = protocol.Packet(protocol.URL, payload)
                    self.outputQueue.put(packet)

        except Exception:
            # Narrowed from a bare except so SystemExit and
            # KeyboardInterrupt still propagate; log the full traceback
            # and deactivate the thread.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            message = ''.join(traceback.format_exception(
                exc_type, exc_value, exc_traceback))
            logger.log(logging.CRITICAL, message)
            self.isActive = False
def mainRoutine(self):
    """Main server dispatch loop.

    Seeds the clients with the configured root URLs, then either keeps
    feeding URLs from urlToVisit until the request limit is reached
    (dynamic crawling) or waits until every root URL has been visited
    or skipped (static crawling). Finally disconnects all clients.
    """
    logger.log(logging.INFO, "Starting server mainRoutine")

    # NOTE(review): urlVisited, outputQueue, urlToVisit, visitedURLlist
    # and skippedURLlist are referenced as bare globals while sibling
    # state goes through self — confirm they are module-level on purpose.
    for url in self.configurationPayload.config.rootUrls:
        payload = protocol.URLPayload([str(url)],
                                      protocol.URLPayload.TOVISIT)
        packet = protocol.Packet(protocol.URL, payload)
        urlVisited[url] = True
        outputQueue.put(packet)

        # Throttle the initial burst in static mode when a crawl delay
        # is configured. (The former inner duplicate of the
        # crawlDelay != 0 test was redundant and has been removed.)
        if self.configurationPayload.crawlingType == protocol.ConfigurationPayload.STATIC_CRAWLING and (
                self.configurationPayload.config.crawlDelay != 0):
            time.sleep(self.configurationPayload.config.crawlDelay)

    while self.isActive:
        try:
            if self.configurationPayload.crawlingType == protocol.ConfigurationPayload.DYNAMIC_CRAWLING:
                # Blocking get: wait for the next URL to dispatch.
                url = urlToVisit.get(True)

                payload = protocol.URLPayload([str(url)],
                                              protocol.URLPayload.TOVISIT)
                packet = protocol.Packet(protocol.URL, payload)
                outputQueue.put(packet)
                self.requestCount = self.requestCount + 1

                if self.configurationPayload.config.crawlDelay != 0:
                    time.sleep(self.configurationPayload.config.crawlDelay)

                # 0 means "no limit".
                if self.requestLimit != 0 and len(
                        visitedURLlist) + 1 > self.requestLimit:
                    break
            elif self.configurationPayload.crawlingType == protocol.ConfigurationPayload.STATIC_CRAWLING:
                # Done once every root URL is either visited or skipped.
                if (len(skippedURLlist + visitedURLlist) ==
                        len(self.configurationPayload.config.rootUrls)):
                    break
                else:
                    time.sleep(0.3)
        except Exception:
            # Narrowed from a bare except; log the traceback and keep
            # the dispatch loop alive.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            message = "\n" + ''.join(traceback.format_exception(
                exc_type, exc_value, exc_traceback))
            logger.log(logging.ERROR, message)

    logger.log(logging.INFO, "Scrapping complete. Terminating...")
    self.disconnectAllClient()
    self.isActive = False
def sendConfig(self, configuration):
    """Send the configuration to the client and wait for its ACK.

    Args:
        configuration: the configuration payload to transmit.

    Raises:
        Exception: when the reply (read with a 5s timeout) is anything
        other than an INFO packet carrying CLIENT_ACK.
    """
    logger.log(logging.DEBUG, self.formattedAddr + "Sending configuration")
    self.configuration = configuration
    packet = protocol.Packet(protocol.CONFIG, self.configuration)
    self.writeSocket(packet)
    logger.log(logging.DEBUG,
               self.formattedAddr + "Configuration sent waiting for ACK")

    packet = self.readSocket(5)
    if (packet.type == protocol.INFO and
            packet.payload.info == protocol.InfoPayload.CLIENT_ACK):
        logger.log(
            logging.DEBUG,
            self.formattedAddr + "Working node ACK received (configuration)")
        return

    # Bug fix: an INFO packet whose payload was NOT a CLIENT_ACK used to
    # fall through both conditionals and return silently as success.
    # Now anything other than an explicit ACK deactivates and raises.
    self.isActive = False
    raise Exception("Unable to transmit configuration")
def readConfig(self):
    """Read the configuration from the server and install its modules.

    Receives a CONFIG packet, writes the rule.py and scrapping.py
    sources it carries into the local modules/ directory, verifies that
    both compile, reloads modules.scrapping and ACKs the server. A
    compile failure terminates the process; any other failure is logged
    and deactivates the client.
    """
    logger.log(logging.DEBUG, "Waiting for configuration from the server.")

    if self.isActive:
        try:
            deserializedPacket = self.readSocket()
            logger.log(logging.DEBUG, "Configuration received.")

            if deserializedPacket.type == protocol.CONFIG:
                self.crawlingType = deserializedPacket.payload.crawlingType
                self.config = deserializedPacket.payload.config

                # dynamic module reload
                basePath = os.path.dirname(sys.argv[0])
                if basePath:
                    basePath = basePath + "/"

                # path building
                rulePath = basePath + "modules/rule.py"
                scrappingPath = basePath + "modules/scrapping.py"

                # re-writing source .py — context managers guarantee the
                # files are closed even if a write fails (the originals
                # leaked the descriptors on error)
                logger.log(logging.INFO, "Importing rule.py from server")
                with open(rulePath, 'w') as ruleFd:
                    ruleFd.write(self.config.rule_py)

                logger.log(logging.INFO,
                           "Importing scrapping.py from server")
                with open(scrappingPath, 'w') as scrappingFd:
                    scrappingFd.write(self.config.scrapping_py)

                # compilation test. Bug fix: this used sys.exit(0), which
                # reported *success* to the shell on a compile failure —
                # and the old bare outer except swallowed the SystemExit
                # entirely, so the exit never even happened. Now we exit
                # with a failure status and let SystemExit propagate past
                # the narrowed except Exception below.
                try:
                    with open(rulePath, 'rU') as ruleFd:
                        code = ruleFd.read()
                    compile(code, "rule_test", "exec")
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    message = ''.join(traceback.format_exception(
                        exc_type, exc_value, exc_traceback))
                    logger.log(logging.CRITICAL, message)
                    logger.log(logging.ERROR,
                               "Unable to compile rule.py "
                               "(is the syntax right?)")
                    sys.exit(1)

                try:
                    with open(scrappingPath, 'rb') as scrappingFd:
                        code = scrappingFd.read(
                            os.path.getsize(scrappingPath))
                    compile(code, "scrapping_test", "exec")
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    message = ''.join(traceback.format_exception(
                        exc_type, exc_value, exc_traceback))
                    logger.log(logging.CRITICAL, message)
                    logger.log(logging.ERROR,
                               "Unable to compile scrapping.py "
                               "(is the syntax right?)")
                    sys.exit(1)

                # dynamic reload of modules
                # TODO reloading of rule.py should eventually come here
                logger.log(logging.INFO,
                           "Reloading modules imported for server")
                reload(sys.modules["modules.scrapping"])

                payload = protocol.InfoPayload(
                    protocol.InfoPayload.CLIENT_ACK)
                packet = protocol.Packet(protocol.INFO, payload)
                self.writeSocket(packet)
                logger.log(logging.DEBUG, "Sending ACK for configuration.")
            else:
                raise Exception("Unable to parse configuration.")
        except Exception:
            # Narrowed from a bare except: SystemExit (from the compile
            # failures above) and KeyboardInterrupt now propagate.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            message = ''.join(traceback.format_exception(
                exc_type, exc_value, exc_traceback))
            logger.log(logging.CRITICAL, message)
            self.isActive = False