def __init__(self, url, dbFile, outputFile, maxCount=None):
    """Set up the crawler for one run.

    Args:
        url: Starting URL to crawl.
        dbFile: Path of the SQLite/db file used by DataHandler.
        outputFile: Path of the text file where fetched links are written.
        maxCount: Maximum number of links to fetch, or None for unlimited
            (stored internally as -1).
    """
    self.url = url  # url to be crawled
    if maxCount is None:
        # Sentinel: -1 means "no limit" for DataHandler.
        self.maxCount = -1
    else:
        # maxCount is incremented because the user's starting URL is itself
        # persisted in the repository and counts toward the total: e.g. if
        # the user crawls python.org asking for 2 links, the program stops
        # once 3 links are stored (python.org being one of them).
        self.maxCount = maxCount + 1
    self.extracter = LinkExtracter()
    self.dataHandler = DataHandler(self.maxCount, dbFile, outputFile)
    self.log = CrawlerLogger.getlogger()
def __init__(self, url, dbFile, outputFile, maxCount=None):
    """Set up the crawler for one run.

    Args:
        url: Starting URL to crawl.
        dbFile: Path of the db file handed to DataHandler.
        outputFile: Path of the text file where fetched links are written.
        maxCount: Maximum number of links to fetch; None means unlimited
            (represented internally by -1).
    """
    self.url = url  # url to be crawled
    if maxCount is None:
        self.maxCount = -1  # -1 == no limit
    else:
        # Add one to accommodate the user's starting URL, which is also
        # persisted in the repository: asking for 2 links from python.org
        # should terminate when 3 links are stored, since python.org itself
        # is one of them.
        self.maxCount = maxCount + 1
    self.extracter = LinkExtracter()
    self.dataHandler = DataHandler(self.maxCount, dbFile, outputFile)
    self.log = CrawlerLogger.getlogger()
def __init__(self):
    """Attach the shared crawler logger to this instance."""
    self.log = CrawlerLogger.getlogger()
sys.exit()
# NOTE(review): this span begins mid-branch — the sys.exit() above closes a
# conditional whose header is outside the visible chunk; `cmdlength`, `url`
# and `log` are presumably defined there. Reconstruction below assumes the
# `else` pairs with `if cmdlength == 3` — confirm against the full file.
if cmdlength == 3:
    # Third argv entry is the link budget; it must be a positive integer.
    try:
        maxlinks = int(sys.argv[2])
    except ValueError:
        print('Invalid maximum links')
        sys.exit()
    if maxlinks < 1:
        print("maximum links should be minimum 1")
        sys.exit()
else:
    print("Invalid number of arguments")
    sys.exit()

try:
    # Let Ctrl-C be handled gracefully by the project's signal_handler.
    signal.signal(signal.SIGINT, signal_handler)
    CrawlerLogger.init()
    log = CrawlerLogger.getlogger()
    # Normalize the root URL so relative links resolve consistently.
    if not url.endswith('/'):
        url = url + '/'
    crawler = Crawler(url, 'crawler.db', 'links.txt', maxlinks)
    print('Crawling ....')
    res = crawler.Crawl()
    if res:
        # Show the collected links to the user on success.
        webbrowser.open("links.txt")
except CrawlerError as ce:
    print(ce)
except Exception as e:
    # log is None until CrawlerLogger.init() succeeds, so guard before use.
    if log is not None:
        # exc_info=True lets logging capture the full current exception
        # (type, value, traceback) instead of just the traceback object.
        log.error(e, exc_info=True)