def runSitemapCrawler(self):
    """Crawl the site's XML sitemap tree, collecting page URLs.

    Starting from ``self.sitemapURL``, fetches every nested non-archive
    ``.xml`` sitemap it discovers; any non-XML link found inside a sitemap
    is treated as a page URL. The discovered page URLs replace the
    contents of ``self.crawledFile``.

    Side effects: HTTP GETs via ``requests``, rewrites ``self.crawledFile``,
    emits 'sitemap' and 'time' log entries.
    """
    startTime = time.time()
    # Mobile UA string — presumably so the server serves the same markup
    # the mobile crawler would see; TODO confirm against the caller.
    headers = {
        'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Mobile Safari/537.36"
    }
    xmlQueue = {self.sitemapURL}  # sitemap files still to fetch
    visitedXML = set()            # sitemap files already fetched
    htmlQueue = set()             # page (non-XML) URLs discovered

    log('sitemap', 'Crawling XML Sitemap for ' + self.siteName)
    while xmlQueue:
        url = xmlQueue.pop()
        # BUG FIX: the original kept no record of already-fetched sitemaps,
        # so sitemaps referencing each other (or listed twice) caused
        # redundant fetches and could loop forever. Skip anything seen.
        if url in visitedXML:
            continue
        visitedXML.add(url)
        nextParse = requests.get(url, headers=headers)
        for link in self.findNewLinksXML(nextParse):
            if '.xml' in link:
                # Nested sitemap: queue it unless it's an archive sitemap,
                # which this crawler deliberately skips.
                if 'archive' not in link:
                    xmlQueue.add(link)
            else:
                # Plain page URL — this is the crawl output.
                htmlQueue.add(link)

    # Replace (not append to) the crawled-URL file with this run's results.
    FileIO.deleteFileContents(self.crawledFile)
    FileIO.setToFile(htmlQueue, self.crawledFile)
    log(
        'time', 'Finished crawling XML sitemap for ' + self.siteName +
        ' in ' + str(time.time() - startTime) + ' seconds')
def runSpider(self, iterations):
    """Run the crawler for a fixed number of frontier expansions (BFS levels).

    Each iteration loads the current frontier from ``self.queueFile``,
    crawls every URL in it via ``self.crawlPage``, then persists the next
    frontier back to ``self.queueFile`` and writes the newly crawled URLs
    to ``self.crawledFile``. The outlink/inlink graphs are serialized once
    after all iterations complete.

    :param iterations: number of frontier expansions to perform
    """
    startTime = time.time()
    for _ in range(iterations):  # iteration index itself is unused
        self.queue = FileIO.fileToSet(self.queueFile)
        self.crawled = FileIO.fileToSet(self.crawledFile)
        newLinks = set()         # frontier for the next iteration
        newCrawledLinks = set()  # URLs crawled during this iteration
        while self.queue:
            nextLink = self.queue.pop()
            res = self.crawlPage(nextLink)
            newCrawledLinks.add(nextLink)
            newLinks |= res
        # Replace the frontier file; crawledFile is written without a
        # preceding delete — presumably setToFile appends there. Verify.
        FileIO.deleteFileContents(self.queueFile)
        FileIO.setToFile(newLinks, self.queueFile)
        FileIO.setToFile(newCrawledLinks, self.crawledFile)
    FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
    FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
    # NOTE: self.numCrawled is read here but updated elsewhere
    # (presumably inside crawlPage) — confirm against the class.
    # BUG FIX: original message concatenated the elapsed time directly
    # against "seconds" ("Runtime: 12.3seconds."); add the missing space,
    # matching the ' seconds' spacing used by runSitemapCrawler.
    log(
        'time', "Crawler for " + self.siteName +
        " execution Finished. Runtime: " + str(time.time() - startTime) +
        " seconds. Total links crawled: " + str(self.numCrawled))