def runSpider(self, iterations):
    startTime = time.time()
    for _ in range(iterations):
        # Reload the crawl frontier and the set of already-crawled links from disk.
        self.queue = FileIO.fileToSet(self.queueFile)
        self.crawled = FileIO.fileToSet(self.crawledFile)
        newLinks = set()
        newCrawledLinks = set()
        # Crawl every link currently in the frontier, collecting its out-links.
        while self.queue:
            nextLink = self.queue.pop()
            res = self.crawlPage(nextLink)
            newCrawledLinks.add(nextLink)
            newLinks = newLinks.union(res)
        # Replace the frontier with the newly discovered links and record
        # the links crawled during this iteration.
        FileIO.deleteFileContents(self.queueFile)
        FileIO.setToFile(newLinks, self.queueFile)
        FileIO.setToFile(newCrawledLinks, self.crawledFile)
    # Persist the out-link / in-link graphs built during the crawl
    # (the same writes performed by saveLinkGraphs).
    FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
    FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
    log('time', "Crawler for " + self.siteName + " execution finished. Runtime: "
        + str(time.time() - startTime) + " seconds. Total links crawled: "
        + str(self.numCrawled))

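# Illustration only: the frontier round-trip in runSpider depends on the FileIO
# helpers (fileToSet / setToFile / deleteFileContents). The sketch below shows the
# *assumed* semantics, using hypothetical names so it does not shadow the real
# FileIO module; the actual implementation may differ.
def _sketch_fileToSet(path):
    # One link per line; a missing file is treated as an empty set.
    try:
        with open(path, 'r') as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def _sketch_setToFile(links, path):
    # Append one link per line; since the crawled file is never cleared above,
    # appending would preserve the crawl history across iterations (assumption).
    with open(path, 'a') as f:
        for link in links:
            f.write(link + '\n')
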
def runParser(self):
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    if not self.links:
        log('error', 'Crawled file is empty.')
        return self
    # Load the existing index and add an entry for every crawled link
    # that has not been indexed yet.
    data = FileIO.readJsonFile(self.indexFile)
    for link in self.links:
        if link not in data:
            obj = extractData(link)
            data[link] = {
                'docId': DataParser.docId,
                'title': obj['title'],
                'body': obj['body']
            }
            DataParser.docId += 1
    # Rewrite the index file with the updated contents.
    FileIO.deleteFileContents(self.indexFile)
    FileIO.writeJsonFile(data, self.indexFile)
    # Return self for consistency with the early-exit branches above.
    return self

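# Illustration only: runParser assumes extractData(link) returns a dict with at
# least 'title' and 'body' keys. A minimal stub of that contract follows
# (hypothetical name; the real extractData presumably downloads and parses the page):
def _sketch_extractData(url):
    # Stand-in values; a real implementation would fetch `url` and extract
    # the page title and its visible text.
    return {'title': 'Title of ' + url, 'body': 'Visible text of ' + url}
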
def saveLinkGraphs(self):
    FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
    FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
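
# Usage sketch (the class name `Spider` and its constructor arguments are
# hypothetical; the real crawler class and its configuration may differ):
#
#     spider = Spider(siteName='example')
#     spider.runSpider(iterations=3)   # crawl, refreshing the frontier 3 times
#     spider.runParser()               # extend the JSON index with new pages
#     spider.saveLinkGraphs()          # persist the out-link / in-link graphs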