Exemplo n.º 1
0
    def runSpider(self, iterations):
        startTime = time.time()
        for i in range(0, iterations):
            self.queue = FileIO.fileToSet(self.queueFile)
            self.crawled = FileIO.fileToSet(self.crawledFile)

            newLinks = set()
            newCrawledLinks = set()

            while (len(self.queue) != 0):
                nextLink = self.queue.pop()
                res = self.crawlPage(nextLink)
                newCrawledLinks.add(nextLink)
                newLinks = newLinks.union(res)

            FileIO.deleteFileContents(self.queueFile)
            FileIO.setToFile(newLinks, self.queueFile)
            FileIO.setToFile(newCrawledLinks, self.crawledFile)

        FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
        FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)

        log(
            'time', "Crawler for " + self.siteName +
            " execution Finished. Runtime: " + str(time.time() - startTime) +
            "seconds. Total links crawled: " + str(self.numCrawled))
Exemplo n.º 2
0
 def runParser(self):
     if not os.path.isfile(self.crawledFile):
         log('error', 'No crawled file.')
         return self
     self.links = FileIO.fileToSet(self.crawledFile)
     if not self.links:
         log('error', 'Crawled file is empty')
         return self
     data = FileIO.readJsonFile(self.indexFile)
     for link in self.links:
         if link not in data:
             obj = extractData(link)
             data[link] = {
                 'docId': DataParser.docId,
                 'title': obj['title'],
                 'body': obj['body']
             }
             DataParser.docId += 1
     FileIO.deleteFileContents(self.indexFile)
     FileIO.writeJsonFile(data, self.indexFile)
Exemplo n.º 3
0
 def saveLinkGraphs(self):
     FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
     FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)