def testLoadWikipediaTitlesIgnoresIgnorablePages(self):
    """
    Category, file, portal, etc. pages are skipped by
    L{loadWikipediaTitles}: after parsing C{ignored.xml} the page handler
    holds no pages.
    """
    loadWikipediaTitles(sibpath(__file__, 'ignored.xml'), self.pageHandler)
    collected = self.pageHandler.pages
    self.assertEqual([], collected)
def testLoadWikipediaTitlesClosesPageHandler(self):
    """
    Once L{loadWikipediaTitles} has finished parsing the XML document, the
    page handler's C{close} method has been called, which this test observes
    through the handler's C{closed} attribute.
    """
    handler = self.pageHandler
    xmlPath = sibpath(__file__, 'basic.xml')
    loadWikipediaTitles(xmlPath, handler)
    self.assertTrue(handler.closed)
def testLoadWikipediaTitles(self):
    """
    Parsing C{basic.xml} with L{loadWikipediaTitles} hands the page handler
    exactly one L{WikipediaPage}, whose title is C{'Anaconda'}.
    """
    xmlPath = sibpath(__file__, 'basic.xml')
    loadWikipediaTitles(xmlPath, self.pageHandler)
    # Unpacking fails unless the handler collected exactly one page.
    (onlyPage,) = self.pageHandler.pages
    self.assertEqual('Anaconda', onlyPage.title)
def main(inputFilename, outputPath):
    """Convert a Wikipedia XML file's page titles into JSON files.

    Logging is configured at C{INFO} level before a L{WikipediaPageHandler}
    for C{outputPath} is created and passed to L{loadWikipediaTitles}.

    @param inputFilename: The path to the Wikipedia XML file to load data
        from.
    @param outputPath: The path to write JSON files to.
    """
    basicConfig(level=INFO,
                format='%(asctime)s %(levelname)8s %(message)s')
    handler = WikipediaPageHandler(outputPath)
    loadWikipediaTitles(inputFilename, handler)
def testLoadWikipediaTitlesIgnoresDisambiguationPages(self):
    """
    Disambiguation pages are skipped by L{loadWikipediaTitles}: after
    parsing C{disambiguation.xml} the page handler holds no pages.
    """
    fixture = sibpath(__file__, 'disambiguation.xml')
    loadWikipediaTitles(fixture, self.pageHandler)
    collected = self.pageHandler.pages
    self.assertEqual([], collected)
def testLoadWikipediaTitlesIgnoresRedirectPages(self):
    """
    Redirect pages are skipped by L{loadWikipediaTitles}: after parsing
    C{redirect.xml} the page handler holds no pages.
    """
    loadWikipediaTitles(sibpath(__file__, 'redirect.xml'), self.pageHandler)
    self.assertEqual([], self.pageHandler.pages)