Exemplo n.º 1
0
 def testLoadWikipediaTitlesIgnoresIgnorablePages(self):
     """
     Category, file, portal and similar pages are skipped by
     L{loadWikipediaTitles}, so the page handler ends up with no pages.
     """
     handler = self.pageHandler
     # The fixture is located relative to this file via sibpath.
     loadWikipediaTitles(sibpath(__file__, 'ignored.xml'), handler)
     self.assertEqual([], handler.pages)
Exemplo n.º 2
0
 def testLoadWikipediaTitlesClosesPageHandler(self):
     """
     Once L{loadWikipediaTitles} has finished parsing the XML document,
     the page handler's C{close} method has been called.
     """
     handler = self.pageHandler
     loadWikipediaTitles(sibpath(__file__, 'basic.xml'), handler)
     # The handler exposes a C{closed} flag that is checked here.
     self.assertTrue(handler.closed)
Exemplo n.º 3
0
 def testLoadWikipediaTitles(self):
     """
     Each C{page} element in the given XML file results in a
     L{WikipediaPage} instance being produced by L{loadWikipediaTitles}.
     """
     handler = self.pageHandler
     loadWikipediaTitles(sibpath(__file__, 'basic.xml'), handler)
     # Unpacking fails unless exactly one page was collected.
     (onlyPage,) = handler.pages
     self.assertEqual('Anaconda', onlyPage.title)
def main(inputFilename, outputPath):
    """Read Wikipedia page titles from an XML file and emit JSON files.

    Logging is configured at C{INFO} level before any loading starts.

    @param inputFilename: The path to the Wikipedia XML file to load data from.
    @param outputPath: The path to write JSON files to.
    """
    logFormat = '%(asctime)s %(levelname)8s  %(message)s'
    basicConfig(format=logFormat, level=INFO)
    # The handler is built after logging is configured, as in the
    # original statement order.
    loadWikipediaTitles(inputFilename, WikipediaPageHandler(outputPath))
Exemplo n.º 5
0
 def testLoadWikipediaTitlesIgnoresDisambiguationPages(self):
     """Disambiguation pages are skipped by L{loadWikipediaTitles}."""
     handler = self.pageHandler
     loadWikipediaTitles(sibpath(__file__, 'disambiguation.xml'), handler)
     # No pages should have been handed to the page handler.
     self.assertEqual([], handler.pages)
Exemplo n.º 6
0
 def testLoadWikipediaTitlesIgnoresRedirectPages(self):
     """Pages that are redirects are skipped by L{loadWikipediaTitles}."""
     handler = self.pageHandler
     loadWikipediaTitles(sibpath(__file__, 'redirect.xml'), handler)
     # No pages should have been handed to the page handler.
     self.assertEqual([], handler.pages)