def test_segment_and_write_all_articles(self):
        """Write the corpus with interlinks enabled and verify the first JSON record.

        The first line of the output file is decoded as JSON and its title,
        section titles, interlink count and first three interlinks are compared
        against the expected values.
        """
        out_path = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(
            self.fname, out_path, workers=1, include_interlinks=True)

        # Only the first record is inspected, so a single line is enough.
        with open(out_path) as fin:
            first_line = next(fin)

        record = json.loads(first_line)
        title = record['title']
        section_titles = record['section_titles']
        interlinks = record['interlinks']

        self.assertEqual(title, self.expected_title)
        self.assertEqual(section_titles, self.expected_section_titles)

        self.assertEqual(len(interlinks), 685)

        # JSON decoding yields lists where tuples are expected, so every
        # interlink is converted to a tuple before it is compared.
        expected_leading_interlinks = [
            ("political philosophy", "political philosophy"),
            ("self-governance", "self-governed"),
            ("stateless society", "stateless societies"),
        ]
        for position, expected_pair in enumerate(expected_leading_interlinks):
            self.assertEqual(tuple(interlinks[position]), expected_pair)
Exemplo n.º 2
0
    def test_json_len(self):
        """Check the number of lines written to the output file.

        The test treats each line of the output as one article and expects 106.
        """
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1)

        expected_num_articles = 106
        # Use a context manager so the handle is closed deterministically
        # instead of being left for the garbage collector.
        with smart_open(tmpf) as fin:
            num_articles = sum(1 for _ in fin)
        self.assertEqual(num_articles, expected_num_articles)
Exemplo n.º 3
0
    def test_json_len(self):
        """Verify that the written output file contains the expected 106 lines.

        Each output line is counted as one article.
        """
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1)

        expected_num_articles = 106

        # The file is opened inside ``with`` so it is always closed, even if
        # iteration raises part-way through.
        with smart_open(tmpf) as output:
            num_articles = sum(1 for _line in output)

        self.assertEqual(num_articles, expected_num_articles)
Exemplo n.º 4
0
 def run(self, directory):
     '''
     Reads Wikipedia XML export files and produces JSON files containing extracted metadata.

     Every regular file in `directory` whose name ends with `self.FILE_SUFFIX` is passed to
     `segment_wiki.segment_and_write_all_articles` with `min_article_character=200` and
     `include_interlinks=True`. The output is written into the same directory, under the
     input file name with '.xml' replaced by '.json' and '.bz2' replaced by '.gz'.

     Arguments:
         directory: string, directory containing the Wikipedia XML export files to be processed
     '''
     names = [
         name for name in os.listdir(directory)
         if os.path.isfile(os.path.join(directory, name)) and name.endswith(self.FILE_SUFFIX)
     ]
     for name in names:
         # Rewrite the extensions on the file name only: applying the replacement to the
         # full path would also alter a directory component that contains '.xml' or '.bz2'.
         output_name = name.replace('.xml', '.json').replace('.bz2', '.gz')
         segment_wiki.segment_and_write_all_articles(
             os.path.join(directory, name),
             os.path.join(directory, output_name),
             min_article_character=200,
             include_interlinks=True,
         )
Exemplo n.º 5
0
    def test_segment_and_write_all_articles(self):
        """Write the corpus with interlinks enabled and verify the first JSON record.

        The first output line is decoded as JSON; its title, section titles and
        three entries of its interlinks mapping are compared to expected values.
        """
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True)

        # Get the first line from the text file we created.
        with open(tmpf) as f:
            first = next(f)

        # decode JSON line into a Python dictionary object
        article = json.loads(first)
        title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks']

        self.assertEqual(title, self.expected_title)
        self.assertEqual(section_titles, self.expected_section_titles)

        # Check interlinks; assertEqual reports both values on a mismatch,
        # unlike assertTrue(a == b), which only reports "False is not true".
        self.assertEqual(interlinks['self-governance'], 'self-governed')
        self.assertEqual(interlinks['Hierarchy'], 'hierarchical')
        self.assertEqual(interlinks['Pierre-Joseph Proudhon'], 'Proudhon')
Exemplo n.º 6
0
    def test_segment_and_write_all_articles(self):
        """Segment the corpus with interlinks and check the first record written.

        Only the first line of the output file is read. It is parsed as JSON and
        its title, section titles and selected interlinks are verified.
        """
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True)

        # Get the first line from the text file we created.
        with open(tmpf) as fin:
            first = next(fin)

        # decode JSON line into a Python dictionary object
        article = json.loads(first)
        title = article['title']
        section_titles = article['section_titles']
        interlinks = article['interlinks']

        self.assertEqual(title, self.expected_title)
        self.assertEqual(section_titles, self.expected_section_titles)

        # Check interlinks. assertEqual is used so that a failure shows the
        # actual and expected values rather than a bare boolean.
        expected_interlinks = [
            ('self-governance', 'self-governed'),
            ('Hierarchy', 'hierarchical'),
            ('Pierre-Joseph Proudhon', 'Proudhon'),
        ]
        for key, expected_value in expected_interlinks:
            self.assertEqual(interlinks[key], expected_value)