def test_segment_and_write_all_articles(self):
    """Segment the fixture into a JSON-lines file and verify its first record.

    The first line of the output is parsed as JSON and its title, section
    titles and interlinks are compared against the expected values.
    """
    output_path = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, output_path, workers=1, include_interlinks=True)

    # Only the first line of the written file is inspected.
    with open(output_path) as handle:
        first_line = next(handle)

    # Each line is a JSON document; decode it into a dictionary.
    record = json.loads(first_line)
    title = record['title']
    section_titles = record['section_titles']
    interlinks = record['interlinks']

    self.assertEqual(title, self.expected_title)
    self.assertEqual(section_titles, self.expected_section_titles)

    # Interlinks: JSON has no tuple type, so every pair comes back as a list
    # and is converted to a tuple before being compared.
    self.assertEqual(len(interlinks), 685)
    expected_leading_pairs = (
        ("political philosophy", "political philosophy"),
        ("self-governance", "self-governed"),
        ("stateless society", "stateless societies"),
    )
    for position, expected_pair in enumerate(expected_leading_pairs):
        self.assertEqual(tuple(interlinks[position]), expected_pair)
def test_json_len(self):
    """Check that segmenting the fixture writes the expected number of lines.

    The output file is counted line by line and compared with the expected
    article count (106).
    """
    tmpf = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, tmpf, workers=1)

    expected_num_articles = 106
    # Open the output in a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with smart_open(tmpf) as fin:
        num_articles = sum(1 for _ in fin)
    self.assertEqual(num_articles, expected_num_articles)
def test_json_len(self):
    """Check that segmenting the fixture writes the expected number of lines.

    The output file is counted line by line and compared with the expected
    article count (106).
    """
    tmpf = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, tmpf, workers=1)

    expected_num_articles = 106
    # Open the output in a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with smart_open(tmpf) as fin:
        num_articles = sum(1 for _ in fin)
    self.assertEqual(num_articles, expected_num_articles)
def run(self, directory):
    """Read Wikipedia XML export files and produce JSON files with extracted metadata.

    Every regular file directly inside ``directory`` whose name ends with
    ``self.FILE_SUFFIX`` is passed to
    ``segment_wiki.segment_and_write_all_articles``. The output is written
    next to the input, using the input file name with ``.xml`` replaced by
    ``.json`` and ``.bz2`` replaced by ``.gz``.

    Arguments:
        directory: string, directory containing the Wikipedia XML export
            files to be processed
    """
    # Collect the matching names up front, before any output file is created.
    names = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name)) and name.endswith(self.FILE_SUFFIX)
    ]

    for name in names:
        input_file = os.path.join(directory, name)
        # Rewrite the extensions on the file name only. Doing it on the full
        # path would also alter a directory component that happens to contain
        # '.xml' or '.bz2', sending the output to the wrong location.
        output_name = name.replace('.xml', '.json').replace('.bz2', '.gz')
        output_file = os.path.join(directory, output_name)
        segment_wiki.segment_and_write_all_articles(
            input_file, output_file, min_article_character=200, include_interlinks=True
        )
def test_segment_and_write_all_articles(self):
    """Segment the fixture into a JSON-lines file and verify its first record.

    The first line of the output is parsed as JSON and its title, section
    titles and a sample of its interlinks are compared with expected values.
    """
    tmpf = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True)

    # Get the first line from the text file we created.
    with open(tmpf) as f:
        first = next(f)

    # decode JSON line into a Python dictionary object
    article = json.loads(first)
    title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks']

    self.assertEqual(title, self.expected_title)
    self.assertEqual(section_titles, self.expected_section_titles)

    # Check interlinks. They are looked up by string key here (presumably the
    # linked article title mapped to its anchor text -- confirm against the
    # writer). assertEqual reports both values on failure, unlike
    # assertTrue(a == b), which only reports "False is not true".
    self.assertEqual(interlinks['self-governance'], 'self-governed')
    self.assertEqual(interlinks['Hierarchy'], 'hierarchical')
    self.assertEqual(interlinks['Pierre-Joseph Proudhon'], 'Proudhon')
def test_segment_and_write_all_articles(self):
    """Segment the fixture into a JSON-lines file and verify its first record.

    The first line of the output is parsed as JSON and its title, section
    titles and a sample of its interlinks are compared with expected values.
    """
    tmpf = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, tmpf, workers=1, include_interlinks=True)

    # Get the first line from the text file we created.
    with open(tmpf) as f:
        first = next(f)

    # decode JSON line into a Python dictionary object
    article = json.loads(first)
    title, section_titles, interlinks = article['title'], article['section_titles'], article['interlinks']

    self.assertEqual(title, self.expected_title)
    self.assertEqual(section_titles, self.expected_section_titles)

    # Check interlinks. They are looked up by string key here (presumably the
    # linked article title mapped to its anchor text -- confirm against the
    # writer). assertEqual reports both values on failure, unlike
    # assertTrue(a == b), which only reports "False is not true".
    self.assertEqual(interlinks['self-governance'], 'self-governed')
    self.assertEqual(interlinks['Hierarchy'], 'hierarchical')
    self.assertEqual(interlinks['Pierre-Joseph Proudhon'], 'Proudhon')