def test_read_gold_standard(self):
    """Check ``read_gold_standard`` output for each encoding-named fixture.

    For every key in the table below, ``read_gold_standard(FIXTURES, key)``
    is called and its result is indexed: element 0 must equal the expected
    content text and element 1 must equal the expected comments text.
    """
    # Key: the name handed to read_gold_standard (presumably the fixture's
    # file root, named after its encoding -- confirm against FIXTURES).
    # Value: the text expected on the final content line of that fixture.
    expected_last_line = {
        'ascii': u'ascii yo!',
        'iso-8859-1': u'\xd3',
        'utf-8': u'\xae',
        'utf-16': u'\xae',
    }
    for name, last_line in expected_last_line.items():
        content_comments = data_processing.read_gold_standard(FIXTURES, name)
        expected_content = u"Content here\nmore content\n" + last_line + u"\n"
        self.assertEqual(content_comments[0], expected_content)
        self.assertEqual(content_comments[1], '\nsome comments\n')
def test_list_all_files(self):
    """Check ``get_list_all_corrected_files`` on a freshly built directory.

    Creates a temporary directory containing a ``Corrected`` subdirectory
    with one ``<prefix>.html.corrected.txt`` file per prefix, then asserts
    that the sorted result is a list of ``(path, prefix)`` tuples for
    exactly those files.  The temporary directory is removed afterwards,
    even when an assertion fails.
    """
    # Already in sorted order, so the expected list built from it lines up
    # with the sorted result below.
    prefixes = ['bbc.co.story', 'f1', 'sad8-2sdkfj']
    datadir = tempfile.mkdtemp()
    try:
        os.mkdir(datadir + '/Corrected')
        for froot in prefixes:
            path = datadir + '/Corrected/%s.html.corrected.txt' % froot
            with open(path, 'w') as f:
                f.write('.')

        all_files = data_processing.get_list_all_corrected_files(datadir)
        all_files.sort()

        expected = [
            ('%s/Corrected/%s.html.corrected.txt' % (datadir, froot), froot)
            for froot in prefixes
        ]
        self.assertEqual(all_files, expected)
    finally:
        # Clean up on every exit path so failed runs do not leak temp dirs.
        rmtree(datadir)