Пример #1
0
    def test_read_gold_standard(self):
        """Check ``read_gold_standard`` against each fixture in ``FIXTURES``.

        Every key of ``chars`` is passed as the second argument to
        ``data_processing.read_gold_standard``.  The keys look like encoding
        names -- presumably there is one fixture per encoding; confirm
        against the fixtures directory.  Element 0 of the result must equal
        the shared content followed by the value stored for that key, and
        element 1 must equal the comments text.
        """
        chars = {
            'ascii': u'ascii yo!',
            'iso-8859-1': u'\xd3',
            'utf-8': u'\xae',
            'utf-16': u'\xae',
        }

        for name, tail in chars.items():
            content_comments = data_processing.read_gold_standard(
                FIXTURES, name)
            # Only the last line of the expected content differs per fixture.
            expected_content = u"Content here\nmore content\n" + tail + u"\n"
            self.assertEqual(content_comments[0], expected_content)
            self.assertEqual(content_comments[1], '\nsome comments\n')
Пример #2
0
    def test_list_all_files(self):
        """Check that ``get_list_all_corrected_files`` finds every corrected file.

        Creates a temporary data directory with a ``Corrected`` subdirectory
        holding one ``<prefix>.html.corrected.txt`` file per prefix, then
        asserts that the sorted result is the list of ``(path, prefix)``
        tuples for those files.  The temporary directory is removed even when
        the test fails.
        """
        prefixes = ['bbc.co.story', 'f1', 'sad8-2sdkfj']
        datadir = tempfile.mkdtemp()
        try:
            os.mkdir(datadir + '/Corrected')
            for froot in prefixes:
                with open(datadir + '/Corrected/%s.html.corrected.txt' % froot, 'w') as f:
                    f.write('.')

            all_files = data_processing.get_list_all_corrected_files(datadir)
            all_files.sort()
            # ``prefixes`` is already in ascending order, so the expected list
            # built from it matches the sorted result without further sorting.
            self.assertEqual(all_files,
                [('%s/Corrected/%s.html.corrected.txt' %
                    (datadir, froot), froot)
                  for froot in prefixes])
        finally:
            # Clean up on failure too, so a failing run does not leak the
            # temporary directory.
            rmtree(datadir)