Пример #1
0
 def test_create_output_dir_if_it_does_not_exist(self, mocked_input):
     # Runs the first configured step with 'test_creating_dir' as the output
     # directory and checks that both RF1_sents files end up inside it.
     # mocked_input is primed with a single 'y' answer (presumably for a
     # confirmation prompt -- confirm against OpusFilter).
     mocked_input.side_effect = ['y']
     test_config = {
         'common': {'output_directory': 'test_creating_dir'},
         'steps': [self.configuration['steps'][0]],
     }
     try:
         OpusFilter(test_config).execute_steps()
         self.assertTrue(os.path.isfile('test_creating_dir/RF1_sents.en'))
         self.assertTrue(os.path.isfile('test_creating_dir/RF1_sents.sv'))
     finally:
         # Remove the directory even when a step or an assertion fails;
         # otherwise it would already exist on the next run of this test.
         shutil.rmtree('test_creating_dir', ignore_errors=True)
Пример #2
0
 def setUp(self):
     """Prepare a temp directory, an OpusFilter and two six-line inputs.

     The OpusFilter is configured with the temp directory as its
     ``output_directory`` and an empty step list. ``input_src`` holds
     ``Sent_0`` .. ``Sent_5`` and ``input_tgt`` holds ``sent_0`` ..
     ``sent_5``, one per line.
     """
     scratch = tempfile.mkdtemp()
     self.tempdir = scratch
     config = {'common': {'output_directory': scratch}, 'steps': []}
     self.opus_filter = OpusFilter(config)
     templates = (('input_src', 'Sent_{}\n'), ('input_tgt', 'sent_{}\n'))
     for filename, template in templates:
         lines = [template.format(idx) for idx in range(6)]
         with open(os.path.join(scratch, filename), 'w') as handle:
             handle.write(''.join(lines))
Пример #3
0
 def setUp(self):
     """Prepare a temp directory, an OpusFilter and two four-line inputs.

     The OpusFilter gets the temp directory as ``output_directory`` and
     no steps. ``input_src`` and ``input_tgt`` hold the same four
     sentences in the order 3, 4, 2, 1; the target side is lower-cased.
     """
     scratch = tempfile.mkdtemp()
     self.tempdir = scratch
     self.opus_filter = OpusFilter(
         {'common': {'output_directory': scratch}, 'steps': []})
     fixtures = (
         ('input_src', 'Sentence3\nSentence4\nSentence2\nSentence1\n'),
         ('input_tgt', 'sentence3\nsentence4\nsentence2\nsentence1\n'),
     )
     for filename, content in fixtures:
         with open(os.path.join(scratch, filename), 'w') as handle:
             handle.write(content)
Пример #4
0
 def test_write_to_current_dir_if_common_not_specified(self, mocked_input):
     # Runs the first configured step with a configuration that has no
     # 'common' section and checks that both RF1_sents files appear in the
     # current working directory. mocked_input is primed with a single 'y'
     # answer (presumably for a confirmation prompt -- confirm against
     # OpusFilter).
     mocked_input.side_effect = ['y']
     test_config = {'steps': [self.configuration['steps'][0]]}
     leftovers = (
         'RF1_sents.en',
         'RF1_sents.sv',
         'RF_latest_xml_en.zip',
         'RF_latest_xml_sv.zip',
         'RF_latest_xml_en-sv.xml.gz',
     )
     try:
         OpusFilter(test_config).execute_steps()
         self.assertTrue(os.path.isfile('RF1_sents.en'))
         self.assertTrue(os.path.isfile('RF1_sents.sv'))
     finally:
         # Clean the working directory even when the step or an assertion
         # fails. Files that were never created are skipped so the cleanup
         # itself cannot hide the original failure.
         for name in leftovers:
             if os.path.exists(name):
                 os.remove(name)
Пример #5
0
 def setUp(self):
     """Prepare a temp directory, an OpusFilter and two nine-line inputs.

     The OpusFilter gets the temp directory as ``output_directory`` and
     no steps. Each input file holds one letter per line, terminated by
     a newline; several source/target line pairs repeat.
     """
     scratch = tempfile.mkdtemp()
     self.tempdir = scratch
     self.opus_filter = OpusFilter(
         {'common': {'output_directory': scratch}, 'steps': []})
     fixtures = (
         ('input_src', ['a', 'b', 'c', 'd', 'e', 'a', 'b', 'b', 'f']),
         ('input_tgt', ['A', 'B', 'C', 'D', 'E', 'A', 'B', 'F', 'C']),
     )
     for filename, letters in fixtures:
         content = '\n'.join(letters) + '\n'
         with open(os.path.join(scratch, filename), 'w') as handle:
             handle.write(content)
Пример #6
0
 def setUp(self):
     """Prepare a temp directory, an OpusFilter and three score inputs.

     The OpusFilter gets the temp directory as ``output_directory`` and
     no steps. ``scores_input`` and ``scores_input_2`` are written as one
     JSON object per line; ``ranks_input`` holds three plain numbers and
     has no trailing newline.
     """
     scratch = tempfile.mkdtemp()
     self.tempdir = scratch
     self.opus_filter = OpusFilter(
         {'common': {'output_directory': scratch}, 'steps': []})

     nested_rows = [
         {'MyScore': {'src': 1, 'tgt': 0.5}, 'OtherScore': 0},
         {'MyScore': {'src': 0.8, 'tgt': 0}, 'OtherScore': 0},
         {'MyScore': {'src': 0.5, 'tgt': 2}, 'OtherScore': 0},
     ]
     flat_rows = [{'OtherScore': 2}, {'OtherScore': 8}, {'OtherScore': 5}]
     json_files = (
         ('scores_input', nested_rows),
         ('scores_input_2', flat_rows),
     )
     for filename, rows in json_files:
         with open(os.path.join(scratch, filename), 'w') as handle:
             handle.write(''.join(json.dumps(row) + '\n' for row in rows))

     with open(os.path.join(scratch, 'ranks_input'), 'w') as handle:
         handle.write('0.5\n0\n2')
Пример #7
0
 def setUp(self):
     """Prepare a temp directory, an OpusFilter and the sorting inputs.

     The OpusFilter gets the temp directory as ``output_directory`` and
     no steps. The two sentence files are identical, ``ranks_input``
     holds four plain numbers, and ``scores_input`` holds four JSON
     objects, one per line. The three plain-text files have no trailing
     newline.
     """
     scratch = tempfile.mkdtemp()
     self.tempdir = scratch
     self.opus_filter = OpusFilter(
         {'common': {'output_directory': scratch}, 'steps': []})

     plain_files = (
         ('rank_input_src', 'Sentence3\nSentence4\nSentence2\nSentence1'),
         ('rank_input_tgt', 'Sentence3\nSentence4\nSentence2\nSentence1'),
         ('ranks_input', '0.5\n0\n2\n10'),
     )
     for filename, content in plain_files:
         with open(os.path.join(scratch, filename), 'w') as handle:
             handle.write(content)

     score_rows = [
         {'MyScore': {'src': 1, 'tgt': 0.5}},
         {'MyScore': {'src': 0.8, 'tgt': 0}},
         {'MyScore': {'src': 0.5, 'tgt': 2}},
         {'MyScore': {'src': 1, 'tgt': 10}},
     ]
     with open(os.path.join(scratch, 'scores_input'), 'w') as handle:
         handle.write(''.join(json.dumps(row) + '\n' for row in score_rows))
Пример #8
0
class TestRemoveDuplicates(unittest.TestCase):
    """Exercise ``OpusFilter.remove_duplicates`` on a nine-line fixture.

    Line by line, the source/target fixture pairs are (a, A), (b, B),
    (c, C), (d, D), (e, E), (a, A), (b, B), (b, F), (f, C).
    """

    def setUp(self):
        """Create the temp directory, the OpusFilter and both input files."""
        self.tempdir = tempfile.mkdtemp()
        # Registered right away: cleanups still run when the rest of setUp
        # raises, whereas tearDown would be skipped in that case.
        self.addCleanup(shutil.rmtree, self.tempdir)
        self.opus_filter = OpusFilter({
            'common': {
                'output_directory': self.tempdir
            },
            'steps': []
        })
        fixtures = (
            ('input_src', ['a', 'b', 'c', 'd', 'e', 'a', 'b', 'b', 'f']),
            ('input_tgt', ['A', 'B', 'C', 'D', 'E', 'A', 'B', 'F', 'C']),
        )
        for name, lines in fixtures:
            with open(self._path(name), 'w') as f:
                f.write('\n'.join(lines) + '\n')

    def _path(self, name):
        """Return the path of *name* inside the per-test temp directory."""
        return os.path.join(self.tempdir, name)

    def _remove_duplicates(self, sides=('src', 'tgt'), **options):
        """Call ``remove_duplicates`` on the fixture files for *sides*.

        *options* are added to the parameter dict next to ``inputs`` and
        ``outputs``.
        """
        parameters = {
            'inputs': [self._path('input_' + side) for side in sides],
            'outputs': [self._path('output_' + side) for side in sides],
        }
        parameters.update(options)
        self.opus_filter.remove_duplicates(parameters)

    def _assert_output(self, side, expected):
        """Assert that the output file for *side* contains *expected*."""
        with open(self._path('output_' + side)) as f:
            self.assertEqual(f.read(), expected)

    def test_defaults(self):
        # No extra options: only the repeated (a, A) and (b, B) pairs go.
        self._remove_duplicates()
        self._assert_output('src', 'a\nb\nc\nd\ne\nb\nf\n')
        self._assert_output('tgt', 'A\nB\nC\nD\nE\nF\nC\n')

    def test_nohash(self):
        # Same expectation as the defaults, with 'hash' set to None.
        self._remove_duplicates(hash=None)
        self._assert_output('src', 'a\nb\nc\nd\ne\nb\nf\n')
        self._assert_output('tgt', 'A\nB\nC\nD\nE\nF\nC\n')

    def test_src_key_only(self):
        # 'compare': [0] -- lines with a repeated source side are dropped.
        self._remove_duplicates(compare=[0])
        self._assert_output('src', 'a\nb\nc\nd\ne\nf\n')
        self._assert_output('tgt', 'A\nB\nC\nD\nE\nC\n')

    def test_tgt_key_only(self):
        # 'compare': [1] -- lines with a repeated target side are dropped.
        self._remove_duplicates(compare=[1])
        self._assert_output('src', 'a\nb\nc\nd\ne\nb\n')
        self._assert_output('tgt', 'A\nB\nC\nD\nE\nF\n')

    def test_single_file(self):
        # Only the source file is passed in and written out.
        self._remove_duplicates(sides=('src',))
        self._assert_output('src', 'a\nb\nc\nd\ne\nf\n')
Пример #9
0
class TestSplit(unittest.TestCase):
    """Exercise ``OpusFilter.split`` on a six-line parallel fixture.

    ``input_src`` holds ``Sent_0`` .. ``Sent_5`` and ``input_tgt`` holds
    ``sent_0`` .. ``sent_5``. Every test states which line indices it
    expects in the output files.
    """

    def setUp(self):
        """Create the temp directory, the OpusFilter and both input files."""
        self.tempdir = tempfile.mkdtemp()
        # Registered right away: cleanups still run when the rest of setUp
        # raises, whereas tearDown would be skipped in that case.
        self.addCleanup(shutil.rmtree, self.tempdir)
        self.opus_filter = OpusFilter({
            'common': {
                'output_directory': self.tempdir
            },
            'steps': []
        })
        with open(self._path('input_src'), 'w') as f:
            f.write(''.join('Sent_{}\n'.format(idx) for idx in range(6)))
        with open(self._path('input_tgt'), 'w') as f:
            f.write(''.join('sent_{}\n'.format(idx) for idx in range(6)))

    def _path(self, name):
        """Return the path of *name* inside the per-test temp directory."""
        return os.path.join(self.tempdir, name)

    def _split(self, **options):
        """Call ``split`` on the fixture files with *options* added."""
        parameters = {
            'inputs': [self._path('input_src'), self._path('input_tgt')],
            'outputs': [self._path('output_src'), self._path('output_tgt')],
        }
        parameters.update(options)
        self.opus_filter.split(parameters)

    def _assert_selected(self, indices, suffix=''):
        """Assert that both output files hold exactly the lines *indices*.

        *suffix* is appended to the output file names, which selects the
        second output pair when it is ``'_2'``.
        """
        with open(self._path('output_src' + suffix)) as f:
            self.assertEqual(
                f.read(), ''.join('Sent_{}\n'.format(idx) for idx in indices))
        with open(self._path('output_tgt' + suffix)) as f:
            self.assertEqual(
                f.read(), ''.join('sent_{}\n'.format(idx) for idx in indices))

    def test_split_single_out(self):
        self._split(divisor=2, hash='xx_64')
        self._assert_selected([0, 1])

    def test_split_single_out_seed(self):
        # Same split with an explicit seed; a different subset is expected.
        self._split(divisor=2, hash='xx_64', seed=123)
        self._assert_selected([2, 3, 4])

    def test_split_two_out(self):
        # With 'outputs_2' given, the remaining lines are expected there.
        self._split(
            outputs_2=[self._path('output_src_2'),
                       self._path('output_tgt_2')],
            divisor=2,
            hash='xx_64')
        self._assert_selected([0, 1])
        self._assert_selected([2, 3, 4, 5], suffix='_2')

    def test_split_src_key(self):
        self._split(divisor=2, compare=[0], hash='xx_64')
        self._assert_selected([0, 3, 4])

    def test_split_tgt_key(self):
        self._split(divisor=2, compare=[1], hash='xx_64')
        self._assert_selected([0, 2, 3, 5])

    def test_split_modulo_threshold(self):
        self._split(divisor=10, threshold=3, hash='xx_64')
        self._assert_selected([4, 5])
Пример #10
0
class TestHeadTailSlice(unittest.TestCase):
    """Exercise ``OpusFilter.head``, ``tail`` and ``slice``.

    Both input files hold four sentences in the order 3, 4, 2, 1; the
    target side is the lower-cased version of the source side.
    """

    def setUp(self):
        """Create the temp directory, the OpusFilter and both input files."""
        self.tempdir = tempfile.mkdtemp()
        # Registered right away: cleanups still run when the rest of setUp
        # raises, whereas tearDown would be skipped in that case.
        self.addCleanup(shutil.rmtree, self.tempdir)
        self.opus_filter = OpusFilter({
            'common': {
                'output_directory': self.tempdir
            },
            'steps': []
        })
        with open(self._path('input_src'), 'w') as f:
            f.write('Sentence3\nSentence4\nSentence2\nSentence1\n')
        with open(self._path('input_tgt'), 'w') as f:
            f.write('sentence3\nsentence4\nsentence2\nsentence1\n')

    def _path(self, name):
        """Return the path of *name* inside the per-test temp directory."""
        return os.path.join(self.tempdir, name)

    def _parameters(self, **options):
        """Return the shared inputs/outputs parameters plus *options*."""
        parameters = {
            'inputs': [self._path('input_src'), self._path('input_tgt')],
            'outputs': [self._path('output_src'), self._path('output_tgt')],
        }
        parameters.update(options)
        return parameters

    def _assert_outputs(self, expected_src, expected_tgt):
        """Assert the full contents of ``output_src`` and ``output_tgt``."""
        with open(self._path('output_src')) as f:
            self.assertEqual(f.read(), expected_src)
        with open(self._path('output_tgt')) as f:
            self.assertEqual(f.read(), expected_tgt)

    def test_head(self):
        self.opus_filter.head(self._parameters(n=2))
        self._assert_outputs('Sentence3\nSentence4\n',
                             'sentence3\nsentence4\n')

    def test_tail(self):
        self.opus_filter.tail(self._parameters(n=2))
        self._assert_outputs('Sentence2\nSentence1\n',
                             'sentence2\nsentence1\n')

    def test_slice_head(self):
        # Only 'stop' is given; the first two lines are expected.
        self.opus_filter.slice(self._parameters(stop=2))
        self._assert_outputs('Sentence3\nSentence4\n',
                             'sentence3\nsentence4\n')

    def test_slice(self):
        # start=1, stop=None, step=2: the second and fourth lines.
        self.opus_filter.slice(self._parameters(start=1, stop=None, step=2))
        self._assert_outputs('Sentence4\nSentence1\n',
                             'sentence4\nsentence1\n')
Пример #11
0
class TestJoin(unittest.TestCase):
    """Exercise ``OpusFilter.join_scores`` on three small score files.

    ``scores_input`` and ``scores_input_2`` hold one JSON object per line;
    ``ranks_input`` holds three plain numbers without a trailing newline.
    """

    def setUp(self):
        """Create the temp directory, the OpusFilter and the input files."""
        self.tempdir = tempfile.mkdtemp()
        # Registered right away: cleanups still run when the rest of setUp
        # raises, whereas tearDown would be skipped in that case.
        self.addCleanup(shutil.rmtree, self.tempdir)
        self.opus_filter = OpusFilter({
            'common': {
                'output_directory': self.tempdir
            },
            'steps': []
        })
        json_fixtures = (
            ('scores_input', [
                {'MyScore': {'src': 1, 'tgt': 0.5}, 'OtherScore': 0},
                {'MyScore': {'src': 0.8, 'tgt': 0}, 'OtherScore': 0},
                {'MyScore': {'src': 0.5, 'tgt': 2}, 'OtherScore': 0},
            ]),
            ('scores_input_2', [
                {'OtherScore': 2},
                {'OtherScore': 8},
                {'OtherScore': 5},
            ]),
        )
        for name, items in json_fixtures:
            with open(self._path(name), 'w') as f:
                for item in items:
                    f.write(json.dumps(item) + '\n')
        with open(self._path('ranks_input'), 'w') as f:
            f.write('0.5\n0\n2')

    def _path(self, name):
        """Return the path of *name* inside the per-test temp directory."""
        return os.path.join(self.tempdir, name)

    def _join_scores(self, first, second, **options):
        """Join two fixture files and return the parsed output lines.

        *first* and *second* are fixture file names. *options* are added
        to the parameter dict next to ``inputs`` and ``output``. The
        result is the list of JSON objects read from ``scores_output``.
        """
        parameters = {
            'inputs': [self._path(first), self._path(second)],
            'output': self._path('scores_output'),
        }
        parameters.update(options)
        self.opus_filter.join_scores(parameters)
        with open(self._path('scores_output')) as f:
            return [json.loads(line) for line in f]

    def test_join_scores_flat(self):
        # No 'keys': OtherScore is expected to carry the second file's value.
        out = self._join_scores('scores_input', 'scores_input_2')
        self.assertSequenceEqual(out, [
            {'MyScore': {'src': 1, 'tgt': 0.5}, 'OtherScore': 2},
            {'MyScore': {'src': 0.8, 'tgt': 0}, 'OtherScore': 8},
            {'MyScore': {'src': 0.5, 'tgt': 2}, 'OtherScore': 5},
        ])

    def test_join_scores_keys(self):
        # The second file's objects are expected under the 'others' key.
        out = self._join_scores('scores_input', 'scores_input_2',
                                keys=[None, 'others'])
        self.assertSequenceEqual(out, [
            {'MyScore': {'src': 1, 'tgt': 0.5}, 'OtherScore': 0,
             'others': {'OtherScore': 2}},
            {'MyScore': {'src': 0.8, 'tgt': 0}, 'OtherScore': 0,
             'others': {'OtherScore': 8}},
            {'MyScore': {'src': 0.5, 'tgt': 2}, 'OtherScore': 0,
             'others': {'OtherScore': 5}},
        ])

    def test_join_scores_append(self):
        # Plain rank values are expected inside the existing 'MyScore' dict.
        out = self._join_scores('scores_input', 'ranks_input',
                                keys=[None, 'MyScore.value'])
        self.assertSequenceEqual(out, [
            {'MyScore': {'src': 1, 'tgt': 0.5, 'value': 0.5},
             'OtherScore': 0},
            {'MyScore': {'src': 0.8, 'tgt': 0, 'value': 0},
             'OtherScore': 0},
            {'MyScore': {'src': 0.5, 'tgt': 2, 'value': 2},
             'OtherScore': 0},
        ])

    def test_join_scores_plain(self):
        # Plain rank values are expected under a new top-level 'rank' key.
        out = self._join_scores('scores_input_2', 'ranks_input',
                                keys=[None, 'rank'])
        self.assertSequenceEqual(out, [
            {'OtherScore': 2, 'rank': 0.5},
            {'OtherScore': 8, 'rank': 0},
            {'OtherScore': 5, 'rank': 2},
        ])

    def test_join_scores_plain_multikey(self):
        # The dotted key 'misc.rank' is expected to create a nested dict.
        out = self._join_scores('scores_input_2', 'ranks_input',
                                keys=[None, 'misc.rank'])
        self.assertSequenceEqual(out, [
            {'OtherScore': 2, 'misc': {'rank': 0.5}},
            {'OtherScore': 8, 'misc': {'rank': 0}},
            {'OtherScore': 5, 'misc': {'rank': 2}},
        ])
Пример #12
0
class TestSort(unittest.TestCase):
    """Exercise ``OpusFilter.sort_files`` with rank, score and string keys.

    The two sentence files are identical (sentences 3, 4, 2, 1), so the
    expected source and target outputs are always the same text. The
    ranks file holds 0.5, 0, 2, 10 in the same line order.
    """

    def setUp(self):
        """Create the temp directory, the OpusFilter and the input files."""
        self.tempdir = tempfile.mkdtemp()
        # Registered right away: cleanups still run when the rest of setUp
        # raises, whereas tearDown would be skipped in that case.
        self.addCleanup(shutil.rmtree, self.tempdir)
        self.opus_filter = OpusFilter({
            'common': {
                'output_directory': self.tempdir
            },
            'steps': []
        })
        with open(self._path('rank_input_src'), 'w') as f:
            f.write('Sentence3\nSentence4\nSentence2\nSentence1')
        with open(self._path('rank_input_tgt'), 'w') as f:
            f.write('Sentence3\nSentence4\nSentence2\nSentence1')
        with open(self._path('ranks_input'), 'w') as f:
            f.write('0.5\n0\n2\n10')
        score_items = [
            {'MyScore': {'src': 1, 'tgt': 0.5}},
            {'MyScore': {'src': 0.8, 'tgt': 0}},
            {'MyScore': {'src': 0.5, 'tgt': 2}},
            {'MyScore': {'src': 1, 'tgt': 10}},
        ]
        with open(self._path('scores_input'), 'w') as f:
            for item in score_items:
                f.write(json.dumps(item) + '\n')

    def _path(self, name):
        """Return the path of *name* inside the per-test temp directory."""
        return os.path.join(self.tempdir, name)

    def _sort_files(self, values, **options):
        """Call ``sort_files`` on the three inputs, keyed by file *values*.

        *values* is a fixture file name. *options* are added to the
        parameter dict next to ``inputs``, ``values`` and ``outputs``.
        """
        parameters = {
            'inputs': [
                self._path('rank_input_src'),
                self._path('rank_input_tgt'),
                self._path('ranks_input'),
            ],
            'values': self._path(values),
            'outputs': [
                self._path('rank_output_src'),
                self._path('rank_output_tgt'),
                self._path('ranks_output'),
            ],
        }
        parameters.update(options)
        self.opus_filter.sort_files(parameters)

    def _assert_sorted(self, sentences, ranks):
        """Assert both sentence outputs equal *sentences*, ranks *ranks*."""
        with open(self._path('rank_output_src')) as f:
            self.assertEqual(f.read(), sentences)
        with open(self._path('rank_output_tgt')) as f:
            self.assertEqual(f.read(), sentences)
        with open(self._path('ranks_output')) as f:
            self.assertEqual(f.read(), ranks)

    def test_order_by_rank(self):
        self._sort_files('ranks_input', reverse=False)
        self._assert_sorted('Sentence4\nSentence3\nSentence2\nSentence1\n',
                            '0\n0.5\n2\n10\n')

    def test_sort_files_reverse(self):
        self._sort_files('ranks_input', reverse=True)
        self._assert_sorted('Sentence1\nSentence2\nSentence3\nSentence4\n',
                            '10\n2\n0.5\n0\n')

    def test_sort_by_score(self):
        # MyScore.tgt holds the same numbers as the ranks file, so the
        # expected order matches test_order_by_rank.
        self._sort_files('scores_input', reverse=False, key='MyScore.tgt')
        self._assert_sorted('Sentence4\nSentence3\nSentence2\nSentence1\n',
                            '0\n0.5\n2\n10\n')

    def test_sort_by_str(self):
        # The source sentences themselves are the sort values.
        self._sort_files('rank_input_src', reverse=False, type='str')
        self._assert_sorted('Sentence1\nSentence2\nSentence3\nSentence4\n',
                            '10\n2\n0.5\n0\n')
Пример #13
0
    def setUpClass(self):
        """Run the RF en-sv pipeline once inside a fresh temporary directory.

        The configuration chains an ``opus_read`` step, a ``filter`` step,
        two ``train_ngram`` steps, a ``train_alignment`` step and a
        ``score`` step. The temporary directory is used as the
        ``output_directory`` of the configuration and as the
        ``download_dir`` passed to ``OpusGet`` before the steps run.
        The directory, the configuration and the ``OpusFilter`` object are
        stored as ``tempdir``, ``configuration`` and ``opus_filter``.
        """
        self.tempdir = tempfile.mkdtemp()

        def basic_filters():
            # Built anew on every call: the 'filter' and 'score' steps get
            # equal but separate parameter dicts.
            return [
                {'LanguageIDFilter': {
                    'src_lang': 'en',
                    'tgt_lang': 'sv',
                    'src_threshold': 0,
                    'tgt_threshold': 0
                }},
                {'TerminalPunctuationFilter': {'threshold': -2}},
                {'NonZeroNumeralsFilter': {'threshold': 0.5}},
                {'CharacterScoreFilter': {
                    'src_script': 'Latin',
                    'tgt_script': 'Latin',
                    'src_threshold': 1,
                    'tgt_threshold': 1
                }},
            ]

        def ngram_step(data, model):
            # Both language models are trained with the same settings.
            return {
                'type': 'train_ngram',
                'parameters': {
                    'data': data,
                    'parameters': {'norder': 20, 'dscale': 0.001},
                    'model': model
                }
            }

        read_step = {
            'type': 'opus_read',
            'parameters': {
                'corpus_name': 'RF',
                'source_language': 'en',
                'target_language': 'sv',
                'release': 'latest',
                'preprocessing': 'xml',
                'src_output': 'RF1_sents.en',
                'tgt_output': 'RF1_sents.sv'
            }
        }
        filter_step = {
            'type': 'filter',
            'parameters': {
                'src_input': 'RF1_sents.en',
                'tgt_input': 'RF1_sents.sv',
                'src_output': 'RF1_filtered.en',
                'tgt_output': 'RF1_filtered.sv',
                'filters': basic_filters()
            }
        }
        alignment_step = {
            'type': 'train_alignment',
            'parameters': {
                'src_data': 'RF1_filtered.en',
                'tgt_data': 'RF1_filtered.sv',
                'parameters': {'tokenizer': 'none', 'model': 3},
                'output': 'RF1_align.priors'
            }
        }

        # Scoring uses the basic filters plus the two model-based ones.
        score_filters = basic_filters()
        score_filters.append({
            'WordAlignFilter': {
                'tokenizer': 'none',
                'priors': 'RF1_align.priors',
                'model': 3,
                'src_threshold': 0,
                'tgt_threshold': 0
            }
        })
        score_filters.append({
            'CrossEntropyFilter': {
                'src_lm_params': {'filename': 'RF1_en.arpa'},
                'tgt_lm_params': {'filename': 'RF1_sv.arpa'},
                'src_threshold': 50.0,
                'tgt_threshold': 50.0,
                'diff_threshold': 10.0
            }
        })
        score_step = {
            'type': 'score',
            'parameters': {
                'src_input': 'RF1_sents.en',
                'tgt_input': 'RF1_sents.sv',
                'output': 'RF1_scores.en-sv.jsonl',
                'filters': score_filters
            }
        }

        self.configuration = {
            'common': {'output_directory': self.tempdir},
            'steps': [
                read_step,
                filter_step,
                ngram_step('RF1_filtered.en', 'RF1_en.arpa'),
                ngram_step('RF1_filtered.sv', 'RF1_sv.arpa'),
                alignment_step,
                score_step,
            ]
        }

        corpus_getter = OpusGet(directory='RF',
                                source='en',
                                target='sv',
                                release='latest',
                                preprocess='xml',
                                suppress_prompts=True,
                                download_dir=self.tempdir)
        corpus_getter.get_files()
        self.opus_filter = OpusFilter(self.configuration)
        self.opus_filter.execute_steps()
Пример #14
0
class TestOpusFilter(unittest.TestCase):
    @classmethod
    def setUpClass(self):
        """Prepare the shared fixture: configure and execute the pipeline.

        A temporary directory is created and used both as the
        ``output_directory`` of the configuration and as the
        ``download_dir`` given to ``OpusGet``. The configuration lists, in
        order, the steps ``opus_read``, ``filter``, ``train_ngram`` (twice),
        ``train_alignment`` and ``score``. ``tempdir``, ``configuration``
        and ``opus_filter`` are stored for the test methods to inspect.
        """

        def language_id_filter():
            return {'LanguageIDFilter': {
                'src_lang': 'en',
                'tgt_lang': 'sv',
                'src_threshold': 0,
                'tgt_threshold': 0
            }}

        def character_score_filter():
            return {'CharacterScoreFilter': {
                'src_script': 'Latin',
                'tgt_script': 'Latin',
                'src_threshold': 1,
                'tgt_threshold': 1
            }}

        def common_filters():
            # Returns new objects on each call so that the 'filter' and
            # 'score' steps do not share any parameter dict.
            return [
                language_id_filter(),
                {'TerminalPunctuationFilter': {'threshold': -2}},
                {'NonZeroNumeralsFilter': {'threshold': 0.5}},
                character_score_filter(),
            ]

        self.tempdir = tempfile.mkdtemp()

        steps = []
        steps.append({
            'type': 'opus_read',
            'parameters': {
                'corpus_name': 'RF',
                'source_language': 'en',
                'target_language': 'sv',
                'release': 'latest',
                'preprocessing': 'xml',
                'src_output': 'RF1_sents.en',
                'tgt_output': 'RF1_sents.sv'
            }
        })
        steps.append({
            'type': 'filter',
            'parameters': {
                'src_input': 'RF1_sents.en',
                'tgt_input': 'RF1_sents.sv',
                'src_output': 'RF1_filtered.en',
                'tgt_output': 'RF1_filtered.sv',
                'filters': common_filters()
            }
        })
        # One language model per side, trained on the filtered data.
        for filtered_data, model_file in (
                ('RF1_filtered.en', 'RF1_en.arpa'),
                ('RF1_filtered.sv', 'RF1_sv.arpa')):
            steps.append({
                'type': 'train_ngram',
                'parameters': {
                    'data': filtered_data,
                    'parameters': {'norder': 20, 'dscale': 0.001},
                    'model': model_file
                }
            })
        steps.append({
            'type': 'train_alignment',
            'parameters': {
                'src_data': 'RF1_filtered.en',
                'tgt_data': 'RF1_filtered.sv',
                'parameters': {'tokenizer': 'none', 'model': 3},
                'output': 'RF1_align.priors'
            }
        })
        model_based_filters = [
            {'WordAlignFilter': {
                'tokenizer': 'none',
                'priors': 'RF1_align.priors',
                'model': 3,
                'src_threshold': 0,
                'tgt_threshold': 0
            }},
            {'CrossEntropyFilter': {
                'src_lm_params': {'filename': 'RF1_en.arpa'},
                'tgt_lm_params': {'filename': 'RF1_sv.arpa'},
                'src_threshold': 50.0,
                'tgt_threshold': 50.0,
                'diff_threshold': 10.0
            }},
        ]
        steps.append({
            'type': 'score',
            'parameters': {
                'src_input': 'RF1_sents.en',
                'tgt_input': 'RF1_sents.sv',
                'output': 'RF1_scores.en-sv.jsonl',
                'filters': common_filters() + model_based_filters
            }
        })

        self.configuration = {
            'common': {'output_directory': self.tempdir},
            'steps': steps
        }

        getter = OpusGet(directory='RF',
                         source='en',
                         target='sv',
                         release='latest',
                         preprocess='xml',
                         suppress_prompts=True,
                         download_dir=self.tempdir)
        getter.get_files()
        self.opus_filter = OpusFilter(self.configuration)
        self.opus_filter.execute_steps()

    @classmethod
    def tearDownClass(self):
        """Delete the temporary directory stored in ``tempdir``."""
        fixture_dir = self.tempdir
        shutil.rmtree(fixture_dir)

    def test_get_pairs(self):
        """Exhaust ``get_pairs`` and compare the last pair it yields."""
        pairs = self.opus_filter.get_pairs('RF1_sents.en', 'RF1_sents.sv')
        # Take the first pair explicitly so the name is bound even when the
        # loop below has nothing left to iterate over.
        last_pair = next(pairs)
        for last_pair in pairs:
            pass
        expected_last_pair = (
            'This will ensure the cohesion of Swedish society .',
            'Så kan vi hålla samman Sverige .')
        self.assertEqual(last_pair, expected_last_pair)

    def test_clean_data(self):
        """Compare the first line of each filtered output file."""
        expected_first_lines = (
            ('RF1_filtered.en',
             'Your Majesties , Your Royal Highnesses , Mr Speaker , '
             'Members of the Swedish Parliament .\n'),
            ('RF1_filtered.sv',
             'Eders Majestäter , Eders Kungliga Högheter , herr '
             'talman , ledamöter av Sveriges riksdag !\n'),
        )
        for filename, first_line in expected_first_lines:
            with open(os.path.join(self.tempdir, filename)) as filtered:
                self.assertEqual(filtered.readline(), first_line)

    def test_train_models(self):
        """Check that every model file named in the configuration exists.

        The configuration trains alignment priors (``RF1_align.priors``)
        and one n-gram model per language (``RF1_en.arpa`` and
        ``RF1_sv.arpa``); each of them must be present in ``tempdir``.
        """
        # The Swedish model was previously never checked: the English file
        # name was asserted twice instead.
        for model_file in ('RF1_align.priors', 'RF1_en.arpa', 'RF1_sv.arpa'):
            self.assertTrue(
                os.path.isfile(os.path.join(self.tempdir, model_file)))

    def test_score_data(self):
        """Check the scores stored on the first line of the scores file.

        The first line of ``RF1_scores.en-sv.jsonl`` is parsed as JSON and
        the entry of each configured filter is compared with its expected
        value; the cross-entropy values are compared approximately and the
        word-alignment entry is only required to be a dict.
        """
        scores_path = os.path.join(self.tempdir, 'RF1_scores.en-sv.jsonl')
        with open(scores_path) as scores_file:
            score = json.loads(scores_file.readline())

        # Each filter is asserted exactly once (the language-ID check used
        # to be repeated verbatim).
        self.assertEqual(score['LanguageIDFilter'], {
            'src': 1.0,
            'tgt': 0.98
        })
        self.assertEqual(score['CharacterScoreFilter'], {
            'src': 1.0,
            'tgt': 1.0
        })
        self.assertAlmostEqual(score['CrossEntropyFilter']['src'],
                               15.214258903317491)
        self.assertAlmostEqual(score['CrossEntropyFilter']['tgt'],
                               7.569084909162213)
        self.assertEqual(score['TerminalPunctuationFilter'], -0.0)
        self.assertEqual(score['NonZeroNumeralsFilter'], 0.0)
        # assertIsInstance reports the offending value on failure, unlike a
        # comparison of type objects.
        self.assertIsInstance(score['WordAlignFilter'], dict)

    def test_initial_files(self):
        """Check line counts and first lines of the two sentence files."""
        en_path = os.path.join(self.tempdir, 'RF1_sents.en')
        sv_path = os.path.join(self.tempdir, 'RF1_sents.sv')
        with open(en_path) as en_file, open(sv_path) as sv_file:
            en_lines = en_file.readlines()
            sv_lines = sv_file.readlines()

        # Both sides are expected to hold the same number of lines.
        self.assertEqual(len(en_lines), 180)
        self.assertEqual(len(sv_lines), 180)

        expected_first_en = (
            'Statement of Government Policy by the Prime '
            'Minister , Mr Ingvar Carlsson , at the Opening '
            'of the Swedish Parliament on Tuesday , 4 October '
            ', 1988 .\n')
        self.assertEqual(en_lines[0], expected_first_en)
        self.assertEqual(sv_lines[0], 'REGERINGSFÖRKLARING .\n')

    @mock.patch('opustools.opus_get.input', create=True)
    def test_write_to_current_dir_if_common_not_specified(self, mocked_input):
        """Run the first step with a configuration that has no ``common``.

        The two sentence files are expected in the current working
        directory. ``mocked_input`` is set up to return ``'y'`` once.
        Files written to the working directory are removed afterwards,
        whether or not the step and the assertions succeed.
        """
        mocked_input.side_effect = ['y']
        step = self.configuration['steps'][0]
        test_config = {'steps': [step]}
        created_files = (
            'RF1_sents.en',
            'RF1_sents.sv',
            'RF_latest_xml_en.zip',
            'RF_latest_xml_sv.zip',
            'RF_latest_xml_en-sv.xml.gz',
        )
        try:
            test_filter = OpusFilter(test_config)
            test_filter.execute_steps()
            self.assertTrue(os.path.isfile('RF1_sents.en'))
            self.assertTrue(os.path.isfile('RF1_sents.sv'))
        finally:
            # Clean up even on failure so that leftovers cannot satisfy the
            # isfile checks of other tests; a missing file must not mask
            # the original error, hence the existence check.
            for name in created_files:
                if os.path.exists(name):
                    os.remove(name)

    @mock.patch('opustools.opus_get.input', create=True)
    def test_write_to_current_dir_if_output_dir_not_specified(
            self, mocked_input):
        """Run the first step when ``common`` lacks ``output_directory``.

        The ``common`` section only holds an unrelated key, and the two
        sentence files are expected in the current working directory.
        ``mocked_input`` is set up to return ``'y'`` once. Files written to
        the working directory are removed afterwards, whether or not the
        step and the assertions succeed.
        """
        mocked_input.side_effect = ['y']
        common = {'test': 'test'}
        step = self.configuration['steps'][0]
        test_config = {'common': common, 'steps': [step]}
        created_files = (
            'RF1_sents.en',
            'RF1_sents.sv',
            'RF_latest_xml_en.zip',
            'RF_latest_xml_sv.zip',
            'RF_latest_xml_en-sv.xml.gz',
        )
        try:
            test_filter = OpusFilter(test_config)
            test_filter.execute_steps()
            self.assertTrue(os.path.isfile('RF1_sents.en'))
            self.assertTrue(os.path.isfile('RF1_sents.sv'))
        finally:
            # Clean up even on failure so that leftovers cannot satisfy the
            # isfile checks of other tests; a missing file must not mask
            # the original error, hence the existence check.
            for name in created_files:
                if os.path.exists(name):
                    os.remove(name)

    @mock.patch('opustools.opus_get.input', create=True)
    def test_create_output_dir_if_it_does_not_exist(self, mocked_input):
        """Run the first step with ``output_directory`` 'test_creating_dir'.

        The two sentence files are expected inside that directory.
        ``mocked_input`` is set up to return ``'y'`` once. The directory is
        removed afterwards, whether or not the step and the assertions
        succeed.
        """
        mocked_input.side_effect = ['y']
        common = {'output_directory': 'test_creating_dir'}
        step = self.configuration['steps'][0]
        test_config = {'common': common, 'steps': [step]}
        try:
            test_filter = OpusFilter(test_config)
            test_filter.execute_steps()
            self.assertTrue(os.path.isfile('test_creating_dir/RF1_sents.en'))
            self.assertTrue(os.path.isfile('test_creating_dir/RF1_sents.sv'))
        finally:
            # Remove the directory even when the run or an assertion fails;
            # ignore_errors keeps a missing directory from masking the
            # original error.
            shutil.rmtree('test_creating_dir', ignore_errors=True)