def test_split_by_label_length_issuer_separated_only_uses_given_label_list(
        self, split_mock):
    """Split by label length with separated issuers, using one label list.

    Every utterance receives an additional label list with idx
    ``'some-idx'`` before the split is requested for
    ``label_list_idx='default'``. The test then pins the per-speaker
    lengths handed to the mocked split function and the utterances that
    end up in each resulting subset.

    Args:
        split_mock: Mock supplied by a patch decorator outside this view
            (presumably replacing the identifier-splitting helper).
    """
    # Speaker-level assignment the mocked split function reports back.
    split_mock.return_value = {
        'train': ['spk-1', 'spk-2'],
        'test': ['spk-3'],
    }

    corpus = resources.create_multi_label_corpus()
    splitter = splitting.Splitter(corpus, random_seed=INITIAL_SEED)

    # Attach a second label list to every utterance; a fresh instance is
    # created for each utterance.
    for utterance in corpus.utterances.values():
        extra_labels = annotations.LabelList.create_single(
            'another label',
            idx='some-idx',
        )
        utterance.set_label_list(extra_labels)

    res = splitter.split_by_label_length(
        {'train': 0.5, 'test': 0.5},
        label_list_idx='default',
        separate_issuers=True,
    )

    expected_train = {'utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'}
    expected_test = {'utt-6', 'utt-7', 'utt-8'}
    assert res['train'].utterances.keys() == expected_train
    assert res['test'].utterances.keys() == expected_test

    # Lengths are expected to be aggregated per speaker.
    expected_lengths = {
        speaker_id: {'length': length}
        for speaker_id, length in [
            ('spk-1', 32),
            ('spk-2', 33),
            ('spk-3', 32),
        ]
    }
    split_mock.assert_called_with(
        expected_lengths,
        {'train': 0.5, 'test': 0.5},
        seed=mock.ANY,
    )
def test_split_by_number_of_utterances_seed(self):
    """Two splitters seeded identically must yield identical subsets.

    A fresh corpus and a fresh ``Splitter`` (``random_seed=15``) are
    built for each run, and the utterance ids of the ``'train'`` and
    ``'test'`` subsets are compared between the two runs.
    """

    def run_split():
        # Rebuild everything from scratch so the runs share no state.
        corpus = resources.create_multi_label_corpus()
        seeded_splitter = splitting.Splitter(corpus, random_seed=15)
        return seeded_splitter.split_by_number_of_utterances({
            'train': 0.6,
            'test': 0.2,
        })

    first_run = run_split()
    second_run = run_split()

    for subset_name in ('train', 'test'):
        first_ids = set(first_run[subset_name].utterances.keys())
        second_ids = set(second_run[subset_name].utterances.keys())
        assert first_ids == second_ids
def test_split_by_proportionally_distribute_labels_by_number_seed(self):
    """Identically seeded label-proportional splits must be identical.

    Runs ``split_by_proportionally_distribute_labels`` with
    ``use_lengths=False`` twice, each time on a fresh corpus with a
    fresh ``Splitter`` (``random_seed=15``), and compares the utterance
    ids of the ``'train'`` and ``'test'`` subsets.
    """

    def run_split():
        # Rebuild everything from scratch so the runs share no state.
        corpus = resources.create_multi_label_corpus()
        seeded_splitter = splitting.Splitter(corpus, random_seed=15)
        return seeded_splitter.split_by_proportionally_distribute_labels(
            {'train': 0.6, 'test': 0.2},
            use_lengths=False,
        )

    first_run = run_split()
    second_run = run_split()

    for subset_name in ('train', 'test'):
        first_ids = set(first_run[subset_name].utterances.keys())
        second_ids = set(second_run[subset_name].utterances.keys())
        assert first_ids == second_ids
def test_split_by_label_length_only_uses_given_label_list(self, split_mock):
    """Split by label length per utterance, using one label list.

    Every utterance receives an additional label list with idx
    ``'some-idx'`` before the split is requested for
    ``label_list_idx='default'``. The test then pins the per-utterance
    lengths handed to the mocked split function and the utterances that
    end up in each resulting subset.

    Args:
        split_mock: Mock supplied by a patch decorator outside this view
            (presumably replacing the identifier-splitting helper).
    """
    # NOTE(review): 'utt-3' appears under both 'train' and 'test' in the
    # mocked result; kept exactly as found -- confirm whether intended.
    split_mock.return_value = {
        'train': ['utt-1', 'utt-3'],
        'test': ['utt-3', 'utt-4'],
        'dev': ['utt-5', 'utt-6'],
    }

    corpus = resources.create_multi_label_corpus()
    splitter = splitting.Splitter(corpus, random_seed=INITIAL_SEED)

    # Attach a second label list to every utterance; a fresh instance is
    # created for each utterance.
    for utterance in corpus.utterances.values():
        extra_labels = annotations.LabelList.create_single(
            'another label',
            idx='some-idx',
        )
        utterance.set_label_list(extra_labels)

    res = splitter.split_by_label_length(
        {'train': 0.6, 'test': 0.2, 'dev': 0.2},
        label_list_idx='default',
    )

    expected_subsets = {
        'train': {'utt-1', 'utt-3'},
        'test': {'utt-3', 'utt-4'},
        'dev': {'utt-5', 'utt-6'},
    }
    for subset_name, utt_ids in expected_subsets.items():
        assert res[subset_name].utterances.keys() == utt_ids

    expected_lengths = {
        utt_id: {'length': length}
        for utt_id, length in [
            ('utt-1', 16),
            ('utt-2', 16),
            ('utt-3', 11),
            ('utt-4', 16),
            ('utt-5', 6),
            ('utt-6', 16),
            ('utt-7', 11),
            ('utt-8', 5),
        ]
    }
    split_mock.assert_called_with(
        expected_lengths,
        {'train': 0.6, 'test': 0.2, 'dev': 0.2},
        seed=mock.ANY,
    )
def splitter():
    """Return a ``Splitter`` over a fresh multi-label corpus.

    No ``random_seed`` is passed, so the splitter's own default applies.
    """
    return splitting.Splitter(resources.create_multi_label_corpus())
def setUp(self):
    """Create a fresh multi-label corpus and a ``Splitter`` wrapping it.

    Both are stored on the test instance as ``self.corpus`` and
    ``self.splitter``.
    """
    fresh_corpus = resources.create_multi_label_corpus()
    self.corpus = fresh_corpus
    self.splitter = splitting.Splitter(fresh_corpus)
def splitter():
    """Return a ``Splitter`` over a fresh multi-label corpus.

    The splitter is seeded with the module-level ``INITIAL_SEED``.
    """
    return splitting.Splitter(
        resources.create_multi_label_corpus(),
        random_seed=INITIAL_SEED,
    )