def test_deterministic(self, prep_org_data):
    """
    Ensures loading and processing utterances from ELAN files is
    deterministic.

    The same directory is loaded twice and both copies are run through
    the same sequence of processing stages. After every stage the two
    results are asserted to be equal.
    """

    org_dir = prep_org_data

    # Stage 1: raw load from the "rf" and "xv" tiers.
    first = elan.utterances_from_dir(org_dir, ["rf", "xv"])
    second = elan.utterances_from_dir(org_dir, ["rf", "xv"])
    assert first == second

    # Stage 2: keep only utterances accepted by the BKW filter.
    first = list(filter(bkw.bkw_filter, first))
    second = list(filter(bkw.bkw_filter, second))
    assert first == second

    # Stage 3: drop duplicate utterances.
    first = utterance.remove_duplicates(first)
    second = utterance.remove_duplicates(second)
    assert first == second

    # Stage 4: segment the labels of every utterance.
    segment = bkw.bkw_label_segmenter.segment_labels
    first = [segment(item) for item in first]
    second = [segment(item) for item in second]
    assert first == second

    # Stage 5: drop utterances whose text is empty.
    first = utterance.remove_empty_text(first)
    second = utterance.remove_empty_text(second)
    assert first == second
def test_explore_code_switching(self, prep_org_data):
    """
    Runs bkw.explore_code_switching over the "rf"/"xv" utterances.

    The utterances are loaded from the prepared directory, de-duplicated,
    stripped of empty-text entries, and then handed to
    bkw.explore_code_switching together with the "codeswitched.txt" path.
    """

    loaded = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    deduplicated = utterance.remove_duplicates(loaded)
    non_empty = utterance.remove_empty_text(deduplicated)

    # NOTE(review): `tgt_dir` is not defined inside this function; it is
    # presumably a module-level name (or was meant to be a class
    # attribute) -- confirm against the rest of the file.
    output_path = tgt_dir / "codeswitched.txt"
    bkw.explore_code_switching(non_empty, output_path)
def test_speaker_durations(self, prep_org_data):
    """
    Prints a per-speaker duration table (in minutes) for the corpus.

    The utterance count is printed after loading and after each of the
    first three filtering steps. Utterances with a duration of 10000 or
    more are then dropped before the table is produced. Rows are ordered
    by duration, largest first, followed by a "Total" row.
    """

    remaining = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    print(len(remaining))

    remaining = utterance.remove_empty_text(remaining)
    print(len(remaining))

    remaining = utterance.remove_duplicates(remaining)
    print(len(remaining))

    remaining = [item for item in remaining if bkw.bkw_filter(item)]
    print(len(remaining))

    remaining = [
        item for item in remaining
        if utterance.duration(item) < 10000
    ]

    row_template = "{:20}{:10}"
    minutes_template = "{:<10.3f}"
    print(row_template.format("Speaker", "Duration"))

    by_duration_desc = sorted(
        utterance.speaker_durations(remaining),
        key=lambda pair: pair[1],
        reverse=True)

    total_minutes = 0
    for name, millis in by_duration_desc:
        # Durations are multiplied by ureg.milliseconds and converted to
        # minutes; only the bare magnitude is kept for formatting.
        minutes = (millis * ureg.milliseconds).to(ureg.minutes).magnitude
        total_minutes += minutes
        print(row_template.format(name, minutes_template.format(minutes)))

    print(row_template.format("Total",
                              minutes_template.format(total_minutes)))
def test_remove_duplicates_same_time():
    """
    Checks remove_duplicates on two identical utterances plus a third.

    Two utterances are built with exactly the same field values and a
    third with different paths, prefix and text. The result must be
    non-empty, have length two, contain the distinct utterance, and
    contain at least one of the two identical ones.
    """
    from persephone.utterance import Utterance, remove_duplicates

    def build(stem, text):
        # Every utterance in this test shares the same times and speaker;
        # only the file stem (paths and prefix) and the text vary.
        return Utterance(
            org_media_path=Path(stem + '.wav'),
            org_transcription_path=Path(stem + '.txt'),
            prefix=stem,
            start_time=1,
            end_time=2,
            text=text,
            speaker='Unit tester')

    utter_a1 = build('test', 'test text')
    utter_a2 = build('test', 'test text')
    utter_b = build('testb', 'testb text')

    result = remove_duplicates([utter_a1, utter_a2, utter_b])

    assert result
    assert len(result) == 2
    assert utter_b in result
    assert any(candidate in result for candidate in (utter_a1, utter_a2))
def test_mark_on_rock_rf_xv_duplicate(self, prep_org_data):
    """
    Checks utterance counts and a known text in "Mark on Rock.eaf".

    The file is read three times: with the "xv" tier, with the "rf" tier,
    and with both. A known piece of text must be found in all three
    results, and an unrelated string must not be found in the combined
    one. The raw, de-duplicated and de-duplicated-then-non-empty counts
    are compared against fixed expected values.
    """

    eaf_path = prep_org_data / "Mark on Rock.eaf"
    anbuyika_text = (" Anbuyika rudno karudyo mani arriwa::::m"
                     " arribebmeng Madjinbardi")

    xv_only = elan.utterances_from_eaf(eaf_path, ["xv"])
    rf_only = elan.utterances_from_eaf(eaf_path, ["rf"])
    combined = elan.utterances_from_eaf(eaf_path, ["xv", "rf"])

    # The known text is expected in each of the three tier selections.
    for utters in (xv_only, rf_only, combined):
        assert self.check_text_in_utters(anbuyika_text, utters)
    assert not self.check_text_in_utters("some random text", combined)

    assert len(xv_only) == 425
    assert len(rf_only) == 420
    assert len(combined) == 845

    unique = utterance.remove_duplicates(combined)
    assert len(unique) == 476

    unique_non_empty = utterance.remove_empty_text(
        utterance.remove_duplicates(combined))
    assert len(unique_non_empty) == 473
def test_utterances_from_dir(self, prep_org_data):
    """
    Checks utterance counts loaded from the directory per tier selection.

    For each tier selection the test asserts four counts, in this order:
    the raw load, the load with empty text removed, the load with
    duplicates removed, and the load with empty text removed and then
    duplicates removed.
    """

    # (tiers, raw, non-empty, unique, unique after removing empty text)
    expected_counts = [
        (["xv"], 1036, 1035, 1029, 1028),
        (["rf"], 1242, 631, 1239, 631),
        (["rf", "xv"], 2278, 1666, 1899, 1291),
    ]

    for tiers, n_raw, n_non_empty, n_unique, n_clean in expected_counts:
        loaded = elan.utterances_from_dir(prep_org_data, tiers)
        assert len(loaded) == n_raw
        assert len(utterance.remove_empty_text(loaded)) == n_non_empty
        assert len(utterance.remove_duplicates(loaded)) == n_unique
        cleaned = utterance.remove_duplicates(
            utterance.remove_empty_text(loaded))
        assert len(cleaned) == n_clean