예제 #1
0
 def test_deterministic(self, prep_org_data):
     """Check that loading and processing ELAN utterances is deterministic.

     The directory supplied by the ``prep_org_data`` fixture is read twice
     with the same tiers, and both results are pushed through the same
     sequence of processing steps. The two results must compare equal
     right after loading and again after every step.
     """
     corpus_dir = prep_org_data
     first = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])
     second = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])
     assert first == second

     # Processing steps, applied in this order to both results. Lambdas
     # keep every helper lookup at call time, one call per result.
     steps = (
         lambda utters: [u for u in utters if bkw.bkw_filter(u)],
         lambda utters: utterance.remove_duplicates(utters),
         lambda utters: [
             bkw.bkw_label_segmenter.segment_labels(u) for u in utters
         ],
         lambda utters: utterance.remove_empty_text(utters),
     )
     for step in steps:
         first = step(first)
         second = step(second)
         assert first == second
예제 #2
0
 def test_explore_code_switching(self, prep_org_data):
     """Run ``bkw.explore_code_switching`` over the cleaned utterances.

     Makes no assertions: it only exercises the call, passing the
     utterances and the path ``codeswitched.txt`` under ``tgt_dir``.
     """
     loaded = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
     # Duplicates are dropped first, then utterances with empty text.
     deduped = utterance.remove_duplicates(loaded)
     cleaned = utterance.remove_empty_text(deduped)
     # NOTE(review): tgt_dir is not defined inside this method; presumably
     # it is a module-level path -- confirm it is in scope.
     out_path = tgt_dir / "codeswitched.txt"
     bkw.explore_code_switching(cleaned, out_path)
예제 #3
0
 def test_speaker_durations(self, prep_org_data):
     """Print a per-speaker duration table for the filtered utterances.

     The number of utterances is printed after loading and after each of
     the first three filtering steps. Utterances whose duration is not
     below 10000 are then dropped (presumably milliseconds, i.e. 10
     seconds -- confirm) and the speakers are listed from longest to
     shortest total duration, in minutes, followed by a grand total.
     Makes no assertions.
     """
     utters = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
     print(len(utters))
     utters = utterance.remove_empty_text(utters)
     print(len(utters))
     utters = utterance.remove_duplicates(utters)
     print(len(utters))
     utters = [u for u in utters if bkw.bkw_filter(u)]
     print(len(utters))
     utters = [u for u in utters if utterance.duration(u) < 10000]

     row_fmt = "{:20}{:10}"
     mins_fmt = "{:<10.3f}"
     print(row_fmt.format("Speaker", "Duration"))

     # Longest-speaking speakers first.
     ranked = sorted(
         utterance.speaker_durations(utters),
         key=lambda pair: pair[1],
         reverse=True,
     )
     total_mins = 0
     for speaker, duration_ms in ranked:
         # Durations are treated as milliseconds and reported in minutes.
         as_minutes = (duration_ms * ureg.milliseconds).to(ureg.minutes)
         minutes = as_minutes.magnitude
         total_mins += minutes
         print(row_fmt.format(speaker, mins_fmt.format(minutes)))
     print(row_fmt.format("Total", mins_fmt.format(total_mins)))
예제 #4
0
def test_remove_duplicates_same_time():
    """Check ``remove_duplicates`` on utterances sharing start and end times.

    Two utterances are built from identical field values and a third one
    has the same times and speaker but different paths, prefix and text.
    The result must hold exactly two utterances: the third one plus one
    of the two identical ones.
    """
    from persephone.utterance import Utterance, remove_duplicates

    # Fields common to all three utterances.
    shared_fields = dict(start_time=1, end_time=2, speaker='Unit tester')
    a_fields = dict(org_media_path=Path('test.wav'),
                    org_transcription_path=Path('test.txt'),
                    prefix='test',
                    text='test text',
                    **shared_fields)
    b_fields = dict(org_media_path=Path('testb.wav'),
                    org_transcription_path=Path('testb.txt'),
                    prefix='testb',
                    text='testb text',
                    **shared_fields)

    # Two separately constructed utterances with identical field values.
    first_a = Utterance(**a_fields)
    second_a = Utterance(**a_fields)
    only_b = Utterance(**b_fields)

    deduped = remove_duplicates([first_a, second_a, only_b])
    assert deduped
    assert len(deduped) == 2
    assert only_b in deduped
    assert any(utter in deduped for utter in (first_a, second_a))
예제 #5
0
    def test_mark_on_rock_rf_xv_duplicate(self, prep_org_data):
        """Check utterance extraction from ``Mark on Rock.eaf`` per tier.

        The file is read with the "xv" tier, the "rf" tier and both tiers
        together. A known transcription must be found in all three
        results, an unrelated string must not be found, and the utterance
        counts before and after de-duplication and empty-text removal
        must match the expected values.
        """
        eaf_path = prep_org_data / "Mark on Rock.eaf"
        expected_text = (" Anbuyika rudno karudyo mani arriwa::::m"
                         " arribebmeng Madjinbardi")

        from_xv = elan.utterances_from_eaf(eaf_path, ["xv"])
        from_rf = elan.utterances_from_eaf(eaf_path, ["rf"])
        from_both = elan.utterances_from_eaf(eaf_path, ["xv", "rf"])

        # The known text must be found whichever tier selection was used.
        for utters in (from_xv, from_rf, from_both):
            assert self.check_text_in_utters(expected_text, utters)
        assert not self.check_text_in_utters("some random text", from_both)

        assert len(from_xv) == 425
        assert len(from_rf) == 420
        assert len(from_both) == 845

        deduped = utterance.remove_duplicates(from_both)
        assert len(deduped) == 476
        deduped_nonempty = utterance.remove_empty_text(
            utterance.remove_duplicates(from_both))
        assert len(deduped_nonempty) == 473
예제 #6
0
    def test_utterances_from_dir(self, prep_org_data):
        """Check utterance counts read from the corpus directory per tier.

        For each tier selection the test checks the raw number of
        utterances, the number left after removing empty text, the number
        left after removing duplicates, and the number left after doing
        both (empty text removed first, then duplicates).
        """
        corpus_dir = prep_org_data

        # (tiers, raw, without empty text, without duplicates, both)
        expected_counts = (
            (["xv"], 1036, 1035, 1029, 1028),
            (["rf"], 1242, 631, 1239, 631),
            (["rf", "xv"], 2278, 1666, 1899, 1291),
        )
        for tiers, n_raw, n_nonempty, n_unique, n_clean in expected_counts:
            utters = elan.utterances_from_dir(corpus_dir, tiers)
            assert len(utters) == n_raw
            assert len(utterance.remove_empty_text(utters)) == n_nonempty
            assert len(utterance.remove_duplicates(utters)) == n_unique
            nonempty = utterance.remove_empty_text(utters)
            assert len(utterance.remove_duplicates(nonempty)) == n_clean