예제 #1
0
 def test_deterministic(self, prep_org_data):
     """Check that loading and processing utterances from ELAN files is
     deterministic.

     The same directory is read twice with the same tiers, and both copies
     go through the same sequence of steps (``bkw.bkw_filter``,
     ``utterance.remove_duplicates``, label segmentation, and
     ``utterance.remove_empty_text``). The two copies must compare equal
     after every step.
     """
     org_dir = prep_org_data

     # Two independent loads of the same data.
     first_run = elan.utterances_from_dir(org_dir, ["rf", "xv"])
     second_run = elan.utterances_from_dir(org_dir, ["rf", "xv"])
     assert first_run == second_run

     # Keep only the utterances accepted by the BKW filter.
     first_run = list(filter(bkw.bkw_filter, first_run))
     second_run = list(filter(bkw.bkw_filter, second_run))
     assert first_run == second_run

     # De-duplicate each copy.
     first_run = utterance.remove_duplicates(first_run)
     second_run = utterance.remove_duplicates(second_run)
     assert first_run == second_run

     # Segment the labels of every remaining utterance.
     segment = bkw.bkw_label_segmenter.segment_labels
     first_run = [segment(item) for item in first_run]
     second_run = [segment(item) for item in second_run]
     assert first_run == second_run

     # Finally apply the empty-text removal.
     first_run = utterance.remove_empty_text(first_run)
     second_run = utterance.remove_empty_text(second_run)
     assert first_run == second_run
예제 #2
0
    def test_utterances_from_dir(self, prep_org_data):
        """Pin the utterance counts obtained for each tier selection.

        For every tier list, checks the number of utterances loaded and the
        numbers left after ``remove_empty_text``, after
        ``remove_duplicates``, and after applying ``remove_empty_text``
        followed by ``remove_duplicates``.
        """
        corpus_dir = prep_org_data

        # Columns: tiers, loaded, after remove_empty_text,
        # after remove_duplicates, after both (empty-text removal first).
        expected_counts = (
            (["xv"], 1036, 1035, 1029, 1028),
            (["rf"], 1242, 631, 1239, 631),
            (["rf", "xv"], 2278, 1666, 1899, 1291),
        )

        for tiers, n_loaded, n_nonempty, n_unique, n_both in expected_counts:
            loaded = elan.utterances_from_dir(corpus_dir, tiers)
            assert len(loaded) == n_loaded
            assert len(utterance.remove_empty_text(loaded)) == n_nonempty
            assert len(utterance.remove_duplicates(loaded)) == n_unique
            nonempty = utterance.remove_empty_text(loaded)
            assert len(utterance.remove_duplicates(nonempty)) == n_both
예제 #3
0
 def test_poly_durations(self, prep_org_data):
     """Print durations computed by ``utterance.duration``.

     ``utterance.duration`` is called once with the whole list of loaded
     utterances and once with only the first utterance. Nothing is
     asserted; the test only exercises both call forms.
     """
     corpus_dir = prep_org_data
     loaded = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])

     whole = utterance.duration(loaded)
     print("Total duration of utterances is {}".format(whole))

     head = utterance.duration(loaded[0])
     print("Total duration of the first utterance is {}".format(head))
예제 #4
0
 def test_explore_code_switching(self, prep_org_data):
     """Run ``bkw.explore_code_switching`` on the cleaned utterances.

     The loaded utterances are de-duplicated first and then passed through
     ``remove_empty_text``. The result is handed to
     ``bkw.explore_code_switching`` together with the path of
     ``codeswitched.txt`` under ``tgt_dir``.
     """
     corpus_dir = prep_org_data
     loaded = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])

     deduped = utterance.remove_duplicates(loaded)
     cleaned = utterance.remove_empty_text(deduped)

     # NOTE(review): tgt_dir is not defined inside this method; it has to
     # be resolvable from an enclosing scope -- confirm.
     out_path = tgt_dir / "codeswitched.txt"
     bkw.explore_code_switching(cleaned, out_path)
예제 #5
0
 def test_speaker_durations(self, prep_org_data):
     """Print a per-speaker table of utterance durations in minutes.

     The loaded utterances are narrowed step by step, and the count is
     printed after loading and after each of the first three steps. The
     last step keeps only utterances whose ``utterance.duration`` is below
     10000. The values from ``utterance.speaker_durations`` are then
     converted from milliseconds to minutes and printed, longest first,
     followed by the overall total.
     """
     corpus_dir = prep_org_data
     kept = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])
     print(len(kept))

     kept = utterance.remove_empty_text(kept)
     print(len(kept))

     kept = utterance.remove_duplicates(kept)
     print(len(kept))

     kept = list(filter(bkw.bkw_filter, kept))
     print(len(kept))

     # Presumably a 10-second cap in milliseconds, given the conversion
     # below -- confirm the unit returned by utterance.duration.
     kept = [item for item in kept if utterance.duration(item) < 10000]

     row_fmt = "{:20}{:10}"
     minutes_fmt = "{:<10.3f}"
     print(row_fmt.format("Speaker", "Duration"))

     longest_first = sorted(
         utterance.speaker_durations(kept),
         key=lambda pair: pair[1],
         reverse=True)

     total_mins = 0
     for speaker, millis in longest_first:
         mins = (millis * ureg.milliseconds).to(ureg.minutes).magnitude
         print(row_fmt.format(speaker, minutes_fmt.format(mins)))
         total_mins += mins

     print(row_fmt.format("Total", minutes_fmt.format(total_mins)))
예제 #6
0
    def test_empty_wav(self, prep_org_data):
        """Fail if ``utterance.remove_too_short`` changes the utterance list.

        Used to check the origin of the empty wav: when the filtered list
        differs from the loaded one, the utterances present only in the
        loaded list are pretty-printed before the test fails.
        """
        corpus_dir = prep_org_data
        loaded = elan.utterances_from_dir(corpus_dir, ["rf", "xv"])
        survivors = utterance.remove_too_short(loaded)

        if survivors != loaded:
            # Show exactly which utterances the filter removed.
            dropped = set(loaded) - set(survivors)
            report = "set(utterances) - set(filtered): {}:\n".format(
                pprint.pformat(dropped))
            print(report)
            assert False
예제 #7
0
    def test_speaker_id(self, prep_org_data):
        """Check that every utterance has a speaker and count the speakers.

        Collects the tier ids (the utterance prefix with its extension
        stripped) of utterances whose ``speaker`` is None, and the set of
        all speakers seen. The test passes when no tier lacks a speaker and
        the number of distinct speakers equals ``NUM_SPEAKERS``.
        """
        bkw_org_path = prep_org_data
        utterances = elan.utterances_from_dir(bkw_org_path, ["rf", "xv"])
        no_speaker_tiers = set()
        speakers = set()
        for utter in utterances:
            # Identity test: None is a singleton, and "is" cannot be
            # affected by a custom __eq__ on the speaker value.
            if utter.speaker is None:
                no_speaker_tiers.add(splitext(utter.prefix)[0])
            else:
                speakers.add(utter.speaker)

        # Report the offending tiers so a failure is directly actionable.
        assert not no_speaker_tiers, (
            "Tiers without a speaker: {}".format(sorted(no_speaker_tiers)))
        assert len(speakers) == NUM_SPEAKERS