def reorg_noncollapsed(f):
    """Clean one non-collapsed TextGrid and write the result under data_dir.

    Each interval mark is normalised through ``sub_pattern``; intervals whose
    cleaned mark is empty are dropped.  Surviving intervals are widened by
    0.1 s on each side (clamped to [0, maxTime]); when padding makes a new
    interval overlap its predecessor, the two are merged.
    """
    padding = 0.1
    print(f)
    tg_path = os.path.join(noncollapsed_dir, f)
    tg = TextGrid()
    tg.read(tg_path)
    new_tg = TextGrid(maxTime=tg.maxTime)
    new_tg_path = tg_path.replace(noncollapsed_dir, data_dir)
    for tier in tg.tiers:
        cleaned_tier = IntervalTier(name=tier.name, maxTime=tg.maxTime)
        for interval in tier:
            mark = sub_pattern.sub(' ', interval.mark).strip()
            if not mark:
                continue
            begin = interval.minTime - padding
            if begin < 0:
                begin = 0
            end = interval.maxTime + padding
            if end > tg.maxTime:
                end = tg.maxTime
            try:
                cleaned_tier.add(begin, end, mark)
            except ValueError:
                # Padded interval overlaps the previous one: merge them.
                cleaned_tier[-1].maxTime = end
                cleaned_tier[-1].mark += ' ' + mark
        print(len(cleaned_tier))
        new_tg.append(cleaned_tier)
    new_tg.write(new_tg_path)
    def export_segments(self, output_directory):
        """Write the corpus VAD speech segments as one TextGrid per file.

        Segments are grouped by source filename under a single 'segments'
        pseudo-speaker tier, each interval labelled 'speech'.  Interval
        ends are clamped to the wav duration before writing.
        """
        from decimal import Decimal
        from textgrid import TextGrid, IntervalTier

        per_file = {}
        for utt, segment in self.corpus.vad_segments.items():
            filename, utt_begin, utt_end = segment
            entry = [Decimal(utt_begin), Decimal(utt_end), 'speech']
            speakers = per_file.setdefault(filename, {})
            speakers.setdefault('segments', []).append(entry)
        for filename, speaker_dict in per_file.items():
            try:
                speaker_directory = os.path.join(
                    output_directory,
                    self.corpus.file_directory_mapping[filename])
            except KeyError:
                # No mapping for this file: fall back to the root directory.
                speaker_directory = output_directory
            os.makedirs(speaker_directory, exist_ok=True)
            max_time = self.corpus.get_wav_duration(filename)
            tg = TextGrid(maxTime=max_time)
            for speaker in sorted(speaker_dict):
                tier = IntervalTier(name=speaker, maxTime=max_time)
                for w in speaker_dict[speaker]:
                    # Clamp the segment end to the audio duration.
                    if w[1] > max_time:
                        w[1] = max_time
                    tier.add(*w)
                tg.append(tier)
            tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
def convert_ctm_to_textgrid(ctm, textgrid):
    """Convert a CTM file into a two-tier (words / phonemes) TextGrid.

    CTM rows whose first token starts with '@' are treated as phoneme
    entries; labels matched by the ``besi`` pattern have their trailing
    two-character position marker stripped.  Intervals that fail to add
    (overlaps, zero length) are reported and skipped.
    """
    words, phonemes = [], []
    with open(ctm, encoding='utf-8') as handle:
        for line in handle:
            tok = line.strip().split()
            label = tok[4]
            begin = float(tok[2])
            duration = float(tok[3])
            if tok[0][0] == '@':
                if besi.match(label):
                    label = label[:-2]
                phonemes.append((label, begin, duration))
            else:
                words.append((label, begin, duration))
    word_tier = IntervalTier(name='words')
    phoneme_tier = IntervalTier(name='phonemes')
    for label, begin, duration in words:
        try:
            word_tier.add(round(begin, 2), round(begin + duration, 2), label)
        except ValueError:
            print("Error in word seg: " + label)
    for label, begin, duration in phonemes:
        try:
            phoneme_tier.add(round(begin, 2), round(begin + duration, 2), label)
        except ValueError:
            print("Error in phoneme seg: " + label)
    tg = TextGrid()
    tg.append(word_tier)
    tg.append(phoneme_tier)
    tg.write(textgrid)
示例#4
0
def ctm_to_textgrid(phone_ctm, out_directory, utt2dur, frameshift=0.01):
    """Export phone CTM alignments as one phones-tier TextGrid per utterance.

    Args:
        phone_ctm: dict mapping utterance id -> list of [begin, end, phone]
            intervals (mutated in place: ends are snapped, labels cleaned).
        out_directory: directory the .TextGrid files are written into.
        utt2dur: input consumed by generate_utt2dur() to obtain durations.
        frameshift: frame shift in seconds, used to snap the final interval
            to the utterance end (absorbs frame-rounding error).

    Per-utterance failures are collected and written to
    out_directory/output_errors.txt rather than aborting the export.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    os.makedirs(out_directory, exist_ok=True)

    utt2dur_mapping = generate_utt2dur(utt2dur)

    for k, v in sorted(phone_ctm.items()):
        maxtime = Decimal(str(utt2dur_mapping[k]))
        try:
            tg = TextGrid(maxTime=maxtime)
            phonetier = IntervalTier(name='phones', maxTime=maxtime)
            for interval in v:
                # Snap the last interval to the utterance end (rounding fix).
                if maxtime - interval[1] < frameshift:
                    interval[1] = maxtime
                # Remove B/E/I position suffix and stress digits from phoneme.
                # (raw string fixes the invalid "\d" escape in the original)
                interval[2] = re.sub(r"\d+", "", interval[2].split('_')[0])
                phonetier.add(*interval)
            tg.append(phonetier)
            outpath = os.path.join(out_directory, k + '.TextGrid')
            tg.write(outpath)
        except Exception:
            textgrid_write_errors[k] = traceback.format_exc()
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with io_open(error_log, 'w', encoding='utf-8') as f:
            f.write(
                u'The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write(u'{}:\n'.format(k))
                f.write(u'{}\n\n'.format(v))
示例#5
0
 def loadOrGenerate(self):
     """Load this file's TextGrid from disk, or build a default one.

     When no .TextGrid exists yet, a new grid spanning the audio duration
     (1 s fallback) is created — with a placeholder 'text' sentence tier
     unless both .ult and .txt companions are present — and its path is
     registered with the data layer.  Then an alignment tier is located
     (empty ones are dropped as stale) or the frames tier is generated.
     """
     fname = self.app.Data.checkFileLevel('.TextGrid', shoulderror=False)
     if fname:
         self.TextGrid = self.fromFile(fname)
     else:
         minTime = 0.
         if not hasattr(self.app.Audio, 'duration'):
             self.app.Audio.reset()
         try:
             maxTime = self.app.Audio.duration
         except Exception:  # narrowed from bare except: keep best-effort fallback
             warn(
                 'Audio has no duration attribute after calling reset(), defaulting to 1 second'
             )
             maxTime = 1.
         self.TextGrid = TextGridFile(maxTime=maxTime)
         keys = self.app.Data.getFileLevel('all')
         if not ('.ult' in keys and '.txt' in keys):
             sentenceTier = IntervalTier("text")
             sentenceTier.add(minTime, maxTime, "text")
             self.TextGrid.append(sentenceTier)
         fname = self.app.Data.unrelativize(
             self.app.Data.getCurrentFilename() + '.TextGrid')
         self.app.Data.setFileLevel('.TextGrid', fname)
     names = self.TextGrid.getNames()
     for i, n in enumerate(names):
         if n in ALIGNMENT_TIER_NAMES:
             # An empty alignment tier is stale: drop it and regenerate.
             if len(self.TextGrid[i]) == 0:
                 self.TextGrid.pop(i)
                 break
             else:
                 self.frameTierName = n
                 return
     self.genFramesTier()
示例#6
0
 def loadOrGenerate(self):
     """Load this file's TextGrid from disk, or build a default one.

     When no .TextGrid exists yet, a new grid spanning the audio
     duration is created with a placeholder 'text' sentence tier, and
     its path is registered with the data layer.  Then an alignment
     tier is located (an empty one is dropped as stale) or the frames
     tier is generated.

     NOTE(review): unlike a similar variant of this method elsewhere in
     the codebase, self.frameTierName is never set here when an
     alignment tier is found — confirm that is intentional.
     """
     fname = self.app.Data.checkFileLevel('.TextGrid', shoulderror=False)
     if fname:
         self.TextGrid = self.fromFile(fname)
     else:
         minTime = 0.
         if not hasattr(self.app.Audio, 'duration'):
             self.app.Audio.reset()
         maxTime = self.app.Audio.duration
         self.TextGrid = TextGridFile(maxTime=maxTime)
         sentenceTier = IntervalTier("text")
         sentenceTier.add(minTime, maxTime, "text")
         # Appends to the raw tiers list (bypasses TextGridFile.append).
         self.TextGrid.tiers.append(sentenceTier)
         fname = self.app.Data.unrelativize(
             self.app.Data.getCurrentFilename() + '.TextGrid')
         self.app.Data.setFileLevel('.TextGrid', fname)
     names = self.TextGrid.getNames()
     for i, n in enumerate(names):
         if n in ALIGNMENT_TIER_NAMES:
             # An empty alignment tier is stale: drop it and regenerate.
             if len(self.TextGrid[i]) == 0:
                 self.TextGrid.pop(i)
                 break
             else:
                 return
     self.genFramesTier()
示例#7
0
def parseFile(path, fn, out_dir="/Users/elias/Desktop/TextGrids"):
    """Parse a MAUS-style annotation file and write a words+phones TextGrid.

    Args:
        path: path of the annotation file to read.
        fn: original file name; its stem names the output TextGrid.
        out_dir: directory the .TextGrid is written into.  The default
            preserves the previously hard-coded location for backward
            compatibility.

    Returns None early (writing nothing) when getMAU() yields no segments
    or getMaxTime() reports -1.
    """
    filename = fn.split(".")[0]  # stem of the file name
    with open(path, "r") as f1:
        lines = f1.readlines()
    SAM = getSAM(lines)
    allSegs = getMAU(lines, SAM, filename)
    if allSegs is None:
        return
    segs = [getSegInfo(seg) for seg in allSegs]
    words = getWords(lines, allSegs, filename)

    maxtime = getMaxTime(allSegs)
    if maxtime == -1:
        return
    tg = TextGrid(maxTime=maxtime)
    wordtier = IntervalTier(name='words', maxTime=maxtime)
    phonetier = IntervalTier(name='phones', maxTime=maxtime)
    for interval in words:
        wordtier.add(*interval)
    for interval in segs:
        phonetier.add(*interval)
    tg.append(wordtier)
    tg.append(phonetier)
    outpath = os.path.join(out_dir, "%s.TextGrid" % filename)
    tg.write(outpath)
def createTextGrid(data, tierName = "words"):
	"""Build a single-tier TextGrid from (name, time, dur, words) records.

	Intervals are laid out back-to-back from time 0, each spanning the
	record's duration and labelled with makeSentence(words).  The name
	and time fields of each record are ignored.
	"""
	tier = IntervalTier(tierName)
	grid = TextGrid()
	cursor = 0
	for _name, _time, dur, words in data:
		tier.add(cursor, cursor + dur, makeSentence(words))
		cursor += dur
	grid.append(tier)
	return grid
示例#9
0
def ctm_to_textgrid(word_ctm,
                    phone_ctm,
                    out_directory,
                    corpus,
                    dictionary,
                    frameshift=0.01):
    """Export word/phone CTM alignments as per-file, per-speaker TextGrids.

    Each output file gets a '<speaker> - words' and '<speaker> - phones'
    tier pair.  Consecutive silence phones are merged; phone intervals
    that overlap a preceding silence are clipped against it.  Failures
    are collected per file and reported in output_errors.txt instead of
    aborting the export.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    os.makedirs(out_directory, exist_ok=True)

    silences = {dictionary.optional_silence, dictionary.nonoptional_silence}
    for filename, speaker_dict in sorted(word_ctm.items()):
        maxtime = corpus.get_wav_duration(filename)
        try:
            speaker_directory = os.path.join(
                out_directory, corpus.file_directory_mapping[filename])
            # Bug fix: the per-file subdirectory was never created, so
            # tg.write() below failed for any not-yet-existing directory.
            os.makedirs(speaker_directory, exist_ok=True)
            tg = TextGrid(maxTime=maxtime)
            for speaker in corpus.speaker_ordering[filename]:
                words = speaker_dict[speaker]
                word_tier = IntervalTier(name='{} - words'.format(speaker),
                                         maxTime=maxtime)
                phone_tier = IntervalTier(name='{} - phones'.format(speaker),
                                          maxTime=maxtime)
                for w in words:
                    word_tier.add(*w)
                for p in phone_ctm[filename][speaker]:
                    # Merge back-to-back silences into one interval.
                    if len(phone_tier) > 0 and phone_tier[
                            -1].mark in silences and p[2] in silences:
                        phone_tier[-1].maxTime = p[1]
                    else:
                        # Resolve overlaps against a preceding silence.
                        if len(phone_tier) > 0 and p[2] in silences and p[
                                0] < phone_tier[-1].maxTime:
                            p = phone_tier[-1].maxTime, p[1], p[2]
                        elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                        phone_tier[-1].mark in silences:
                            phone_tier[-1].maxTime = p[0]
                        phone_tier.add(*p)
                tg.append(word_tier)
                tg.append(phone_tier)
            tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
        except Exception:
            textgrid_write_errors[filename] = traceback.format_exc()
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            f.write(
                'The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
    def export_classification(self, output_directory):
        """Export speaker classification (or clustering) results.

        For a segmented corpus, writes one TextGrid per source file with
        one tier per classified speaker.  Otherwise writes each speaker's
        utterance list to <speaker>/utterances.txt.
        """
        if self.cluster:
            self.cluster_utterances()
        else:
            self.get_classification_stats()
        from decimal import Decimal
        from textgrid import TextGrid, IntervalTier
        spk2utt_path = os.path.join(self.classify_directory, 'spk2utt')
        utt2spk_path = os.path.join(self.classify_directory, 'utt2spk')
        if self.corpus.segments:
            utt2spk = load_scp(utt2spk_path)
            file_dict = {}
            for utt, segment in self.corpus.segments.items():
                filename, utt_begin, utt_end = segment.split(' ')
                utt_begin = Decimal(utt_begin)
                utt_end = Decimal(utt_end)
                if filename not in file_dict:
                    file_dict[filename] = {}
                speaker = utt2spk[utt]
                text = self.corpus.text_mapping[utt]
                if speaker not in file_dict[filename]:
                    file_dict[filename][speaker] = []
                file_dict[filename][speaker].append([utt_begin, utt_end, text])
            for filename, speaker_dict in file_dict.items():
                try:
                    speaker_directory = os.path.join(
                        output_directory,
                        self.corpus.file_directory_mapping[filename])
                except KeyError:
                    speaker_directory = output_directory
                # Bug fix: the target directory was never created in this
                # branch, so tg.write() failed for unseen subdirectories
                # (the else-branch below already creates its directories).
                os.makedirs(speaker_directory, exist_ok=True)
                max_time = self.corpus.get_wav_duration(filename)
                tg = TextGrid(maxTime=max_time)
                for speaker in sorted(speaker_dict.keys()):
                    words = speaker_dict[speaker]
                    tier = IntervalTier(name=speaker, maxTime=max_time)
                    for w in words:
                        # Clamp segment ends to the audio duration.
                        if w[1] > max_time:
                            w[1] = max_time
                        tier.add(*w)
                    tg.append(tier)
                tg.write(
                    os.path.join(speaker_directory, filename + '.TextGrid'))

        else:
            spk2utt = load_scp(spk2utt_path)
            for speaker, utts in spk2utt.items():
                speaker_dir = os.path.join(output_directory, speaker)
                os.makedirs(speaker_dir, exist_ok=True)
                with open(os.path.join(speaker_dir, 'utterances.txt'),
                          'w',
                          encoding='utf8') as f:
                    for u in utts:
                        f.write('{}\n'.format(u))
示例#11
0
 def test_time_to_frame_interval_tier_short_seg(self):
     """Sub-frame segments must each be extended to at least one frame."""
     tier = IntervalTier('test', 0, 0.01)
     tier.add(0, 0.003, "a")
     tier.add(0.003, 0.01, "b")
     frame_tier = utterance.time_to_frame_interval_tier(tier, 5)
     self.assertEqual(frame_tier.minTime, 0)
     self.assertEqual(frame_tier.maxTime, 2)
     # Each 0.003/0.007 s segment becomes exactly one 5 ms frame.
     for interval, (lo, hi) in zip(frame_tier.intervals, [(0, 1), (1, 2)]):
         self.assertEqual(interval.minTime, lo)
         self.assertEqual(interval.maxTime, hi)
示例#12
0
def generator_textgrid(maxtime, lines, output):
    """Write a single 'line'-tier TextGrid from '<start> <end> <word>' lines.

    A tiny margin is added to each interval's start so that consecutive
    intervals sharing a boundary do not collide in the tier.
    View the result with Praat: https://www.fon.hum.uva.nl/praat/

    Args:
        maxtime: total duration of the grid.
        lines: iterable of whitespace-separated 'start end word' strings.
        output: path the .TextGrid is written to.
    """
    # (removed unused locals 'interval' and 'i' from the original)
    margin = 0.0001

    tg = TextGrid(maxTime=maxtime)
    linetier = IntervalTier(name="line", maxTime=maxtime)

    for l in lines:
        s, e, w = l.split()
        linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w)

    tg.append(linetier)
    print("successfully generator {}".format(output))
    tg.write(output)
def createNew(textgrid, tier_name, VERBOSE=False):
	"""Return a new TextGrid containing a cleaned copy of the named tier.

	Pause intervals (per isPause) become empty marks; every other mark is
	passed through fixString().  The cleaned tier is named
	'<tier_name>_clean'.

	Args:
		textgrid: source TextGrid.
		tier_name: name of the tier to clean (first match is used).
		VERBOSE: when truthy, print the old and new tiers.
	"""
	tiers = textgrid.getList(tier_name)
	tier = tiers[0]
	new_tier = IntervalTier(tier_name + '_clean')
	new_txtgrid = TextGrid()
	if VERBOSE:  # idiom fix: truthiness instead of '== True'
		print("Old tier: %s" % tier)
	for interval in tier:
		if isPause(interval.mark):
			new_tier.add(interval.minTime, interval.maxTime, '')
		else:
			new_tier.add(interval.minTime, interval.maxTime,
			             fixString(interval.mark))
	new_txtgrid.append(new_tier)
	if VERBOSE:
		print("New tier: %s" % new_tier)
	return new_txtgrid
示例#14
0
def convert_ctm_to_textgrid(ctms, textgrid):
    """Merge several CTM files into one multi-tier TextGrid.

    Each CTM becomes one tier named after the file's stem.

    Bug fix: the original re-created the TextGrid and re-wrote the same
    output path inside the loop, so only the last CTM's tier survived.
    The grid is now built once and written once with all tiers.

    Args:
        ctms: iterable of pathlib.Path-like CTM files (uses .stem).
        textgrid: output .TextGrid path.
    """
    tg = TextGrid()
    for ctm in ctms:
        tiername = ctm.stem
        entries = []
        with open(ctm, encoding='utf-8') as f:
            for l in f:
                tok = l.strip().split()
                entries.append((tok[4], float(tok[2]), float(tok[3])))
        t = IntervalTier(name=tiername)
        for word, beg, dur in entries:
            try:
                t.add(round(beg, 2), round(beg + dur, 2), word)
            except ValueError:
                # Overlapping/zero-length interval: report and skip.
                print("Error in seg: " + word)
        tg.append(t)
    tg.write(textgrid)
def read_segment(val: Segment) -> IntervalTier:
    """Read a Segment message and save it to an IntervalTier object.

    Args:
        val: A Segment message as defined in data_utterance.pb.

    Returns:
        The Segment message saved in an IntervalTier object.

    Raises:
        ValueError: if the symbol/start/end arrays disagree with num_item.
    """
    labels = val.symbol
    starts = mat_to_numpy(val.start_time)
    ends = mat_to_numpy(val.end_time)

    if not (len(labels) == len(starts) == len(ends) == val.num_item):
        raise ValueError("Interval item number is not consistent!")

    tier = IntervalTier(minTime=starts[0], maxTime=ends[-1])
    for label, begin, end in zip(labels, starts, ends):
        tier.add(begin, end, label)
    return tier
def time_to_frame_interval_tier(time_tier: IntervalTier,
                                shift) -> IntervalTier:
    """Convert an IntervalTier in time to frame.

    Args:
        time_tier: IntervalTier represented in seconds.
        shift: Window shift in ms.

    Returns:
        frame_tier: IntervalTier represented in frames.

    Raises:
        ValueError: when extending too-short segments pushes an interval
            past the tier's maximum frame (i.e. consecutive sub-frame
            segments that cannot all be given at least one frame).
    """
    max_frame = time_to_frame(time_tier.maxTime, shift)
    frame_tier = IntervalTier(time_tier.name, 0, max_frame)

    # Deal with (occasionally) very short segments -- less than a frame shift
    # If we have consecutive very small segments then the function will raise a
    # ValueError
    # start_shift carries the number of frames "borrowed" from the next
    # segment when the previous one had to be extended to a minimum of
    # one frame; it is consumed (and reset) at the next iteration.
    start_shift = 0
    for each_interval in time_tier.intervals:
        curr_min_frame = time_to_frame(each_interval.minTime, shift)
        if start_shift > 0:
            logging.warning("Last segment is too short, have to cut the %d "
                            "frame(s) from the beginning of the current "
                            "segment.", start_shift)
            curr_min_frame += start_shift
            start_shift = 0
        curr_max_frame = time_to_frame(each_interval.maxTime, shift)
        if curr_min_frame >= curr_max_frame:
            # Zero/negative length in frames: force a one-frame interval
            # and remember how much the next segment must yield.
            curr_max_frame = curr_min_frame + 1
            start_shift = curr_max_frame - curr_min_frame
            logging.warning("The current segment is too short, extend it for "
                            "%d frame(s).", start_shift)
        if curr_max_frame > frame_tier.maxTime:
            raise ValueError("Extreme short segments in the tier, please fix "
                             "these.")
        frame_tier.add(curr_min_frame, curr_max_frame, each_interval.mark)
    return frame_tier
示例#17
0
    def genFramesTier(self):
        """Build and persist a 'frames' PointTier from the DICOM frame times.

        Also records NumberOfFrames in the data layer, grows the grid's
        maxTime if needed, and — when .ult and .txt companions exist —
        prepends a sentence tier read from the .txt file.  The grid is
        written to disk and re-loaded so self.TextGrid reflects the file.
        """
        debug('generating frames tier for %s' %
              self.app.Data.getCurrentFilename())
        self.frameTierName = 'frames'
        times = self.app.Dicom.getFrameTimes()
        self.app.Data.setFileLevel("NumberOfFrames", len(times))
        try:
            maxTime = max(self.app.Audio.duration, times[-1])
        except AttributeError:
            # No audio duration available; fall back to the last frame time.
            maxTime = times[-1]
        tier = PointTier('frames', maxTime=maxTime)
        for f, t in enumerate(times):
            tier.addPoint(Point(t, str(f)))
        if not self.TextGrid.maxTime or maxTime > self.TextGrid.maxTime:
            self.TextGrid.maxTime = maxTime
        self.TextGrid.append(tier)

        keys = self.app.Data.getFileLevel('all')
        if '.ult' in keys and '.txt' in keys:
            fname = self.app.Data.unrelativize(
                self.app.Data.getFileLevel('.txt'))
            # Idiom fix: context manager guarantees the file handle is
            # closed even if decoding raises.
            with open(fname, 'rb') as f:
                s = util.decode_bytes(f.read())
            if s:
                line = s.splitlines()[0]
                sentenceTier = IntervalTier("sentence")
                sentenceTier.add(0, self.app.Audio.duration, line)
                self.TextGrid.append(sentenceTier)
                # Move the just-appended sentence tier to the front.
                self.TextGrid.tiers = [self.TextGrid.tiers[-1]
                                       ] + self.TextGrid.tiers[:-1]

        path = self.app.Data.unrelativize(
            self.app.Data.getFileLevel('.TextGrid'))
        self.TextGrid.write(path)
        self.TextGrid = TextGridFile.fromFile(path)
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, frameshift=0.01):
    """Export word/phone CTM alignments to TextGrids.

    Two layouts are produced depending on the corpus:
      * unsegmented corpus: one file per utterance with plain 'words'
        and 'phones' tiers, interval ends snapped to the wav duration
        to absorb frame-rounding error;
      * segmented corpus: one file per source recording with per-speaker
        '<speaker> - words' / '<speaker> - phones' tier pairs, merging
        consecutive silences and clipping overlaps against a preceding
        silence.

    Per-file failures are collected and written to
    out_directory/output_errors.txt instead of aborting the export.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            # Unsegmented corpora have a single speaker per utterance.
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                relative = corpus.file_directory_mapping[k]
                if relative:
                    speaker_directory = os.path.join(out_directory, relative)
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[k] = '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
    else:
        silences = {dictionary.optional_silence, dictionary.nonoptional_silence}
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            maxtime = corpus.get_wav_duration(filename)
            try:
                # NOTE(review): unlike the branch above, speaker_directory is
                # never created here before tg.write() — confirm the mapped
                # directories are guaranteed to exist for segmented corpora.
                speaker_directory = os.path.join(out_directory, corpus.file_directory_mapping[filename])
                tg = TextGrid(maxTime=maxtime)
                for speaker in corpus.speaker_ordering[filename]:
                    words = speaker_dict[speaker]
                    word_tier_name = '{} - words'.format(speaker)
                    phone_tier_name = '{} - phones'.format(speaker)
                    word_tier = IntervalTier(name=word_tier_name, maxTime=maxtime)
                    phone_tier = IntervalTier(name=phone_tier_name, maxTime=maxtime)
                    for w in words:
                        word_tier.add(*w)
                    for p in phone_ctm[filename][speaker]:
                        # Merge back-to-back silences into one interval.
                        if len(phone_tier) > 0 and phone_tier[-1].mark in silences and p[2] in silences:
                            phone_tier[-1].maxTime = p[1]
                        else:
                            # Resolve overlaps against a preceding silence:
                            # either shift this interval's start forward, or
                            # pull the silence's end back.
                            if len(phone_tier) > 0 and p[2] in silences and p[0] < phone_tier[-1].maxTime:
                                p = phone_tier[-1].maxTime, p[1], p[2]
                            elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                            phone_tier[-1].mark in silences:
                                phone_tier[-1].maxTime = p[0]
                            phone_tier.add(*p)
                    tg.append(word_tier)
                    tg.append(phone_tier)
                tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[filename] = '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            f.write('The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n')
            for k,v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
 #print(transcription)
 if len(transcription) < 3 and u.duration > 25:
     continue
 if u.duration > 25:
     print(d, u.duration)
     print([(x._type_node['label'], x.begin, x.duration)
            for x in u.word])
     error
 transcription = ' '.join(x[0] for x in transcription)
 begin = u.begin - 0.075
 if begin < 0:
     begin = 0
 end = u.end + 0.075
 if end > duration:
     end = duration
 utterance_tier.add(begin, end, transcription)
 utt_duration = end - begin
 utt_name = '{}_{}_{}'.format(d, begin, end)
 utt_wav_path = os.path.join(lab_dir, utt_name + '.wav')
 if not os.path.exists(utt_wav_path):
     extract_audio(wav_path, utt_wav_path, begin, end, padding=0)
 lab_path = os.path.join(lab_dir, utt_name + '.lab')
 with open(lab_path, 'w') as f:
     f.write(transcription)
 trans_path = os.path.join(lab_dir, utt_name + '.txt')
 with open(trans_path, 'w') as f:
     f.write('{}\t{}\t0\t{}\t{}'.format(speaker, speaker,
                                        utt_duration,
                                        transcription))
 utt_tg_path = os.path.join(lab_dir, utt_name + '.TextGrid')
 utt_tg = TextGrid(maxTime=utt_duration)
示例#20
0
class TestUtterance(unittest.TestCase):
    def setUp(self):
        self.utt = utterance.Utterance()
        self.float_mat = FloatMatrix()
        self.int_mat = Int32Matrix()
        self.bool_mat = BinaryMatrix()
        self.seg = Segment()
        self.tier = IntervalTier('test', 0, 2)
        self.tier.add(0, 1, "a")
        self.tier.add(1, 2, "b")

    def tearDown(self):
        pass

    def test_empty_init(self):
        self.assertTrue(isinstance(self.utt, utterance.Utterance))

    def test_full_init(self):
        utt = utterance.Utterance(np.array([1, 2, 3, 4, 5]), 16000, "test")
        self.assertTrue(isinstance(utt, utterance.Utterance))

    def test_invalid_init(self):
        try:
            utt = utterance.Utterance(np.array([1, 2, 3, 4, 5]), text="test")
        except ValueError:
            pass

    def test_rw_internal(self):
        pb = self.utt.write_internal()
        self.utt.read_internal(pb)

    def test_rw(self):
        with tempfile.TemporaryDirectory() as tpf:
            output_file = os.path.join(tpf, "test.data")
            self.utt.write(output_file)
            self.assertTrue(os.path.exists(output_file))
            self.utt.read(output_file)

    def test_rw_mat_empty(self):
        np_mat = np.array([])
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 0)
        self.assertEqual(self.float_mat.num_col, 0)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_scalar(self):
        np_mat = np.array([1])
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 1)
        self.assertEqual(self.float_mat.num_col, 1)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_row_vec(self):
        np_mat = np.array([1, 2, 3, 4])
        num_ele = len(np_mat)
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 1)
        self.assertEqual(self.float_mat.num_col, num_ele)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_col_vec(self):
        np_mat = np.array([[1], [2], [3], [4]])
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, num_row)
        self.assertEqual(self.float_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_int_mat(self):
        np_mat = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
        np_mat.astype(int)
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.int_mat)
        self.assertEqual(self.int_mat.num_row, num_row)
        self.assertEqual(self.int_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.int_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_float_mat(self):
        np_mat = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, num_row)
        self.assertEqual(self.float_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_binary_mat(self):
        np_mat = np.array([[1, 0], [0, 1], [0, 0], [1, 1]])
        np_mat.astype(bool)
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.bool_mat)
        self.assertEqual(self.bool_mat.num_row, num_row)
        self.assertEqual(self.bool_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.bool_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_segment(self):
        utterance.write_segment(self.tier, self.seg)
        self.assertEqual(self.seg.num_item, len(self.tier.intervals))
        interval = utterance.read_segment(self.seg)
        self.assertTrue(isinstance(interval, IntervalTier))
        self.assertEqual(len(self.tier.intervals), len(interval.intervals))

    def test_rw_data(self):
        data = self.utt.data
        self.utt.data = data

    def test_rw_wav(self):
        input_wav = np.array([1, 2, 3, 4])
        self.utt.wav = input_wav
        wav = self.utt.wav
        self.assertTrue((input_wav == wav).any())

    def test_rw_fs(self):
        self.utt.fs = 16000
        fs = self.utt.fs
        self.assertEqual(fs, 16000)

    def test_set_invalid_fs(self):
        try:
            self.utt.fs = 0
        except ValueError:
            pass

        try:
            self.utt.fs = -2
        except ValueError:
            pass

    def test_rw_text(self):
        self.utt.text = "test"
        text = self.utt.text
        self.assertEqual(text, "test")

    def test_rw_align(self):
        align = TextGrid()
        align.read("data/test.TextGrid")
        self.utt.align = align
        align_recover = self.utt.align
        self.assertEqual(len(align_recover.tiers), len(align.tiers))

    def test_rw_ppg(self):
        ppg = np.random.uniform(0, 1, (100, 5816))
        self.utt.ppg = ppg
        ppg_recover = self.utt.ppg
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((ppg_recover - ppg)**2).sum(), 0)

    def test_rw_monophone_ppg(self):
        """Round-trip a monophone PPG matrix (stored in single precision,
        so only near-equality is required)."""
        mono_ppg = np.random.uniform(0, 1, (100, 40))
        self.utt.monophone_ppg = mono_ppg
        diff = self.utt.monophone_ppg - mono_ppg
        self.assertAlmostEqual((diff ** 2).sum(), 0)

    def test_rw_phone(self):
        """Round-trip the phone tier and compare interval counts."""
        self.utt.phone = self.tier
        recovered = self.utt.phone
        self.assertEqual(len(recovered.intervals), len(self.tier.intervals))

    def test_rw_word(self):
        """Round-trip the word tier and compare interval counts."""
        self.utt.word = self.tier
        recovered = self.utt.word
        self.assertEqual(len(recovered.intervals), len(self.tier.intervals))

    def test_rw_lab(self):
        """Round-trip a label sequence exactly."""
        lab = np.array([1, 2, 3, 4, 5, 5, 6, 7])
        self.utt.lab = lab
        self.assertTrue((lab == self.utt.lab).all())

    def test_rw_utterance_id(self):
        """Round-trip the utterance id."""
        self.utt.utterance_id = "id"
        self.assertEqual(self.utt.utterance_id, "id")

    def test_rw_speaker_id(self):
        """Round-trip the speaker id."""
        self.utt.speaker_id = "id"
        self.assertEqual(self.utt.speaker_id, "id")

    def test_rw_dialect(self):
        """Round-trip the dialect tag."""
        self.utt.dialect = "EN_CN"
        self.assertEqual(self.utt.dialect, "EN_CN")

    def test_rw_gender(self):
        """Round-trip the gender tag."""
        self.utt.gender = "O"
        self.assertEqual(self.utt.gender, "O")

    def test_rw_original_file(self):
        """Round-trip the originating file name."""
        self.utt.original_file = "file.test"
        self.assertEqual(self.utt.original_file, "file.test")

    def test_rw_num_channel(self):
        """Round-trip the channel count."""
        self.utt.num_channel = 2
        self.assertEqual(self.utt.num_channel, 2)

    def test_rw_kaldi_shift(self):
        """Round-trip the Kaldi frame shift."""
        self.utt.kaldi_shift = 10
        self.assertEqual(self.utt.kaldi_shift, 10)

    def test_rw_kaldi_window_size(self):
        """Round-trip the Kaldi window size."""
        self.utt.kaldi_window_size = 25
        self.assertEqual(self.utt.kaldi_window_size, 25)

    def test_rw_kaldi_window_type(self):
        """Round-trip the Kaldi window type."""
        self.utt.kaldi_window_type = "hamming"
        self.assertEqual(self.utt.kaldi_window_type, "hamming")

    def test_rw_vocoder(self):
        """Round-trip the vocoder name."""
        self.utt.vocoder = "WORLD"
        self.assertEqual(self.utt.vocoder, "WORLD")

    def test_rw_spec(self):
        """Round-trip a spectrogram and check the inferred dimension.

        Values are stored in single precision, so only near-equality is
        required.
        """
        spec = np.random.uniform(0, 1, (100, 513))
        self.utt.spec = spec
        diff = self.utt.spec - spec
        self.assertAlmostEqual((diff ** 2).sum(), 0)
        self.assertEqual(self.utt.spec_dim, 513)

    def test_rw_mfcc(self):
        """Round-trip an MFCC matrix and check the inferred dimension
        (single-precision storage, so only near-equality is required)."""
        mfcc = np.random.uniform(0, 1, (100, 25))
        self.utt.mfcc = mfcc
        diff = self.utt.mfcc - mfcc
        self.assertAlmostEqual((diff ** 2).sum(), 0)
        self.assertEqual(self.utt.mfcc_dim, 25)

    def test_rw_mcep(self):
        """Round-trip a mel-cepstrum matrix and check the inferred dimension
        (single-precision storage, so only near-equality is required)."""
        mcep = np.random.uniform(0, 1, (100, 60))
        self.utt.mcep = mcep
        diff = self.utt.mcep - mcep
        self.assertAlmostEqual((diff ** 2).sum(), 0)
        self.assertEqual(self.utt.mcep_dim, 60)

    def test_rw_f0(self):
        """Round-trip an f0 contour and check num_frame tracks its length
        (single-precision storage, so only near-equality is required)."""
        f0 = np.array([1, 2, 3, 4, 5])
        self.utt.f0 = f0
        diff = self.utt.f0 - f0
        self.assertAlmostEqual((diff ** 2).sum(), 0)
        self.assertEqual(self.utt.num_frame, 5)

    def test_rw_ap(self):
        """Round-trip an aperiodicity matrix and check the inferred dimension
        (single-precision storage, so only near-equality is required)."""
        ap = np.random.uniform(0, 1, (100, 513))
        self.utt.ap = ap
        diff = self.utt.ap - ap
        self.assertAlmostEqual((diff ** 2).sum(), 0)
        self.assertEqual(self.utt.ap_dim, 513)

    def test_rw_bap(self):
        """Round-trip band aperiodicity for 1-D and 2-D inputs and check
        the inferred dimension in each case (single-precision storage,
        so only near-equality is required)."""
        cases = (
            (np.array([1, 2, 3, 4, 5]), 1),
            (np.random.uniform(0, 1, (100, 5)), 5),
        )
        for bap, expected_dim in cases:
            self.utt.bap = bap
            diff = self.utt.bap - bap
            self.assertAlmostEqual((diff ** 2).sum(), 0)
            self.assertEqual(self.utt.bap_dim, expected_dim)

    def test_rw_vuv(self):
        """Round-trip a voiced/unvoiced sequence (single-precision storage,
        so only near-equality is required)."""
        vuv = np.array([1, 2, 3, 4, 5])
        self.utt.vuv = vuv
        diff = self.utt.vuv - vuv
        self.assertAlmostEqual((diff ** 2).sum(), 0)

    def test_rw_temporal_position(self):
        """Round-trip a temporal-position sequence (single-precision storage,
        so only near-equality is required)."""
        tp = np.array([1, 2, 3, 4, 5])
        self.utt.temporal_position = tp
        diff = self.utt.temporal_position - tp
        self.assertAlmostEqual((diff ** 2).sum(), 0)

    def test_rw_vocoder_shift(self):
        """Round-trip the vocoder frame shift."""
        self.utt.vocoder_shift = 10
        self.assertEqual(self.utt.vocoder_shift, 10)

    def test_rw_vocoder_window_size(self):
        """Round-trip the vocoder window size."""
        self.utt.vocoder_window_size = 25
        self.assertEqual(self.utt.vocoder_window_size, 25)

    def test_rw_vocoder_window_type(self):
        """Round-trip the vocoder window type."""
        self.utt.vocoder_window_type = "hamming"
        self.assertEqual(self.utt.vocoder_window_type, "hamming")

    def test_rw_num_frame(self):
        """Round-trip the frame count."""
        self.utt.num_frame = 10
        self.assertEqual(self.utt.num_frame, 10)

    def test_rw_alpha(self):
        """Round-trip the warping coefficient (float compare)."""
        self.utt.alpha = 0.42
        self.assertAlmostEqual(self.utt.alpha, 0.42)

    def test_rw_fft_size(self):
        """Round-trip the FFT size."""
        self.utt.fft_size = 1024
        self.assertEqual(self.utt.fft_size, 1024)

    def test_rw_spec_dim(self):
        """Round-trip the spectrogram dimension."""
        self.utt.spec_dim = 513
        self.assertEqual(self.utt.spec_dim, 513)

    def test_rw_mfcc_dim(self):
        """Round-trip the MFCC dimension."""
        self.utt.mfcc_dim = 60
        self.assertEqual(self.utt.mfcc_dim, 60)

    def test_rw_mcep_dim(self):
        """Round-trip the mel-cepstrum dimension."""
        self.utt.mcep_dim = 60
        self.assertEqual(self.utt.mcep_dim, 60)

    def test_rw_f0_floor(self):
        """Round-trip the f0 floor."""
        self.utt.f0_floor = 40
        self.assertEqual(self.utt.f0_floor, 40)

    def test_rw_f0_ceil(self):
        """Round-trip the f0 ceiling."""
        self.utt.f0_ceil = 800
        self.assertEqual(self.utt.f0_ceil, 800)

    def test_rw_timestamp(self):
        """Round-trip the timestamp string."""
        self.utt.timestamp = '1/25/2019'
        self.assertEqual(self.utt.timestamp, '1/25/2019')

    def test_rw_ap_dim(self):
        """Round-trip the aperiodicity dimension."""
        self.utt.ap_dim = 5
        self.assertEqual(self.utt.ap_dim, 5)

    def test_rw_bap_dim(self):
        """Round-trip the band-aperiodicity dimension."""
        self.utt.bap_dim = 1
        self.assertEqual(self.utt.bap_dim, 1)

    def test_rw_pitch_tracker(self):
        """Round-trip the pitch-tracker name."""
        self.utt.pitch_tracker = 'dio'
        self.assertEqual(self.utt.pitch_tracker, 'dio')

    def test_time_to_frame_valid_input(self):
        """Non-negative times convert to frame indices at shift 5."""
        for time, expected in ((3.66, 732), (0, 0)):
            self.assertEqual(utterance.time_to_frame(time, 5), expected)

    def test_time_to_frame_invalid_input(self):
        """Negative times must raise ValueError."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            utterance.time_to_frame(-10, 5)

    def test_time_to_frame_interval_tier_simple_case(self):
        """The converted tier spans from frame 0 to the converted maxTime."""
        shift = 5
        frame_tier = utterance.time_to_frame_interval_tier(self.tier, shift)
        expected_max = utterance.time_to_frame(self.tier.maxTime, shift)
        self.assertEqual(frame_tier.minTime, 0)
        self.assertEqual(frame_tier.maxTime, expected_max)

    def test_time_to_frame_interval_tier_short_seg(self):
        """Sub-frame segments still map to distinct whole-frame intervals."""
        tier = IntervalTier('test', 0, 0.01)
        tier.add(0, 0.003, "a")
        tier.add(0.003, 0.01, "b")
        frame_tier = utterance.time_to_frame_interval_tier(tier, 5)
        self.assertEqual(frame_tier.minTime, 0)
        self.assertEqual(frame_tier.maxTime, 2)
        expected_bounds = ((0, 1), (1, 2))
        for interval, (lo, hi) in zip(frame_tier.intervals, expected_bounds):
            self.assertEqual(interval.minTime, lo)
            self.assertEqual(interval.maxTime, hi)

    def test_is_sil(self):
        """Silence marks (any case) and the empty string are silence;
        ordinary phone labels are not."""
        for mark in ("sil", "sp", "spn", "", "SIL", "SPN", "SP"):
            self.assertTrue(utterance.is_sil(mark))
        for mark in ("AO", "s"):
            self.assertFalse(utterance.is_sil(mark))

    def test_normalize_phone(self):
        """Phones are lower-cased, stress digits stripped, silence marks and
        the empty string mapped to 'sil'; invalid labels raise."""
        self.assertEqual("ao", utterance.normalize_phone("AO0"))
        self.assertEqual("ao", utterance.normalize_phone("AO"))
        self.assertEqual("ao", utterance.normalize_phone("AO0, AH, s"))
        self.assertEqual("ao,ah,s",
                         utterance.normalize_phone("AO0, AH, s", False))
        self.assertEqual("sil", utterance.normalize_phone("SIL"))
        self.assertEqual("sil", utterance.normalize_phone(""))
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            utterance.normalize_phone("1243")

    def test_normalize_tier_mark(self):
        """Known normalization modes succeed; an unknown mode raises."""
        utterance.normalize_tier_mark(self.tier)
        utterance.normalize_tier_mark(self.tier, "NormalizePhoneAnnotation")
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            utterance.normalize_tier_mark(self.tier, "NormalizePhone")

    def test_read_sym_table(self):
        """The phoneme table fixture defines 40 symbols."""
        table = utterance.read_sym_table("data/phoneme_table")
        self.assertEqual(len(table), 40)

    def test_get_hardcoded_sym_table(self):
        """The hard-coded symbol table matches the one read from disk."""
        hardcoded = utterance.get_hardcoded_sym_table()
        expected = utterance.read_sym_table("data/phoneme_table")
        self.assertSetEqual(set(hardcoded.items()), set(expected.items()))

    def test_get_phone_tier_invalid_0(self):
        """get_phone_tier on an unconfigured utterance must raise."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            self.utt.get_phone_tier()

    def test_get_phone_tier_invalid_1(self):
        """get_phone_tier must still raise when only kaldi_shift is set."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        self.utt.kaldi_shift = 5
        with self.assertRaises(ValueError):
            self.utt.get_phone_tier()

    def test_get_word_tier_invalid_0(self):
        """get_word_tier on an unconfigured utterance must raise."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            self.utt.get_word_tier()

    def test_get_word_tier_invalid_1(self):
        """get_word_tier must still raise when only kaldi_shift is set."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        self.utt.kaldi_shift = 5
        with self.assertRaises(ValueError):
            self.utt.get_word_tier()

    def test_get_monophone_ppg_valid(self):
        """A mono-channel wav yields a 40-dimensional monophone PPG."""
        fs, wav = wavfile.read('data/test_mono_channel.wav')
        utt = utterance.Utterance(wav, fs)
        utt.kaldi_shift = 5
        self.assertEqual(utt.get_monophone_ppg().shape[1], 40)

    def test_get_monophone_ppg_invalid_0(self):
        """get_monophone_ppg on an unconfigured utterance must raise."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        with self.assertRaises(ValueError):
            self.utt.get_monophone_ppg()

    def test_get_monophone_ppg_invalid_1(self):
        """get_monophone_ppg must still raise when only kaldi_shift is set."""
        # Bug fix: the original try/except-pass passed silently even when no
        # exception was raised; assertRaises enforces that the error occurs.
        self.utt.kaldi_shift = 5
        with self.assertRaises(ValueError):
            self.utt.get_monophone_ppg()
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus):
    """Write word- and phone-level CTM alignments out as Praat TextGrids.

    word_ctm / phone_ctm: dicts keyed by utterance id, each value a list
    of (begin, end, label) intervals.
    out_directory: destination root for the .TextGrid files.
    corpus: supplies wav durations, the segments table, speaker mappings
    and the speaker-directory flag.
    """
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        # No segment table: each utterance is a whole wav file, so emit one
        # TextGrid per utterance with a 'words' and a 'phones' tier.
        for i,(k,v) in enumerate(word_ctm.items()):
            maxtime = corpus.get_wav_duration(k)
            try:
                tg = TextGrid(maxTime = maxtime)
                wordtier = IntervalTier(name = 'words', maxTime = maxtime)
                phonetier = IntervalTier(name = 'phones', maxTime = maxtime)
                for interval in v:
                    wordtier.add(*interval)
                for interval in phone_ctm[k]:
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except ValueError as e:
                # IntervalTier.add raises ValueError on overlapping or
                # out-of-range intervals; report and continue with the rest.
                print('Could not write textgrid for {}'.format(k))
                print(e)
    else:
        # Segmented corpus: utterances are slices of a recording, so group
        # tiers per recording and give each speaker its own word/phone tiers.
        tgs = {}
        for i,(k,v) in enumerate(word_ctm.items()):
            # Segment entries look like 'recording begin end' in one string.
            rec = corpus.segments[k]
            rec, begin, end = rec.split(' ')
            # NOTE(review): duration is looked up with the utterance id k,
            # not the recording id, yet it sets the recording grid's
            # maxTime — confirm get_wav_duration's expected argument.
            maxtime = corpus.get_wav_duration(k)
            if rec not in tgs:
                tgs[rec] = TextGrid(maxTime = maxtime)
            tg = tgs[rec]
            begin = float(begin)
            speaker = corpus.utt_speak_mapping[k]
            word_tier_name = '{} - words'.format(speaker)
            phone_tier_name = '{} - phones'.format(speaker)
            # Reuse the speaker's tiers if an earlier utterance created them.
            wordtier = tg.getFirst(word_tier_name)
            if wordtier is None:
                wordtier = IntervalTier(name = word_tier_name, maxTime = maxtime)
                tg.append(wordtier)
            phonetier = tg.getFirst(phone_tier_name)
            if phonetier is None:
                phonetier = IntervalTier(name = phone_tier_name, maxTime = maxtime)
                tg.append(phonetier)
            # CTM times are utterance-relative; shift them by the segment's
            # start so they land at the right spot in the recording.
            for interval in v:
                interval = interval[0] + begin, interval[1] + begin, interval[2]
                wordtier.add(*interval)
            for interval in phone_ctm[k]:
                interval = interval[0] + begin, interval[1] + begin, interval[2]
                phonetier.add(*interval)
        # One TextGrid per recording, written after all utterances merged.
        for k,v in tgs.items():
            outpath = os.path.join(out_directory, k + '.TextGrid')
            try:
                v.write(outpath)
            except ValueError as e:
                print('Could not write textgrid for {}'.format(k))
                print(e)
# ===== 示例#22 (Example #22) — scrape-artifact separator between snippets =====
# (the stray '0' below it was a vote count on the source page)
        label = ss[4]
        result.append([begin, end, label])
    return result


if __name__ == '__main__':
    # CLI entry point: build a two-tier (words/phones) TextGrid from a pair
    # of CTM files produced by forced alignment.
    parser = argparse.ArgumentParser()
    parser.add_argument('words_ctm')
    parser.add_argument('phones_ctm')
    parser.add_argument('output_textgrid')

    args = parser.parse_args()

    words = read_ctm(args.words_ctm)
    phones = read_ctm(args.phones_ctm)

    # The end time of the last phone bounds the whole grid; assumes the
    # CTM entries are in chronological order.
    max_time = phones[-1][1]
    tg = TextGrid(maxTime=max_time)
    word_tier = IntervalTier(name="words", maxTime=max_time)
    phone_tier = IntervalTier(name="phones", maxTime=max_time)

    # Each CTM row is unpacked as (begin, end, label) for IntervalTier.add.
    for w in words:
        word_tier.add(*w)
    for p in phones:
        phone_tier.add(*p)

    tg.append(word_tier)
    tg.append(phone_tier)

    tg.write(args.output_textgrid)
# ===== 示例#23 (Example #23) — scrape-artifact separator between snippets =====
# (the stray '0' below it was a vote count on the source page)
def ctm_to_textgrid(word_ctm,
                    phone_ctm,
                    out_directory,
                    corpus,
                    dictionary,
                    frameshift=0.01):
    """Write word/phone CTM alignments out as TextGrids, collecting any
    per-file exceptions into out_directory/output_errors.txt instead of
    aborting.

    word_ctm / phone_ctm: dicts keyed by file id mapping speaker ->
    list of [begin, end, label] intervals.
    dictionary: supplies the optional/non-optional silence phone labels.
    frameshift: frame shift in seconds, used to absorb rounding error at
    the end of a file.
    """
    textgrid_write_errors = {}
    # Decimal(str(...)) keeps the exact decimal value of the float literal.
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        # Unsegmented corpus: one file per utterance, single speaker each.
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            # Only one speaker per utterance here; take the sole entry.
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[
                            1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(
                        out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except Exception as e:
                # Record the full traceback per file; writing continues for
                # the remaining files.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[k] = '\n'.join(
                    traceback.format_exception(exc_type, exc_value,
                                               exc_traceback))
    else:
        # Segmented corpus: one TextGrid per recording, a word and a phone
        # tier per speaker.
        silences = {
            dictionary.optional_silence, dictionary.nonoptional_silence
        }
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            maxtime = corpus.get_wav_duration(filename)
            try:
                tg = TextGrid(maxTime=maxtime)
                for speaker, words in speaker_dict.items():
                    word_tier_name = '{} - words'.format(speaker)
                    phone_tier_name = '{} - phones'.format(speaker)
                    word_tier = IntervalTier(name=word_tier_name,
                                             maxTime=maxtime)
                    phone_tier = IntervalTier(name=phone_tier_name,
                                              maxTime=maxtime)
                    for w in words:
                        word_tier.add(*w)
                    for p in phone_ctm[filename][speaker]:
                        # Merge a silence phone into a preceding silence
                        # interval by extending its end time.
                        if len(phone_tier) > 0 and phone_tier[
                                -1].mark in silences and p[2] in silences:
                            phone_tier[-1].maxTime = p[1]
                        else:
                            # Clip overlaps with the previous interval:
                            # shift a silence's start forward, or pull a
                            # preceding silence's end back for a phone.
                            if len(phone_tier) > 0 and p[2] in silences and p[
                                    0] < phone_tier[-1].maxTime:
                                p = phone_tier[-1].maxTime, p[1], p[2]
                            elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                            phone_tier[-1].mark in silences:
                                phone_tier[-1].maxTime = p[0]
                            phone_tier.add(*p)
                    tg.append(word_tier)
                    tg.append(phone_tier)
                tg.write(os.path.join(out_directory, filename + '.TextGrid'))
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[filename] = '\n'.join(
                    traceback.format_exception(exc_type, exc_value,
                                               exc_traceback))
    if textgrid_write_errors:
        # Dump every collected traceback to a single error log.
        # NOTE(review): 'ouput' typo in the user-facing message below;
        # left unchanged since it is a runtime string.
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            f.write(
                'The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
# ===== 示例#24 (Example #24) — scrape-artifact separator between snippets =====
# (the stray '0' below it was a vote count on the source page)
        if interval.mark == '':
            continue
        print(interval.mark, interval.minTime, interval.maxTime)
        outpath = os.path.join(temp_wav_dir, interval.mark + '.wav')
        extract_audio(wav_path, outpath, interval.minTime, interval.maxTime, padding = padding)
        rep = Mfcc(outpath, freq_lims = (80, 7800), num_coeffs = 12, win_len = 0.025, time_step = 0.01)
        rep.is_windowed = True
        duration = interval.maxTime - interval.minTime
        thresh = unnorm(norm(duration, min_duration, max_duration), min_thresh, max_thresh)
        rep.segment(threshold = thresh)
        print(sorted(rep._segments.keys()))
        padded_begin = interval.minTime - padding
        if padded_begin < 0:
            padded_begin = 0
        for k in sorted(rep._segments.keys()):
            with open(os.path.join(temp_mfcc_dir, '{}.mfcc'.format(seg_ind)), 'wb') as fh:
                pickle.dump(rep[k[0],k[1]], fh)
            with open(os.path.join(temp_mean_dir, '{}.mean'.format(seg_ind)), 'wb') as fh:
                pickle.dump(rep._segments[k], fh)
            segs.append(str(seg_ind))
            seg_ind += 1
            begin = round(k[0] + padded_begin, 3)

            end = round(k[1] + padded_begin,3)
            print(begin, end)
            segmentation_tier.add(begin, end, '{}'.format(seg_ind))
    with open(os.path.join(temp_align_dir, '{}.seg'.format(f)), 'w') as fa:
        fa.write(' '.join(segs))
    tg.append(segmentation_tier)
    tg.write(textgrid_path.replace(data_dir, temp_textgrid_dir))
                    wordintervals.append(x)

            elif i == 1:
                for x in ti:
                    x.maxTime += cur_dur
                    x.minTime += cur_dur
                    phoneintervals.append(x)
                cur_dur += maxtime

    words = IntervalTier(name='words')
    for i in wordintervals:
        words.addInterval(i)
    phones = IntervalTier(name='phones')
    for i in phoneintervals:
        phones.addInterval(i)
    tg1 = TextGrid(maxTime=cur_dur)
    tg1.append(words)
    tg1.append(phones)
    tg1.write(chapteroutpath1, null='')

    speaker_tier = IntervalTier(name=speaker)
    for i in range(len(groupedwavfiles)):
        if i == 1:
            speaker_tier.add(0.0, wavfiletimes[0], groupedlabtext[0])
        else:
            speaker_tier.add(wavfiletimes[i - 2], wavfiletimes[i - 1],
                             groupedlabtext[i - 1])
    tg2 = TextGrid(maxTime=duration)
    tg2.append(speaker_tier)
    tg2.write(chapteroutpath2, null='')
# ===== 示例#26 (Example #26) — scrape-artifact separator between snippets =====
# (the stray '0' below it was a vote count on the source page)
                   num_coeffs=12,
                   win_len=0.025,
                   time_step=0.01)
        rep.is_windowed = True
        duration = interval.maxTime - interval.minTime
        thresh = unnorm(norm(duration, min_duration, max_duration), min_thresh,
                        max_thresh)
        rep.segment(threshold=thresh)
        print(sorted(rep._segments.keys()))
        padded_begin = interval.minTime - padding
        if padded_begin < 0:
            padded_begin = 0
        for k in sorted(rep._segments.keys()):
            with open(os.path.join(temp_mfcc_dir, '{}.mfcc'.format(seg_ind)),
                      'wb') as fh:
                pickle.dump(rep[k[0], k[1]], fh)
            with open(os.path.join(temp_mean_dir, '{}.mean'.format(seg_ind)),
                      'wb') as fh:
                pickle.dump(rep._segments[k], fh)
            segs.append(str(seg_ind))
            seg_ind += 1
            begin = round(k[0] + padded_begin, 3)

            end = round(k[1] + padded_begin, 3)
            print(begin, end)
            segmentation_tier.add(begin, end, '{}'.format(seg_ind))
    with open(os.path.join(temp_align_dir, '{}.seg'.format(f)), 'w') as fa:
        fa.write(' '.join(segs))
    tg.append(segmentation_tier)
    tg.write(textgrid_path.replace(data_dir, temp_textgrid_dir))
			elif i == 1:
				for x in ti:
					x.maxTime += cur_dur
					x.minTime += cur_dur
					phoneintervals.append(x)
				cur_dur += maxtime

	words = IntervalTier(name='words')
	for i in wordintervals:
		words.addInterval(i)
	phones = IntervalTier(name='phones')
	for i in phoneintervals:
		phones.addInterval(i)
	tg1 = TextGrid(maxTime = cur_dur)
	tg1.append(words)
	tg1.append(phones)
	tg1.write(chapteroutpath1, null = '')

	speaker_tier = IntervalTier(name=speaker)
	for i in range(len(groupedwavfiles)):
		if i == 1:
			speaker_tier.add(0.0, wavfiletimes[0], groupedlabtext[0])
		else:
			speaker_tier.add(wavfiletimes[i-2], wavfiletimes[i-1], groupedlabtext[i-1])
	tg2 = TextGrid(maxTime = duration)
	tg2.append(speaker_tier)
	tg2.write(chapteroutpath2, null = '')



# ===== 示例#28 (Example #28) — scrape-artifact separator between snippets =====
# (the stray '0' below it was a vote count on the source page)
def ctm_to_textgrid(word_ctm,
                    phone_ctm,
                    out_directory,
                    corpus,
                    dictionary,
                    frameshift=0.01):
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[
                            1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(
                        out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except ValueError as e:
                print(
                    'There was an error writing the TextGrid for {}, please see below:'
                    .format(k))
                raise
    else:
        silences = {
            dictionary.optional_silence, dictionary.nonoptional_silence
        }
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            maxtime = corpus.get_wav_duration(filename)
            tg = TextGrid(maxTime=maxtime)
            for speaker, words in speaker_dict.items():
                word_tier_name = '{} - words'.format(speaker)
                phone_tier_name = '{} - phones'.format(speaker)
                word_tier = IntervalTier(name=word_tier_name, maxTime=maxtime)
                phone_tier = IntervalTier(name=phone_tier_name,
                                          maxTime=maxtime)
                for w in words:
                    word_tier.add(*w)
                for p in phone_ctm[filename][speaker]:
                    if len(phone_tier) > 0 and phone_tier[
                            -1].mark in silences and p[2] in silences:
                        phone_tier[-1].maxTime = p[1]
                    else:
                        if len(phone_tier) > 0 and p[2] in silences and p[
                                0] < phone_tier[-1].maxTime:
                            p = phone_tier[-1].maxTime, p[1], p[2]
                        elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                        phone_tier[-1].mark in silences:
                            phone_tier[-1].maxTime = p[0]
                        phone_tier.add(*p)
                tg.append(word_tier)
                tg.append(phone_tier)
            tg.write(os.path.join(out_directory, filename + '.TextGrid'))