def reorg_noncollapsed(f):
    """Re-segment one non-collapsed TextGrid and write it to data_dir.

    Normalizes each interval mark with the module-level ``sub_pattern``
    regex, drops intervals whose mark becomes empty, pads each kept
    interval by 0.1 s on both sides (clamped to [0, maxTime]), and when
    padding makes an interval overlap its predecessor, merges the two by
    extending the predecessor and concatenating the marks.

    Args:
        f: File name of a TextGrid inside the module-level
           ``noncollapsed_dir`` directory.
    """
    padding = 0.1  # seconds added before and after each kept interval
    print(f)
    tg_path = os.path.join(noncollapsed_dir, f)
    tg = TextGrid()
    tg.read(tg_path)
    new_tg = TextGrid(maxTime=tg.maxTime)
    # Mirror the input path into data_dir for the output location.
    new_tg_path = tg_path.replace(noncollapsed_dir, data_dir)
    for tier in tg.tiers:
        new_tier = IntervalTier(name=tier.name, maxTime=tg.maxTime)
        for i in tier:
            # Normalize the mark; skip intervals that become empty.
            new_mark = sub_pattern.sub(' ', i.mark).strip()
            if not new_mark:
                continue
            new_begin = i.minTime - padding
            if new_begin < 0:
                new_begin = 0
            new_end = i.maxTime + padding
            if new_end > tg.maxTime:
                new_end = tg.maxTime
            try:
                new_tier.add(new_begin, new_end, new_mark)
            except ValueError:
                # Padded interval overlaps the previous one: extend the
                # previous interval and concatenate the marks instead.
                new_tier[-1].maxTime = new_end
                new_tier[-1].mark += ' ' + new_mark
        print(len(new_tier))
        new_tg.append(new_tier)
    new_tg.write(new_tg_path)
def export_segments(self, output_directory):
    """Write the corpus VAD segments as one TextGrid per sound file.

    Every file receives a single 'segments' tier whose intervals are all
    marked 'speech'; interval end times are clamped to the audio length.

    Args:
        output_directory: Root directory the TextGrids are written into.
    """
    from decimal import Decimal
    from textgrid import TextGrid, IntervalTier
    # Group the VAD segments by their source file name.
    by_file = {}
    for utt, segment in self.corpus.vad_segments.items():
        filename, seg_begin, seg_end = segment
        entry = [Decimal(seg_begin), Decimal(seg_end), 'speech']
        by_file.setdefault(filename, {}).setdefault('segments', []).append(entry)
    for filename, tier_dict in by_file.items():
        # Fall back to the flat output directory when the file has no
        # recorded relative subdirectory.
        try:
            out_dir = os.path.join(
                output_directory, self.corpus.file_directory_mapping[filename])
        except KeyError:
            out_dir = output_directory
        os.makedirs(out_dir, exist_ok=True)
        duration = self.corpus.get_wav_duration(filename)
        tg = TextGrid(maxTime=duration)
        for tier_name in sorted(tier_dict.keys()):
            tier = IntervalTier(name=tier_name, maxTime=duration)
            for entry in tier_dict[tier_name]:
                if entry[1] > duration:
                    entry[1] = duration  # clamp to the audio length
                tier.add(*entry)
            tg.append(tier)
        tg.write(os.path.join(out_dir, filename + '.TextGrid'))
def convert_ctm_to_textgrid(ctm, textgrid):
    """Convert a CTM alignment file into a words/phonemes TextGrid.

    Lines whose first token starts with '@' are phoneme entries (any
    suffix matched by the module-level ``besi`` regex is stripped);
    every other line is a word entry. Intervals that cannot be added
    (overlaps) are reported and skipped.

    Args:
        ctm: Path of the input CTM file.
        textgrid: Path the resulting TextGrid is written to.
    """
    word_segs = []
    phone_segs = []
    with open(ctm, encoding='utf-8') as ctm_file:
        for line in ctm_file:
            fields = line.strip().split()
            label = fields[4]
            start = float(fields[2])
            length = float(fields[3])
            if fields[0][0] != '@':
                word_segs.append((label, start, length))
            else:
                if besi.match(label):
                    label = label[:-2]  # drop the two-character suffix
                phone_segs.append((label, start, length))
    word_tier = IntervalTier(name='words')
    phone_tier = IntervalTier(name='phonemes')
    for label, start, length in word_segs:
        try:
            word_tier.add(round(start, 2), round(start + length, 2), label)
        except ValueError:
            print("Error in word seg: " + label)
    for label, start, length in phone_segs:
        try:
            phone_tier.add(round(start, 2), round(start + length, 2), label)
        except ValueError:
            print("Error in phoneme seg: " + label)
    tg = TextGrid()
    tg.append(word_tier)
    tg.append(phone_tier)
    tg.write(textgrid)
def ctm_to_textgrid(phone_ctm, out_directory, utt2dur, frameshift=0.01):
    """Write one phones-only TextGrid per utterance from a phone CTM.

    Args:
        phone_ctm: Mapping of utterance id -> list of mutable
            [begin, end, label] intervals.
        out_directory: Directory the .TextGrid files are written into
            (created if missing).
        utt2dur: Input understood by ``generate_utt2dur`` giving each
            utterance's duration.
        frameshift: Frame shift in seconds; end times within one
            frameshift of the file end are snapped to it.

    Per-utterance exceptions are collected and dumped to
    output_errors.txt instead of aborting the whole run.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    os.makedirs(out_directory, exist_ok=True)
    utt2dur_mapping = generate_utt2dur(utt2dur)
    for k, v in sorted(phone_ctm.items()):
        maxtime = Decimal(str(utt2dur_mapping[k]))
        try:
            tg = TextGrid(maxTime=maxtime)
            phonetier = IntervalTier(name='phones', maxTime=maxtime)
            for interval in v:
                # Snap the final interval to the file end to absorb
                # sub-frameshift rounding error.
                if maxtime - interval[1] < frameshift:
                    interval[1] = maxtime
                # Remove B/E/I position suffix and stress (0,1) digits
                # from the phoneme label. Raw string avoids the invalid
                # "\d" escape warning.
                interval[2] = re.sub(r"\d+", "", interval[2].split('_')[0])
                phonetier.add(*interval)
            tg.append(phonetier)
            outpath = os.path.join(out_directory, k + '.TextGrid')
            tg.write(outpath)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            textgrid_write_errors[k] = '\n'.join(
                traceback.format_exception(exc_type, exc_value, exc_traceback))
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with io_open(error_log, 'w', encoding='utf-8') as f:
            f.write(
                u'The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write(u'{}:\n'.format(k))
                f.write(u'{}\n\n'.format(v))
def loadOrGenerate(self):
    """Load this file's TextGrid from disk, or build a fresh one.

    When no .TextGrid exists yet, a new grid spanning the audio
    duration is created (falling back to 1 s when the audio cannot
    report one), registered at the file level, and — unless both .ult
    and .txt companions exist — seeded with a placeholder "text" tier.
    Afterwards, an existing non-empty alignment tier is adopted as the
    frames tier; otherwise a frames tier is generated.
    """
    fname = self.app.Data.checkFileLevel('.TextGrid', shoulderror=False)
    if fname:
        self.TextGrid = self.fromFile(fname)
    else:
        minTime = 0.
        # reset() is expected to (re)populate the audio's duration.
        if not hasattr(self.app.Audio, 'duration'):
            self.app.Audio.reset()
        try:
            maxTime = self.app.Audio.duration
        except:
            warn(
                'Audio has no duration attribute after calling reset(), defaulting to 1 second'
            )
            maxTime = 1.
        self.TextGrid = TextGridFile(maxTime=maxTime)
        keys = self.app.Data.getFileLevel('all')
        # Only seed a placeholder sentence tier when the usual
        # companion files are missing.
        if not ('.ult' in keys and '.txt' in keys):
            sentenceTier = IntervalTier("text")
            sentenceTier.add(minTime, maxTime, "text")
            self.TextGrid.append(sentenceTier)
        fname = self.app.Data.unrelativize(
            self.app.Data.getCurrentFilename() + '.TextGrid')
        self.app.Data.setFileLevel('.TextGrid', fname)
    names = self.TextGrid.getNames()
    for i, n in enumerate(names):
        if n in ALIGNMENT_TIER_NAMES:
            if len(self.TextGrid[i]) == 0:
                # Drop an empty alignment tier and fall through to
                # regenerating the frames tier below.
                self.TextGrid.pop(i)
                break
            else:
                self.frameTierName = n
                return
    self.genFramesTier()
def loadOrGenerate(self):
    """Load this file's TextGrid from disk, or build a fresh one.

    Variant without the duration fallback: when no .TextGrid exists, a
    new grid spanning the audio duration is created, seeded with a
    placeholder "text" tier (appended directly to ``tiers``), and
    registered at the file level. An existing non-empty alignment tier
    short-circuits frame-tier generation.
    """
    fname = self.app.Data.checkFileLevel('.TextGrid', shoulderror=False)
    if fname:
        self.TextGrid = self.fromFile(fname)
    else:
        minTime = 0.
        # reset() is expected to (re)populate the audio's duration.
        if not hasattr(self.app.Audio, 'duration'):
            self.app.Audio.reset()
        maxTime = self.app.Audio.duration
        self.TextGrid = TextGridFile(maxTime=maxTime)
        sentenceTier = IntervalTier("text")
        sentenceTier.add(minTime, maxTime, "text")
        # NOTE(review): appends to .tiers directly, bypassing
        # TextGrid.append — presumably intentional; confirm.
        self.TextGrid.tiers.append(sentenceTier)
        fname = self.app.Data.unrelativize(
            self.app.Data.getCurrentFilename() + '.TextGrid')
        self.app.Data.setFileLevel('.TextGrid', fname)
    names = self.TextGrid.getNames()
    for i, n in enumerate(names):
        if n in ALIGNMENT_TIER_NAMES:
            if len(self.TextGrid[i]) == 0:
                # Drop an empty alignment tier and fall through to
                # regenerating the frames tier below.
                self.TextGrid.pop(i)
                break
            else:
                return
    self.genFramesTier()
def parseFile(path, fn, out_dir="/Users/elias/Desktop/TextGrids"):
    """Parse one annotation file and write a words+phones TextGrid.

    Args:
        path: Full path of the annotation file to read.
        fn: Bare file name; its stem names the output TextGrid.
        out_dir: Directory the TextGrid is written to. Defaults to the
            historical hard-coded location for backward compatibility;
            pass an explicit directory to generalize.

    Returns early (None) when the file yields no segments or no usable
    maximum time.
    """
    filename = fn.split(".")[0]  # just the stem of the file name
    with open(path, "r") as f1:
        lines = f1.readlines()
    SAM = getSAM(lines)
    allSegs = getMAU(lines, SAM, filename)
    if allSegs is None:
        return
    # One (start, end, label) tuple per MAU segment.
    segs = [getSegInfo(seg) for seg in allSegs]
    words = getWords(lines, allSegs, filename)
    maxtime = getMaxTime(allSegs)
    if maxtime == -1:
        return
    tg = TextGrid(maxTime=maxtime)
    wordtier = IntervalTier(name='words', maxTime=maxtime)
    phonetier = IntervalTier(name='phones', maxTime=maxtime)
    for interval in words:
        wordtier.add(*interval)
    for interval in segs:
        phonetier.add(*interval)
    tg.append(wordtier)
    tg.append(phonetier)
    outpath = "%s/%s.TextGrid" % (out_dir, filename)
    tg.write(outpath)
def createTextGrid(data, tierName="words"):
    """Build a single-tier TextGrid from (name, time, dur, words) rows.

    Intervals are laid out back to back starting at time 0 using each
    record's duration; each record's own name and start-time fields are
    ignored by this layout.

    Args:
        data: Iterable of (name, time, dur, words) tuples.
        tierName: Name given to the interval tier.

    Returns:
        A TextGrid containing one IntervalTier.
    """
    tier = IntervalTier(tierName)
    txtgrid = TextGrid()
    prevTime = 0
    # Underscore names: these fields are unused (and `time` would
    # shadow the stdlib module name).
    for _name, _time, dur, words in data:
        tier.add(prevTime, prevTime + dur, makeSentence(words))
        prevTime += dur
    txtgrid.append(tier)
    return txtgrid
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, frameshift=0.01):
    """Export multi-speaker word/phone CTM alignments as TextGrids.

    Writes one TextGrid per sound file, with a '<speaker> - words' and
    '<speaker> - phones' tier pair per speaker. Consecutive silence
    phones are merged and small overlaps between adjacent phone
    intervals are repaired. Per-file exceptions are collected and
    written to output_errors.txt instead of aborting the run.

    Args:
        word_ctm: filename -> speaker -> word interval list.
        phone_ctm: filename -> speaker -> phone interval list.
        out_directory: Output root directory (created if missing).
        corpus: Supplies durations, speaker ordering and per-file
            output subdirectories.
        dictionary: Supplies the silence phone labels.
        frameshift: Converted to Decimal but otherwise unused in this
            variant — NOTE(review): confirm whether end-snapping was
            intended here as in the sibling implementations.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    silences = {dictionary.optional_silence, dictionary.nonoptional_silence}
    for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
        maxtime = corpus.get_wav_duration(filename)
        try:
            speaker_directory = os.path.join(
                out_directory, corpus.file_directory_mapping[filename])
            tg = TextGrid(maxTime=maxtime)
            for speaker in corpus.speaker_ordering[filename]:
                words = speaker_dict[speaker]
                word_tier_name = '{} - words'.format(speaker)
                phone_tier_name = '{} - phones'.format(speaker)
                word_tier = IntervalTier(name=word_tier_name, maxTime=maxtime)
                phone_tier = IntervalTier(name=phone_tier_name, maxTime=maxtime)
                for w in words:
                    word_tier.add(*w)
                for p in phone_ctm[filename][speaker]:
                    # Merge a silence phone into a preceding silence.
                    if len(phone_tier) > 0 and phone_tier[
                            -1].mark in silences and p[2] in silences:
                        phone_tier[-1].maxTime = p[1]
                    else:
                        # Repair small overlaps with the previous interval:
                        # push a silence start forward, or pull a silence
                        # end back before a real phone.
                        if len(phone_tier) > 0 and p[2] in silences and p[
                                0] < phone_tier[-1].maxTime:
                            p = phone_tier[-1].maxTime, p[1], p[2]
                        elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                phone_tier[-1].mark in silences:
                            phone_tier[-1].maxTime = p[0]
                        phone_tier.add(*p)
                tg.append(word_tier)
                tg.append(phone_tier)
            tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
        except Exception as e:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            textgrid_write_errors[filename] = '\n'.join(
                traceback.format_exception(exc_type, exc_value, exc_traceback))
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            f.write(
                'The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
def export_classification(self, output_directory):
    """Export speaker classification (or clustering) results.

    When the corpus has segments, writes one TextGrid per file with one
    tier per classified speaker; otherwise writes a plain
    utterances.txt listing per speaker directory.

    Args:
        output_directory: Root directory the results are written into.
    """
    if self.cluster:
        self.cluster_utterances()
    else:
        self.get_classification_stats()
    from decimal import Decimal
    from textgrid import TextGrid, IntervalTier
    spk2utt_path = os.path.join(self.classify_directory, 'spk2utt')
    utt2spk_path = os.path.join(self.classify_directory, 'utt2spk')
    if self.corpus.segments:
        utt2spk = load_scp(utt2spk_path)
        # Group utterances by file, then by classified speaker.
        file_dict = {}
        for utt, segment in self.corpus.segments.items():
            filename, utt_begin, utt_end = segment.split(' ')
            utt_begin = Decimal(utt_begin)
            utt_end = Decimal(utt_end)
            if filename not in file_dict:
                file_dict[filename] = {}
            speaker = utt2spk[utt]
            text = self.corpus.text_mapping[utt]
            if speaker not in file_dict[filename]:
                file_dict[filename][speaker] = []
            file_dict[filename][speaker].append([utt_begin, utt_end, text])
        for filename, speaker_dict in file_dict.items():
            try:
                speaker_directory = os.path.join(
                    output_directory,
                    self.corpus.file_directory_mapping[filename])
            except KeyError:
                speaker_directory = output_directory
            # Bug fix: the target directory was never created, so
            # tg.write failed for files mapped to new subdirectories
            # (the sibling export_segments already does this).
            os.makedirs(speaker_directory, exist_ok=True)
            max_time = self.corpus.get_wav_duration(filename)
            tg = TextGrid(maxTime=max_time)
            for speaker in sorted(speaker_dict.keys()):
                words = speaker_dict[speaker]
                tier = IntervalTier(name=speaker, maxTime=max_time)
                for w in words:
                    if w[1] > max_time:
                        w[1] = max_time  # clamp to the audio length
                    tier.add(*w)
                tg.append(tier)
            tg.write(
                os.path.join(speaker_directory, filename + '.TextGrid'))
    else:
        spk2utt = load_scp(spk2utt_path)
        for speaker, utts in spk2utt.items():
            speaker_dir = os.path.join(output_directory, speaker)
            os.makedirs(speaker_dir, exist_ok=True)
            with open(os.path.join(speaker_dir, 'utterances.txt'),
                      'w',
                      encoding='utf8') as f:
                for u in utts:
                    f.write('{}\n'.format(u))
def test_time_to_frame_interval_tier_short_seg(self):
    """Sub-frame segments are each stretched to occupy one full frame."""
    short_tier = IntervalTier('test', 0, 0.01)
    short_tier.add(0, 0.003, "a")
    short_tier.add(0.003, 0.01, "b")
    converted = utterance.time_to_frame_interval_tier(short_tier, 5)
    self.assertEqual(converted.minTime, 0)
    self.assertEqual(converted.maxTime, 2)
    # Each short segment must land on exactly one frame.
    expected_bounds = [(0, 1), (1, 2)]
    for interval, (lo, hi) in zip(converted.intervals, expected_bounds):
        self.assertEqual(interval.minTime, lo)
        self.assertEqual(interval.maxTime, hi)
def generator_textgrid(maxtime, lines, output):
    """Write a TextGrid with one 'line' tier built from aligned lines.

    Args:
        maxtime: Total duration (seconds) of the TextGrid.
        lines: Iterable of "start end word" strings.
        output: Path the TextGrid is written to.
    """
    # Download Praat: https://www.fon.hum.uva.nl/praat/
    # NOTE(review): margin is added only to each interval's start —
    # presumably to avoid boundary collisions with the previous
    # interval; confirm.
    margin = 0.0001
    tg = TextGrid(maxTime=maxtime)
    linetier = IntervalTier(name="line", maxTime=maxtime)
    # (Removed unused `interval` computation and dead counter `i`.)
    for l in lines:
        s, e, w = l.split()
        linetier.add(minTime=float(s) + margin, maxTime=float(e), mark=w)
    tg.append(linetier)
    print("successfully generator {}".format(output))
    tg.write(output)
def createNew(textgrid, tier_name, VERBOSE=False):
    """Return a new TextGrid holding a cleaned copy of the named tier.

    Pause intervals (per ``isPause``) get an empty mark; every other
    mark is normalized with ``fixString``. The cleaned tier is named
    '<tier_name>_clean'.

    Args:
        textgrid: Source TextGrid.
        tier_name: Name of the tier to clean (first match is used).
        VERBOSE: When true, print the tier before and after cleaning.

    Returns:
        A TextGrid containing only the cleaned tier.
    """
    tiers = textgrid.getList(tier_name)
    tier = tiers[0]
    new_tier = IntervalTier(tier_name + '_clean')
    new_txtgrid = TextGrid()
    if VERBOSE:  # idiomatic truth test instead of `== True`
        print("Old tier: %s" % tier)
    for interval in tier:
        if isPause(interval.mark):
            new_tier.add(interval.minTime, interval.maxTime, '')
        else:
            new_tier.add(interval.minTime, interval.maxTime,
                         fixString(interval.mark))
    new_txtgrid.append(new_tier)
    if VERBOSE:
        print("New tier: %s" % new_tier)
    return new_txtgrid
def convert_ctm_to_textgrid(ctms, textgrid):
    """Merge several CTM files into a single multi-tier TextGrid.

    Each CTM becomes one IntervalTier named after the file's stem.

    Bug fix: previously the TextGrid was recreated and rewritten inside
    the loop, so each iteration overwrote the output file and only the
    last CTM's tier survived; the grid is now built once and written
    once containing every tier.

    Args:
        ctms: Iterable of CTM paths (objects with a ``.stem``,
            e.g. pathlib.Path).
        textgrid: Output TextGrid path.
    """
    tg = TextGrid()
    for ctm in ctms:
        tiername = ctm.stem
        entries = []
        with open(ctm, encoding='utf-8') as f:
            for l in f:
                tok = l.strip().split()
                word = tok[4]
                beg = float(tok[2])
                dur = float(tok[3])
                entries.append((word, beg, dur))
        t = IntervalTier(name=tiername)
        for seg in entries:
            try:
                t.add(round(seg[1], 2), round(seg[1] + seg[2], 2), seg[0])
            except ValueError:
                print("Error in seg: " + seg[0])
        tg.append(t)
    tg.write(textgrid)
def read_segment(val: Segment) -> IntervalTier:
    """Convert a Segment message into an IntervalTier object.

    Args:
        val: A Segment message as defined in data_utterance.pb.

    Returns:
        An IntervalTier spanning the first start time through the last
        end time, with one interval per symbol.

    Raises:
        ValueError: If the symbol/start/end lengths disagree with the
            message's declared item count.
    """
    labels = val.symbol
    starts = mat_to_numpy(val.start_time)
    ends = mat_to_numpy(val.end_time)
    expected = val.num_item
    lengths_agree = len(labels) == len(starts) == len(ends) == expected
    if not lengths_agree:
        raise ValueError("Interval item number is not consistent!")
    tier = IntervalTier(minTime=starts[0], maxTime=ends[-1])
    for label, begin, end in zip(labels, starts, ends):
        tier.add(begin, end, label)
    return tier
def time_to_frame_interval_tier(time_tier: IntervalTier, shift) -> IntervalTier:
    """Convert an IntervalTier in time to frame.

    Args:
        time_tier: IntervalTier represented in seconds.
        shift: Window shift in ms.

    Returns:
        frame_tier: IntervalTier represented in frames.

    Raises:
        ValueError: If extending a too-short segment would push past the
            tier's final frame.
    """
    max_frame = time_to_frame(time_tier.maxTime, shift)
    frame_tier = IntervalTier(time_tier.name, 0, max_frame)
    # Deal with (occasionally) very short segments -- less than a frame shift
    # If we have consecutive very small segments then the function will raise a
    # ValueError
    start_shift = 0  # frames borrowed from the next segment's start
    for each_interval in time_tier.intervals:
        curr_min_frame = time_to_frame(each_interval.minTime, shift)
        if start_shift > 0:
            # The previous segment was extended into this one; trim the
            # borrowed frames off this segment's start.
            logging.warning("Last segment is too short, have to cut the %d "
                            "frame(s) from the beginning of the current "
                            "segment.", start_shift)
            curr_min_frame += start_shift
            start_shift = 0
        curr_max_frame = time_to_frame(each_interval.maxTime, shift)
        if curr_min_frame >= curr_max_frame:
            # Segment rounds to zero (or negative) length: force it to
            # one frame and record the borrow for the next iteration.
            curr_max_frame = curr_min_frame + 1
            start_shift = curr_max_frame - curr_min_frame
            logging.warning("The current segment is too short, extend it for "
                            "%d frame(s).", start_shift)
        if curr_max_frame > frame_tier.maxTime:
            raise ValueError("Extreme short segments in the tier, please fix "
                             "these.")
        frame_tier.add(curr_min_frame, curr_max_frame, each_interval.mark)
    return frame_tier
def genFramesTier(self):
    """Build the 'frames' PointTier from frame times and persist it.

    Adds one labeled point per imaging frame, grows the grid's maxTime
    if needed, optionally prepends a 'sentence' tier read from the
    companion .txt file, then writes the TextGrid to disk and reloads
    it so in-memory state matches the file.
    """
    debug('generating frames tier for %s' %
          self.app.Data.getCurrentFilename())
    self.frameTierName = 'frames'
    times = self.app.Dicom.getFrameTimes()
    self.app.Data.setFileLevel("NumberOfFrames", len(times))
    # Audio may lack a duration; fall back to the last frame time.
    try:
        maxTime = max(self.app.Audio.duration, times[-1])
    except AttributeError:
        maxTime = times[-1]
    tier = PointTier('frames', maxTime=maxTime)
    # One point per frame, labeled with the frame index.
    for f, t in enumerate(times):
        tier.addPoint(Point(t, str(f)))
    if not self.TextGrid.maxTime or maxTime > self.TextGrid.maxTime:
        self.TextGrid.maxTime = maxTime
    self.TextGrid.append(tier)
    keys = self.app.Data.getFileLevel('all')
    if '.ult' in keys and '.txt' in keys:
        fname = self.app.Data.unrelativize(
            self.app.Data.getFileLevel('.txt'))
        f = open(fname, 'rb')
        s = util.decode_bytes(f.read())
        f.close()
        if s:
            # First line of the .txt file becomes the sentence label.
            line = s.splitlines()[0]
            sentenceTier = IntervalTier("sentence")
            sentenceTier.add(0, self.app.Audio.duration, line)
            self.TextGrid.append(sentenceTier)
            # Move the just-appended sentence tier to the front.
            self.TextGrid.tiers = [self.TextGrid.tiers[-1]
                                   ] + self.TextGrid.tiers[:-1]
    path = self.app.Data.unrelativize(
        self.app.Data.getFileLevel('.TextGrid'))
    self.TextGrid.write(path)
    # Reload from disk so the in-memory grid matches the written file.
    self.TextGrid = TextGridFile.fromFile(path)
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, frameshift=0.01):
    """Export word and phone CTM alignments as per-file TextGrids.

    Corpora without segments get one 'words' and one 'phones' tier per
    file (first/only speaker); corpora with segments get a pair of
    tiers per speaker, with consecutive silence phones merged and small
    overlaps between adjacent phone intervals repaired. Per-file
    exceptions are collected and written to output_errors.txt rather
    than aborting the run.

    Args:
        word_ctm: filename -> speaker -> word interval list.
        phone_ctm: Same structure for phone intervals.
        out_directory: Output root directory (created if missing).
        corpus: Supplies durations, segments, speaker ordering and
            per-file output subdirectories.
        dictionary: Supplies the silence phone labels.
        frameshift: Frame shift in seconds; end times within one
            frameshift of the file end are snapped to it (first branch
            only).
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            # Single-speaker case: take the first (only) speaker entry.
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                relative = corpus.file_directory_mapping[k]
                if relative:
                    speaker_directory = os.path.join(out_directory, relative)
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[k] = '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
    else:
        silences = {dictionary.optional_silence, dictionary.nonoptional_silence}
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            maxtime = corpus.get_wav_duration(filename)
            try:
                speaker_directory = os.path.join(out_directory, corpus.file_directory_mapping[filename])
                tg = TextGrid(maxTime=maxtime)
                for speaker in corpus.speaker_ordering[filename]:
                    words = speaker_dict[speaker]
                    word_tier_name = '{} - words'.format(speaker)
                    phone_tier_name = '{} - phones'.format(speaker)
                    word_tier = IntervalTier(name=word_tier_name, maxTime=maxtime)
                    phone_tier = IntervalTier(name=phone_tier_name, maxTime=maxtime)
                    for w in words:
                        word_tier.add(*w)
                    for p in phone_ctm[filename][speaker]:
                        # Merge a silence phone into a preceding silence.
                        if len(phone_tier) > 0 and phone_tier[-1].mark in silences and p[2] in silences:
                            phone_tier[-1].maxTime = p[1]
                        else:
                            # Repair small overlaps against the previous
                            # interval before adding.
                            if len(phone_tier) > 0 and p[2] in silences and p[0] < phone_tier[-1].maxTime:
                                p = phone_tier[-1].maxTime, p[1], p[2]
                            elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                    phone_tier[-1].mark in silences:
                                phone_tier[-1].maxTime = p[0]
                            phone_tier.add(*p)
                    tg.append(word_tier)
                    tg.append(phone_tier)
                tg.write(os.path.join(speaker_directory, filename + '.TextGrid'))
            except Exception as e:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                textgrid_write_errors[filename] = '\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            f.write('The following exceptions were encountered during the ouput of the alignments to TextGrids:\n\n')
            for k, v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
#print(transcription) if len(transcription) < 3 and u.duration > 25: continue if u.duration > 25: print(d, u.duration) print([(x._type_node['label'], x.begin, x.duration) for x in u.word]) error transcription = ' '.join(x[0] for x in transcription) begin = u.begin - 0.075 if begin < 0: begin = 0 end = u.end + 0.075 if end > duration: end = duration utterance_tier.add(begin, end, transcription) utt_duration = end - begin utt_name = '{}_{}_{}'.format(d, begin, end) utt_wav_path = os.path.join(lab_dir, utt_name + '.wav') if not os.path.exists(utt_wav_path): extract_audio(wav_path, utt_wav_path, begin, end, padding=0) lab_path = os.path.join(lab_dir, utt_name + '.lab') with open(lab_path, 'w') as f: f.write(transcription) trans_path = os.path.join(lab_dir, utt_name + '.txt') with open(trans_path, 'w') as f: f.write('{}\t{}\t0\t{}\t{}'.format(speaker, speaker, utt_duration, transcription)) utt_tg_path = os.path.join(lab_dir, utt_name + '.TextGrid') utt_tg = TextGrid(maxTime=utt_duration)
class TestUtterance(unittest.TestCase):
    """Unit tests for the utterance module.

    Covers Utterance construction and (de)serialization, the
    numpy <-> protobuf matrix converters, Segment/IntervalTier round
    trips, the scalar and array property accessors, and the
    time-to-frame conversion helpers.
    """

    def setUp(self):
        # Shared fixtures: an empty utterance, protobuf matrix/segment
        # messages, and a two-interval tier.
        self.utt = utterance.Utterance()
        self.float_mat = FloatMatrix()
        self.int_mat = Int32Matrix()
        self.bool_mat = BinaryMatrix()
        self.seg = Segment()
        self.tier = IntervalTier('test', 0, 2)
        self.tier.add(0, 1, "a")
        self.tier.add(1, 2, "b")

    def tearDown(self):
        pass

    def test_empty_init(self):
        self.assertTrue(isinstance(self.utt, utterance.Utterance))

    def test_full_init(self):
        utt = utterance.Utterance(np.array([1, 2, 3, 4, 5]), 16000, "test")
        self.assertTrue(isinstance(utt, utterance.Utterance))

    def test_invalid_init(self):
        # Expected to raise ValueError for this argument combination.
        try:
            utt = utterance.Utterance(np.array([1, 2, 3, 4, 5]), text="test")
        except ValueError:
            pass

    def test_rw_internal(self):
        pb = self.utt.write_internal()
        self.utt.read_internal(pb)

    def test_rw(self):
        with tempfile.TemporaryDirectory() as tpf:
            output_file = os.path.join(tpf, "test.data")
            self.utt.write(output_file)
            self.assertTrue(os.path.exists(output_file))
            self.utt.read(output_file)

    def test_rw_mat_empty(self):
        np_mat = np.array([])
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 0)
        self.assertEqual(self.float_mat.num_col, 0)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_scalar(self):
        np_mat = np.array([1])
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 1)
        self.assertEqual(self.float_mat.num_col, 1)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_row_vec(self):
        np_mat = np.array([1, 2, 3, 4])
        num_ele = len(np_mat)
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, 1)
        self.assertEqual(self.float_mat.num_col, num_ele)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, np_mat.shape)
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_col_vec(self):
        np_mat = np.array([[1], [2], [3], [4]])
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, num_row)
        self.assertEqual(self.float_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_int_mat(self):
        np_mat = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
        np_mat.astype(int)
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.int_mat)
        self.assertEqual(self.int_mat.num_row, num_row)
        self.assertEqual(self.int_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.int_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_float_mat(self):
        np_mat = np.array([[1, 1], [2, 2], [3, 3], [4, 4]])
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.float_mat)
        self.assertEqual(self.float_mat.num_row, num_row)
        self.assertEqual(self.float_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.float_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_mat_binary_mat(self):
        np_mat = np.array([[1, 0], [0, 1], [0, 0], [1, 1]])
        np_mat.astype(bool)
        num_row, num_col = np_mat.shape
        utterance.numpy_to_mat(np_mat, self.bool_mat)
        self.assertEqual(self.bool_mat.num_row, num_row)
        self.assertEqual(self.bool_mat.num_col, num_col)
        np_mat_recover = utterance.mat_to_numpy(self.bool_mat)
        self.assertEqual(np_mat_recover.shape, (num_row, num_col))
        self.assertTrue((np_mat == np_mat_recover).all())

    def test_rw_segment(self):
        utterance.write_segment(self.tier, self.seg)
        self.assertEqual(self.seg.num_item, len(self.tier.intervals))
        interval = utterance.read_segment(self.seg)
        self.assertTrue(isinstance(interval, IntervalTier))
        self.assertEqual(len(self.tier.intervals), len(interval.intervals))

    def test_rw_data(self):
        data = self.utt.data
        self.utt.data = data

    def test_rw_wav(self):
        input_wav = np.array([1, 2, 3, 4])
        self.utt.wav = input_wav
        wav = self.utt.wav
        self.assertTrue((input_wav == wav).any())

    def test_rw_fs(self):
        self.utt.fs = 16000
        fs = self.utt.fs
        self.assertEqual(fs, 16000)

    def test_set_invalid_fs(self):
        # Non-positive sampling rates are expected to raise.
        try:
            self.utt.fs = 0
        except ValueError:
            pass
        try:
            self.utt.fs = -2
        except ValueError:
            pass

    def test_rw_text(self):
        self.utt.text = "test"
        text = self.utt.text
        self.assertEqual(text, "test")

    def test_rw_align(self):
        align = TextGrid()
        align.read("data/test.TextGrid")
        self.utt.align = align
        align_recover = self.utt.align
        self.assertEqual(len(align_recover.tiers), len(align.tiers))

    def test_rw_ppg(self):
        ppg = np.random.uniform(0, 1, (100, 5816))
        self.utt.ppg = ppg
        ppg_recover = self.utt.ppg
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((ppg_recover - ppg)**2).sum(), 0)

    def test_rw_monophone_ppg(self):
        mono_ppg = np.random.uniform(0, 1, (100, 40))
        self.utt.monophone_ppg = mono_ppg
        mono_ppg_recover = self.utt.monophone_ppg
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((mono_ppg_recover - mono_ppg)**2).sum(), 0)

    def test_rw_phone(self):
        self.utt.phone = self.tier
        interval = self.utt.phone
        self.assertEqual(len(interval.intervals), len(self.tier.intervals))

    def test_rw_word(self):
        self.utt.word = self.tier
        interval = self.utt.word
        self.assertEqual(len(interval.intervals), len(self.tier.intervals))

    def test_rw_lab(self):
        lab = np.array([1, 2, 3, 4, 5, 5, 6, 7])
        self.utt.lab = lab
        lab_recover = self.utt.lab
        self.assertTrue((lab == lab_recover).all())

    def test_rw_utterance_id(self):
        self.utt.utterance_id = "id"
        utt_id = self.utt.utterance_id
        self.assertEqual(utt_id, "id")

    def test_rw_speaker_id(self):
        self.utt.speaker_id = "id"
        spk_id = self.utt.speaker_id
        self.assertEqual(spk_id, "id")

    def test_rw_dialect(self):
        self.utt.dialect = "EN_CN"
        dialect = self.utt.dialect
        self.assertEqual(dialect, "EN_CN")

    def test_rw_gender(self):
        self.utt.gender = "O"
        gender = self.utt.gender
        self.assertEqual(gender, "O")

    def test_rw_original_file(self):
        self.utt.original_file = "file.test"
        origin_file = self.utt.original_file
        self.assertEqual(origin_file, "file.test")

    def test_rw_num_channel(self):
        self.utt.num_channel = 2
        channel = self.utt.num_channel
        self.assertEqual(channel, 2)

    def test_rw_kaldi_shift(self):
        self.utt.kaldi_shift = 10
        shift = self.utt.kaldi_shift
        self.assertEqual(shift, 10)

    def test_rw_kaldi_window_size(self):
        self.utt.kaldi_window_size = 25
        w_size = self.utt.kaldi_window_size
        self.assertEqual(w_size, 25)

    def test_rw_kaldi_window_type(self):
        self.utt.kaldi_window_type = "hamming"
        w_type = self.utt.kaldi_window_type
        self.assertEqual(w_type, "hamming")

    def test_rw_vocoder(self):
        self.utt.vocoder = "WORLD"
        vocoder_name = self.utt.vocoder
        self.assertEqual(vocoder_name, "WORLD")

    def test_rw_spec(self):
        spec = np.random.uniform(0, 1, (100, 513))
        self.utt.spec = spec
        spec_recover = self.utt.spec
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((spec_recover - spec)**2).sum(), 0)
        self.assertEqual(self.utt.spec_dim, 513)

    def test_rw_mfcc(self):
        mfcc = np.random.uniform(0, 1, (100, 25))
        self.utt.mfcc = mfcc
        mfcc_recover = self.utt.mfcc
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((mfcc_recover - mfcc)**2).sum(), 0)
        self.assertEqual(self.utt.mfcc_dim, 25)

    def test_rw_mcep(self):
        mcep = np.random.uniform(0, 1, (100, 60))
        self.utt.mcep = mcep
        mcep_recover = self.utt.mcep
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((mcep_recover - mcep)**2).sum(), 0)
        self.assertEqual(self.utt.mcep_dim, 60)

    def test_rw_f0(self):
        f0 = np.array([1, 2, 3, 4, 5])
        self.utt.f0 = f0
        f0_recover = self.utt.f0
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((f0_recover - f0)**2).sum(), 0)
        self.assertEqual(self.utt.num_frame, 5)

    def test_rw_ap(self):
        ap = np.random.uniform(0, 1, (100, 513))
        self.utt.ap = ap
        ap_recover = self.utt.ap
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((ap_recover - ap)**2).sum(), 0)
        self.assertEqual(self.utt.ap_dim, 513)

    def test_rw_bap(self):
        # 1-D band aperiodicity round trip.
        bap = np.array([1, 2, 3, 4, 5])
        self.utt.bap = bap
        bap_recover = self.utt.bap
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((bap_recover - bap)**2).sum(), 0)
        self.assertEqual(self.utt.bap_dim, 1)
        # 2-D band aperiodicity round trip.
        bap = np.random.uniform(0, 1, (100, 5))
        self.utt.bap = bap
        bap_recover = self.utt.bap
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((bap_recover - bap)**2).sum(), 0)
        self.assertEqual(self.utt.bap_dim, 5)

    def test_rw_vuv(self):
        vuv = np.array([1, 2, 3, 4, 5])
        self.utt.vuv = vuv
        vuv_recover = self.utt.vuv
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((vuv_recover - vuv)**2).sum(), 0)

    def test_rw_temporal_position(self):
        tp = np.array([1, 2, 3, 4, 5])
        self.utt.temporal_position = tp
        tp_recover = self.utt.temporal_position
        # There might be a small difference since the data are saved in float
        # while the original values are double precision
        self.assertAlmostEqual(((tp_recover - tp)**2).sum(), 0)

    def test_rw_vocoder_shift(self):
        self.utt.vocoder_shift = 10
        shift = self.utt.vocoder_shift
        self.assertEqual(shift, 10)

    def test_rw_vocoder_window_size(self):
        self.utt.vocoder_window_size = 25
        w_size = self.utt.vocoder_window_size
        self.assertEqual(w_size, 25)

    def test_rw_vocoder_window_type(self):
        self.utt.vocoder_window_type = "hamming"
        w_type = self.utt.vocoder_window_type
        self.assertEqual(w_type, "hamming")

    def test_rw_num_frame(self):
        self.utt.num_frame = 10
        num_frame = self.utt.num_frame
        self.assertEqual(num_frame, 10)

    def test_rw_alpha(self):
        self.utt.alpha = 0.42
        alpha = self.utt.alpha
        self.assertAlmostEqual(alpha, 0.42)

    def test_rw_fft_size(self):
        self.utt.fft_size = 1024
        fft_size = self.utt.fft_size
        self.assertEqual(fft_size, 1024)

    def test_rw_spec_dim(self):
        self.utt.spec_dim = 513
        spec_dim = self.utt.spec_dim
        self.assertEqual(spec_dim, 513)

    def test_rw_mfcc_dim(self):
        self.utt.mfcc_dim = 60
        mfcc_dim = self.utt.mfcc_dim
        self.assertEqual(mfcc_dim, 60)

    def test_rw_mcep_dim(self):
        self.utt.mcep_dim = 60
        mcep_dim = self.utt.mcep_dim
        self.assertEqual(mcep_dim, 60)

    def test_rw_f0_floor(self):
        self.utt.f0_floor = 40
        f0_floor = self.utt.f0_floor
        self.assertEqual(f0_floor, 40)

    def test_rw_f0_ceil(self):
        self.utt.f0_ceil = 800
        f0_ceil = self.utt.f0_ceil
        self.assertEqual(f0_ceil, 800)

    def test_rw_timestamp(self):
        self.utt.timestamp = '1/25/2019'
        timestamp = self.utt.timestamp
        self.assertEqual(timestamp, '1/25/2019')

    def test_rw_ap_dim(self):
        self.utt.ap_dim = 5
        ap_dim = self.utt.ap_dim
        self.assertEqual(ap_dim, 5)

    def test_rw_bap_dim(self):
        self.utt.bap_dim = 1
        bap_dim = self.utt.bap_dim
        self.assertEqual(bap_dim, 1)

    def test_rw_pitch_tracker(self):
        self.utt.pitch_tracker = 'dio'
        pitch_tracker = self.utt.pitch_tracker
        self.assertEqual(pitch_tracker, 'dio')

    def test_time_to_frame_valid_input(self):
        self.assertEqual(utterance.time_to_frame(3.66, 5), 732)
        self.assertEqual(utterance.time_to_frame(0, 5), 0)

    def test_time_to_frame_invalid_input(self):
        # Negative times are expected to raise.
        try:
            utterance.time_to_frame(-10, 5)
        except ValueError:
            pass

    def test_time_to_frame_interval_tier_simple_case(self):
        shift = 5
        frame_tier = utterance.time_to_frame_interval_tier(self.tier, shift)
        self.assertEqual(frame_tier.minTime, 0)
        self.assertEqual(frame_tier.maxTime,
                         utterance.time_to_frame(self.tier.maxTime, shift))

    def test_time_to_frame_interval_tier_short_seg(self):
        # Sub-frame segments are each stretched to one full frame.
        tier = IntervalTier('test', 0, 0.01)
        tier.add(0, 0.003, "a")
        tier.add(0.003, 0.01, "b")
        frame_tier = utterance.time_to_frame_interval_tier(tier, 5)
        self.assertEqual(frame_tier.minTime, 0)
        self.assertEqual(frame_tier.maxTime, 2)
        self.assertEqual(frame_tier.intervals[0].minTime, 0)
        self.assertEqual(frame_tier.intervals[0].maxTime, 1)
        self.assertEqual(frame_tier.intervals[1].minTime, 1)
        self.assertEqual(frame_tier.intervals[1].maxTime, 2)

    def test_is_sil(self):
        self.assertTrue(utterance.is_sil("sil"))
        self.assertTrue(utterance.is_sil("sp"))
        self.assertTrue(utterance.is_sil("spn"))
        self.assertTrue(utterance.is_sil(""))
        self.assertTrue(utterance.is_sil("SIL"))
        self.assertTrue(utterance.is_sil("SPN"))
        self.assertTrue(utterance.is_sil("SP"))
        self.assertFalse(utterance.is_sil("AO"))
        self.assertFalse(utterance.is_sil("s"))

    def test_normalize_phone(self):
        self.assertEqual("ao", utterance.normalize_phone("AO0"))
        self.assertEqual("ao", utterance.normalize_phone("AO"))
        self.assertEqual("ao", utterance.normalize_phone("AO0, AH, s"))
        self.assertEqual("ao,ah,s",
                         utterance.normalize_phone("AO0, AH, s", False))
        self.assertEqual("sil", utterance.normalize_phone("SIL"))
        self.assertEqual("sil", utterance.normalize_phone(""))
        try:
            utterance.normalize_phone("1243")
        except ValueError:
            pass

    def test_normalize_tier_mark(self):
        utterance.normalize_tier_mark(self.tier)
        utterance.normalize_tier_mark(self.tier, "NormalizePhoneAnnotation")
        try:
            utterance.normalize_tier_mark(self.tier, "NormalizePhone")
        except ValueError:
            pass

    def test_read_sym_table(self):
        sym_table = utterance.read_sym_table("data/phoneme_table")
        self.assertEqual(len(sym_table), 40)

    def test_get_hardcoded_sym_table(self):
        sym_table = utterance.get_hardcoded_sym_table()
        correct_sym_table = utterance.read_sym_table("data/phoneme_table")
        self.assertTrue(
            set(sym_table.items()) == set(correct_sym_table.items()))

    def test_get_phone_tier_invalid_0(self):
        try:
            self.utt.get_phone_tier()
        except ValueError:
            pass

    def test_get_phone_tier_invalid_1(self):
        try:
            self.utt.kaldi_shift = 5
            self.utt.get_phone_tier()
        except ValueError:
            pass

    def test_get_word_tier_invalid_0(self):
        try:
            self.utt.get_word_tier()
        except ValueError:
            pass

    def test_get_word_tier_invalid_1(self):
        try:
            self.utt.kaldi_shift = 5
            self.utt.get_word_tier()
        except ValueError:
            pass

    def test_get_monophone_ppg_valid(self):
        fs, wav = wavfile.read('data/test_mono_channel.wav')
        utt = utterance.Utterance(wav, fs)
        utt.kaldi_shift = 5
        ppgs = utt.get_monophone_ppg()
        self.assertEqual(ppgs.shape[1], 40)

    def test_get_monophone_ppg_invalid_0(self):
        try:
            self.utt.get_monophone_ppg()
        except ValueError:
            pass

    def test_get_monophone_ppg_invalid_1(self):
        try:
            self.utt.kaldi_shift = 5
            self.utt.get_monophone_ppg()
        except ValueError:
            pass
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus):
    """Write Kaldi CTM word/phone alignments out as Praat TextGrid files.

    word_ctm / phone_ctm map utterance ids to lists of (begin, end, label)
    intervals. One TextGrid is written per wav file (or per recording when
    the corpus uses segments) under out_directory.
    """
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        # One utterance per wav file: one TextGrid per utterance id.
        for i,(k,v) in enumerate(word_ctm.items()):
            maxtime = corpus.get_wav_duration(k)
            try:
                tg = TextGrid(maxTime = maxtime)
                wordtier = IntervalTier(name = 'words', maxTime = maxtime)
                phonetier = IntervalTier(name = 'phones', maxTime = maxtime)
                for interval in v:
                    wordtier.add(*interval)
                for interval in phone_ctm[k]:
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except ValueError as e:
                # Best-effort export: report and keep going with other files.
                print('Could not write textgrid for {}'.format(k))
                print(e)
    else:
        # Long recordings split into utterance segments: merge all utterances
        # of a recording into one TextGrid with per-speaker tiers.
        tgs = {}
        for i,(k,v) in enumerate(word_ctm.items()):
            # corpus.segments[k] is a 'recording begin end' string.
            rec = corpus.segments[k]
            rec, begin, end = rec.split(' ')
            # NOTE(review): this queries the duration of utterance k but uses
            # it as maxTime for the whole recording's TextGrid -- presumably
            # get_wav_duration(k) resolves to the recording's wav; confirm.
            maxtime = corpus.get_wav_duration(k)
            if rec not in tgs:
                tgs[rec] = TextGrid(maxTime = maxtime)
            tg = tgs[rec]
            begin = float(begin)
            speaker = corpus.utt_speak_mapping[k]
            word_tier_name = '{} - words'.format(speaker)
            phone_tier_name = '{} - phones'.format(speaker)
            # Reuse the speaker's tiers if an earlier utterance created them.
            wordtier = tg.getFirst(word_tier_name)
            if wordtier is None:
                wordtier = IntervalTier(name = word_tier_name, maxTime = maxtime)
                tg.append(wordtier)
            phonetier = tg.getFirst(phone_tier_name)
            if phonetier is None:
                phonetier = IntervalTier(name = phone_tier_name, maxTime = maxtime)
                tg.append(phonetier)
            for interval in v:
                # Shift from utterance-relative to recording-relative time.
                interval = interval[0] + begin, interval[1] + begin, interval[2]
                wordtier.add(*interval)
            for interval in phone_ctm[k]:
                interval = interval[0] + begin, interval[1] + begin, interval[2]
                phonetier.add(*interval)
        for k,v in tgs.items():
            outpath = os.path.join(out_directory, k + '.TextGrid')
            try:
                v.write(outpath)
            except ValueError as e:
                print('Could not write textgrid for {}'.format(k))
                print(e)
# NOTE(review): the first three statements are the tail of a read_ctm()
# function whose header (and the loop defining ss/begin/end/result) begins
# before this chunk.
label = ss[4]
result.append([begin, end, label])
return result

if __name__ == '__main__':
    # CLI: convert a pair of word/phone CTM files into a single TextGrid.
    parser = argparse.ArgumentParser()
    parser.add_argument('words_ctm')
    parser.add_argument('phones_ctm')
    parser.add_argument('output_textgrid')
    args = parser.parse_args()
    words = read_ctm(args.words_ctm)
    phones = read_ctm(args.phones_ctm)
    # Total duration is taken from the end time of the last phone entry;
    # assumes the CTM is ordered by time and phones cover the whole file.
    max_time = phones[-1][1]
    tg = TextGrid(maxTime=max_time)
    word_tier = IntervalTier(name="words", maxTime=max_time)
    phone_tier = IntervalTier(name="phones", maxTime=max_time)
    for w in words:
        word_tier.add(*w)
    for p in phones:
        phone_tier.add(*p)
    tg.append(word_tier)
    tg.append(phone_tier)
    tg.write(args.output_textgrid)
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary,
                    frameshift=0.01):
    """Export word/phone CTM alignments as TextGrid files.

    word_ctm / phone_ctm: {utt_or_file: {speaker: [[begin, end, label], ...]}}
    frameshift: acoustic frame step in seconds, used to snap interval ends
        that fall within one frame of the file end (rounding fix).

    Per-file failures are collected rather than aborting the export; any
    errors are written to out_directory/output_errors.txt at the end.
    """
    textgrid_write_errors = {}
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        # One utterance per wav: a TextGrid per utterance, single speaker.
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(
                        out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except Exception:
                # traceback.format_exc() renders the in-flight exception;
                # replaces the former sys.exc_info()/format_exception chain.
                textgrid_write_errors[k] = traceback.format_exc()
    else:
        # Long recordings: one TextGrid per recording, with a word and a
        # phone tier per speaker; silence phones are merged/clipped so the
        # phone tier stays non-overlapping.
        silences = {
            dictionary.optional_silence, dictionary.nonoptional_silence
        }
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            maxtime = corpus.get_wav_duration(filename)
            try:
                tg = TextGrid(maxTime=maxtime)
                for speaker, words in speaker_dict.items():
                    word_tier_name = '{} - words'.format(speaker)
                    phone_tier_name = '{} - phones'.format(speaker)
                    word_tier = IntervalTier(name=word_tier_name,
                                             maxTime=maxtime)
                    phone_tier = IntervalTier(name=phone_tier_name,
                                              maxTime=maxtime)
                    for w in words:
                        word_tier.add(*w)
                    for p in phone_ctm[filename][speaker]:
                        if len(phone_tier) > 0 and phone_tier[-1].mark in silences \
                                and p[2] in silences:
                            # Merge consecutive silences into one interval.
                            phone_tier[-1].maxTime = p[1]
                        else:
                            if len(phone_tier) > 0 and p[2] in silences \
                                    and p[0] < phone_tier[-1].maxTime:
                                # Overlapping silence: clip its start to the
                                # previous phone's end.
                                p = phone_tier[-1].maxTime, p[1], p[2]
                            elif len(phone_tier) > 0 and p[2] not in silences \
                                    and p[0] < phone_tier[-1].maxTime \
                                    and phone_tier[-1].mark in silences:
                                # Overlapping real phone after a silence:
                                # shrink the silence instead.
                                phone_tier[-1].maxTime = p[0]
                            phone_tier.add(*p)
                    tg.append(word_tier)
                    tg.append(phone_tier)
                tg.write(os.path.join(out_directory, filename + '.TextGrid'))
            except Exception:
                textgrid_write_errors[filename] = traceback.format_exc()
    if textgrid_write_errors:
        error_log = os.path.join(out_directory, 'output_errors.txt')
        with open(error_log, 'w', encoding='utf8') as f:
            # Typo fix: 'ouput' -> 'output' in the user-facing header.
            f.write(
                'The following exceptions were encountered during the output of the alignments to TextGrids:\n\n'
            )
            for k, v in textgrid_write_errors.items():
                f.write('{}:\n'.format(k))
                f.write('{}\n\n'.format(v))
# NOTE(review): fragment -- the enclosing per-file / per-interval loops begin
# before this chunk; `interval`, `wav_path`, `tg`, `f`, `seg_ind`, `segs`,
# `segmentation_tier`, `padding` and the temp_*_dir paths are defined there.
if interval.mark == '':
    continue
print(interval.mark, interval.minTime, interval.maxTime)
# Extract the labelled span (plus padding) into its own wav for analysis.
outpath = os.path.join(temp_wav_dir, interval.mark + '.wav')
extract_audio(wav_path, outpath, interval.minTime, interval.maxTime, padding = padding)
rep = Mfcc(outpath, freq_lims = (80, 7800), num_coeffs = 12, win_len = 0.025, time_step = 0.01)
rep.is_windowed = True
duration = interval.maxTime - interval.minTime
# Segmentation threshold is scaled from the interval's duration, mapped from
# the [min_duration, max_duration] range onto [min_thresh, max_thresh].
thresh = unnorm(norm(duration, min_duration, max_duration), min_thresh, max_thresh)
rep.segment(threshold = thresh)
print(sorted(rep._segments.keys()))
padded_begin = interval.minTime - padding
if padded_begin < 0:
    padded_begin = 0
for k in sorted(rep._segments.keys()):
    # Persist each sub-segment's MFCC slice and its mean for later use.
    with open(os.path.join(temp_mfcc_dir, '{}.mfcc'.format(seg_ind)), 'wb') as fh:
        pickle.dump(rep[k[0],k[1]], fh)
    with open(os.path.join(temp_mean_dir, '{}.mean'.format(seg_ind)), 'wb') as fh:
        pickle.dump(rep._segments[k], fh)
    segs.append(str(seg_ind))
    seg_ind += 1
    # Segment times are relative to the padded clip; shift back to file time.
    begin = round(k[0] + padded_begin, 3)
    end = round(k[1] + padded_begin,3)
    print(begin, end)
    # NOTE(review): seg_ind was incremented above, so the tier mark is one
    # greater than the id used for the .mfcc/.mean filenames -- confirm
    # this off-by-one is intentional.
    segmentation_tier.add(begin, end, '{}'.format(seg_ind))
with open(os.path.join(temp_align_dir, '{}.seg'.format(f)), 'w') as fa:
    fa.write(' '.join(segs))
tg.append(segmentation_tier)
tg.write(textgrid_path.replace(data_dir, temp_textgrid_dir))
# NOTE(review): fragment -- the enclosing loop over tier indices `i` (and the
# definitions of ti, cur_dur, maxtime, speaker, duration, groupedwavfiles,
# wavfiletimes, groupedlabtext, chapteroutpath1/2) begins before this chunk.
# i == 0 presumably collects word intervals; i == 1 collects phone intervals.
wordintervals.append(x)
elif i == 1:
    for x in ti:
        # Shift interval times by the running duration of earlier files.
        x.maxTime += cur_dur
        x.minTime += cur_dur
        phoneintervals.append(x)
cur_dur += maxtime
words = IntervalTier(name='words')
for i in wordintervals:
    words.addInterval(i)
phones = IntervalTier(name='phones')
for i in phoneintervals:
    phones.addInterval(i)
tg1 = TextGrid(maxTime=cur_dur)
tg1.append(words)
tg1.append(phones)
tg1.write(chapteroutpath1, null='')
speaker_tier = IntervalTier(name=speaker)
for i in range(len(groupedwavfiles)):
    # NOTE(review): the i == 1 special case plus wavfiletimes[i - 2] indexing
    # reads wavfiletimes[-2] when i == 0 (Python wraps negative indices) --
    # this looks like a 1-based-index assumption in a 0-based loop; verify.
    if i == 1:
        speaker_tier.add(0.0, wavfiletimes[0], groupedlabtext[0])
    else:
        speaker_tier.add(wavfiletimes[i - 2], wavfiletimes[i - 1], groupedlabtext[i - 1])
tg2 = TextGrid(maxTime=duration)
tg2.append(speaker_tier)
tg2.write(chapteroutpath2, null='')
# NOTE(review): fragment -- continues an Mfcc(...) call whose opening begins
# before this chunk; `interval`, `tg`, `f`, `seg_ind`, `segs`,
# `segmentation_tier`, `padding` and the temp_*_dir paths are defined there.
num_coeffs=12, win_len=0.025, time_step=0.01)
rep.is_windowed = True
duration = interval.maxTime - interval.minTime
# Threshold scales with the interval's duration, mapped from the
# [min_duration, max_duration] range onto [min_thresh, max_thresh].
thresh = unnorm(norm(duration, min_duration, max_duration), min_thresh, max_thresh)
rep.segment(threshold=thresh)
print(sorted(rep._segments.keys()))
padded_begin = interval.minTime - padding
if padded_begin < 0:
    padded_begin = 0
for k in sorted(rep._segments.keys()):
    # Persist each sub-segment's MFCC slice and its mean for later use.
    with open(os.path.join(temp_mfcc_dir, '{}.mfcc'.format(seg_ind)), 'wb') as fh:
        pickle.dump(rep[k[0], k[1]], fh)
    with open(os.path.join(temp_mean_dir, '{}.mean'.format(seg_ind)), 'wb') as fh:
        pickle.dump(rep._segments[k], fh)
    segs.append(str(seg_ind))
    seg_ind += 1
    # Segment times are relative to the padded clip; shift back to file time.
    begin = round(k[0] + padded_begin, 3)
    end = round(k[1] + padded_begin, 3)
    print(begin, end)
    # NOTE(review): seg_ind was incremented above, so the tier mark is one
    # greater than the id used for the .mfcc/.mean filenames -- confirm.
    segmentation_tier.add(begin, end, '{}'.format(seg_ind))
with open(os.path.join(temp_align_dir, '{}.seg'.format(f)), 'w') as fa:
    fa.write(' '.join(segs))
tg.append(segmentation_tier)
tg.write(textgrid_path.replace(data_dir, temp_textgrid_dir))
# NOTE(review): fragment -- the enclosing if/elif over tier indices `i` (and
# the definitions of ti, cur_dur, maxtime, speaker, duration, groupedwavfiles,
# wavfiletimes, groupedlabtext, chapteroutpath1/2) begins before this chunk.
elif i == 1:
    for x in ti:
        # Shift interval times by the running duration of earlier files.
        x.maxTime += cur_dur
        x.minTime += cur_dur
        phoneintervals.append(x)
cur_dur += maxtime
words = IntervalTier(name='words')
for i in wordintervals:
    words.addInterval(i)
phones = IntervalTier(name='phones')
for i in phoneintervals:
    phones.addInterval(i)
tg1 = TextGrid(maxTime = cur_dur)
tg1.append(words)
tg1.append(phones)
tg1.write(chapteroutpath1, null = '')
speaker_tier = IntervalTier(name=speaker)
for i in range(len(groupedwavfiles)):
    # NOTE(review): the i == 1 special case plus wavfiletimes[i-2] indexing
    # reads wavfiletimes[-2] when i == 0 (Python wraps negative indices) --
    # this looks like a 1-based-index assumption in a 0-based loop; verify.
    if i == 1:
        speaker_tier.add(0.0, wavfiletimes[0], groupedlabtext[0])
    else:
        speaker_tier.add(wavfiletimes[i-2], wavfiletimes[i-1], groupedlabtext[i-1])
tg2 = TextGrid(maxTime = duration)
tg2.append(speaker_tier)
tg2.write(chapteroutpath2, null = '')
def ctm_to_textgrid(word_ctm, phone_ctm, out_directory, corpus, dictionary, frameshift=0.01):
    """Write word/phone CTM alignments to TextGrid files under out_directory.

    word_ctm / phone_ctm: {utt_or_file: {speaker: [[begin, end, label], ...]}}
    frameshift: acoustic frame step in seconds, used to snap interval ends
        that fall within one frame of the file end (rounding fix).
    """
    frameshift = Decimal(str(frameshift))
    if not os.path.exists(out_directory):
        os.makedirs(out_directory, exist_ok=True)
    if not corpus.segments:
        # One utterance per wav: one TextGrid per utterance, single speaker.
        for i, (k, v) in enumerate(sorted(word_ctm.items())):
            maxtime = Decimal(str(corpus.get_wav_duration(k)))
            speaker = list(v.keys())[0]
            v = list(v.values())[0]
            try:
                tg = TextGrid(maxTime=maxtime)
                wordtier = IntervalTier(name='words', maxTime=maxtime)
                phonetier = IntervalTier(name='phones', maxTime=maxtime)
                for interval in v:
                    if maxtime - interval[1] < frameshift:  # Fix rounding issues
                        interval[1] = maxtime
                    wordtier.add(*interval)
                for interval in phone_ctm[k][speaker]:
                    if maxtime - interval[1] < frameshift:
                        interval[1] = maxtime
                    phonetier.add(*interval)
                tg.append(wordtier)
                tg.append(phonetier)
                if corpus.speaker_directories:
                    speaker_directory = os.path.join(
                        out_directory, corpus.utt_speak_mapping[k])
                else:
                    speaker_directory = out_directory
                os.makedirs(speaker_directory, exist_ok=True)
                outpath = os.path.join(speaker_directory, k + '.TextGrid')
                tg.write(outpath)
            except ValueError as e:
                # Report which file failed, then re-raise (export aborts here).
                print(
                    'There was an error writing the TextGrid for {}, please see below:'
                    .format(k))
                raise
    else:
        # Long recordings: one TextGrid per recording, a word and a phone
        # tier per speaker; silences are merged/clipped so the phone tier
        # stays non-overlapping.
        silences = {
            dictionary.optional_silence, dictionary.nonoptional_silence
        }
        for i, (filename, speaker_dict) in enumerate(sorted(word_ctm.items())):
            # NOTE(review): unlike the branch above, maxtime is not wrapped
            # in Decimal here -- confirm the float/Decimal mix is intended.
            maxtime = corpus.get_wav_duration(filename)
            tg = TextGrid(maxTime=maxtime)
            for speaker, words in speaker_dict.items():
                word_tier_name = '{} - words'.format(speaker)
                phone_tier_name = '{} - phones'.format(speaker)
                word_tier = IntervalTier(name=word_tier_name, maxTime=maxtime)
                phone_tier = IntervalTier(name=phone_tier_name, maxTime=maxtime)
                for w in words:
                    word_tier.add(*w)
                for p in phone_ctm[filename][speaker]:
                    if len(phone_tier) > 0 and phone_tier[-1].mark in silences and p[2] in silences:
                        # Merge consecutive silence phones into one interval.
                        phone_tier[-1].maxTime = p[1]
                    else:
                        if len(phone_tier) > 0 and p[2] in silences and p[0] < phone_tier[-1].maxTime:
                            # Overlapping silence: clip its start to the
                            # previous phone's end.
                            p = phone_tier[-1].maxTime, p[1], p[2]
                        elif len(phone_tier) > 0 and p[2] not in silences and p[0] < phone_tier[-1].maxTime and \
                                phone_tier[-1].mark in silences:
                            # Overlapping real phone after a silence: shrink
                            # the silence instead.
                            phone_tier[-1].maxTime = p[0]
                        phone_tier.add(*p)
                tg.append(word_tier)
                tg.append(phone_tier)
            tg.write(os.path.join(out_directory, filename + '.TextGrid'))